diff --git a/.github/workflows/main-publish.yml b/.github/workflows/main-publish.yml index 7268787ea..92e8fe972 100644 --- a/.github/workflows/main-publish.yml +++ b/.github/workflows/main-publish.yml @@ -27,6 +27,8 @@ jobs: - uses: actions/setup-python@v2 with: python-version: 3.8 + - name: Install Python dependencies for publish + run: python -m pip install dcicutils==7.5.0 - name: Publish env: PYPI_USER: ${{ secrets.PYPI_USER }} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1a62bdf3b..ddf122fde 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -33,7 +33,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: 3.9 - name: Install/Link Postgres run: | diff --git a/.gitignore b/.gitignore index 31e7cff72..bac9a668c 100644 --- a/.gitignore +++ b/.gitignore @@ -63,3 +63,7 @@ # Vim *.swp + +# These are generated by prepare-local-dev from corresponding .template files +development.ini +test.ini diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 119e803f5..3bf69be77 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,50 @@ snovault Change Log ---------- +9.0.0 +===== +* Merge/unify ingestion and other code from cgap-portal and fourfront. + + +8.1.0 +===== + +* Add several modules/commands from upstream portals that are generic enough to live in + this repository (to reduce code/library maintenance overhead) + +* Port support for ``make deploy1`` from the portals: + + * In ``Makefile``: + + * Support for ``make deploy1`` + + * Support for ``make psql-dev`` + + * Support for ``make psql-test`` + + * Support for ``make kibana-start`` (commented out for now, pending testing) + + * Support for ``make kibana-start-test`` (commented out) + + * Support for ``make kibana-stop`` (commented out) + + * In ``pyproject.toml``: + + * Template file ``development.ini.template`` + + * Template file ``test.ini.template`` + + * Support for ``prepare-local-dev`` script, + which creates ``development.ini`` from ``development.ini.template`` + and ``test.ini`` from ``test.ini.template``. + + * Port the ``dev_servers.py`` support from CGAP. + + * In the ``scripts/`` dir: + + * Add ``scripts/psql-start`` + in support of ``make psql-dev`` and ``make psql-test``. + 8.0.1 ===== @@ -33,6 +77,7 @@ Change Log * Change ``pytest.yield_fixture`` to ``pytest.yield``. This is technically incompatible since it would break downstream portals if they were below ``pytest`` 6, but they are both at ``pytest 7`` now, so they should be unaffected. * Address some places involving ``.execute(raw_string)`` that should be ``.execute(text(raw_string))``. + 7.3.0 ===== diff --git a/Makefile b/Makefile index c8d4043b4..c40721c4a 100644 --- a/Makefile +++ b/Makefile @@ -1,52 +1,85 @@ +SHELL=/bin/bash + clean: - rm -rf *.egg-info + make clean-python-caches -configure: # does any pre-requisite installs - @#pip install --upgrade pip==21.0.1 - pip install --upgrade pip - @#pip install poetry==1.1.9 # this version is known to work. -kmp 5-Oct-2021 - pip install wheel - pip install poetry +clean-python-caches: + rm -rf *.egg-info clear-poetry-cache: # clear poetry/pypi cache.
for user to do explicitly, never automatic poetry cache clear pypi --all +aws-ip-ranges: + curl -o aws-ip-ranges.json https://ip-ranges.amazonaws.com/ip-ranges.json + macpoetry-install: scripts/macpoetry-install -lint: - poetry run flake8 snovault +configure: # does any pre-requisite installs + pip install --upgrade pip + pip install wheel + pip install poetry==1.4.2 -macbuild: +build-poetry: + make configure + poetry install + +macbuild-poetry: make configure make macpoetry-install - make build-after-poetry build: - make configure - make build-configured +ifeq ($(shell uname -s), Darwin) + @echo "Looks like this is Mac so executing: make macbuild" + make macbuild +else + make build-poetry + make build-after-poetry +endif -build-configured: - poetry install +macbuild: + make macbuild-poetry + make build-after-poetry + +build-after-poetry: # continuation of build after poetry install + make aws-ip-ranges + poetry run python setup_eb.py develop + make fix-dist-info + poetry run prepare-local-dev + +fix-dist-info: + @scripts/fix-dist-info build-for-ga: make configure poetry config --local virtualenvs.create true - make build-configured + poetry install + +deploy1: # starts postgres/ES locally and loads inserts, and also starts ingestion engine + @DEBUGLOG=`pwd` SNOVAULT_DB_TEST_PORT=`grep 'sqlalchemy[.]url =' development.ini | sed -E 's|.*:([0-9]+)/.*|\1|'` dev-servers-snovault development.ini --app-name app --clear --init --load + +psql-dev: # starts psql with the url after 'sqlalchemy.url =' in development.ini + @scripts/psql-start dev + +psql-test: # starts psql with a url constructed from data in 'ps aux'. + @scripts/psql-start test + +#kibana-start: # starts a dev version of kibana (default port) +# scripts/kibana-start +# +#kibana-start-test: # starts a test version of kibana (port chosen for active tests) +# scripts/kibana-start test +# +#kibana-stop: +# scripts/kibana-stop ES_URL = search-fourfront-testing-opensearch-kqm7pliix4wgiu4druk2indorq.us-east-1.es.amazonaws.com:443 -LOCAL_INSTAFAIL_OPTIONS = --timeout=200 -xvv --instafail +LOCAL_INSTAFAIL_OPTIONS = --timeout=400 -xvv --instafail LOCAL_MULTIFAIL_OPTIONS = --timeout=200 -vv GA_CICD_TESTING_OPTIONS = --timeout=400 -xvvv --durations=100 --aws-auth --es ${ES_URL} STATIC_ANALYSIS_OPTIONS = -vv -TEST_NAME ?= missing_TEST_NAME - -test-one: - - SQLALCHEMY_WARN_20=1 pytest ${LOCAL_MULTIFAIL_OPTIONS} -k ${TEST_NAME} - test: @git log -1 --decorate | head -1 @date @@ -54,6 +87,13 @@ test: @git log -1 --decorate | head -1 @date +ES_URL = search-fourfront-testing-opensearch-kqm7pliix4wgiu4druk2indorq.us-east-1.es.amazonaws.com:443 + +LOCAL_INSTAFAIL_OPTIONS = --timeout=400 -xvv --instafail +LOCAL_MULTIFAIL_OPTIONS = --timeout=200 -vv +GA_CICD_TESTING_OPTIONS = --timeout=400 -xvvv --durations=100 --aws-auth --es ${ES_URL} +STATIC_ANALYSIS_OPTIONS = -vv + test-full: @git log -1 --decorate | head -1 @date @@ -63,48 +103,98 @@ test-full: @git log -1 --decorate | head -1 @date -test-indexing: - SQLALCHEMY_WARN_20=1 poetry run pytest ${LOCAL_INSTAFAIL_OPTIONS} -m "indexing" - test-unit: SQLALCHEMY_WARN_20=1 poetry run pytest ${LOCAL_INSTAFAIL_OPTIONS} -m "not indexing" -test-indexing-full: - SQLALCHEMY_WARN_20=1 poetry run pytest ${LOCAL_MULTIFAIL_OPTIONS} -m "indexing" - test-unit-full: SQLALCHEMY_WARN_20=1 poetry run pytest ${LOCAL_MULTIFAIL_OPTIONS} -m "not indexing" +test-indexing-full: + make test-indexing-not-es-full + make test-indexing-es-full + +test-indexing-es-full: + SQLALCHEMY_WARN_20=1 poetry run pytest ${LOCAL_MULTIFAIL_OPTIONS} -m 
"indexing and es" + +test-indexing-not-es-full: + SQLALCHEMY_WARN_20=1 poetry run pytest ${LOCAL_MULTIFAIL_OPTIONS} -m "indexing and not es" + +test-indexing: + make test-indexing-not-es && make test-indexing-es + +test-indexing-es: + SQLALCHEMY_WARN_20=1 poetry run pytest ${LOCAL_INSTAFAIL_OPTIONS} -m "indexing and es" + +test-indexing-not-es: + SQLALCHEMY_WARN_20=1 poetry run pytest ${LOCAL_INSTAFAIL_OPTIONS} -m "indexing and not es" + +test-performance: + @echo "snovault has no performance tests right now, but it could." + +test-integrated: + @echo "snovault has no integrated tests right now, but it could." + test-static: NO_SERVER_FIXTURES=TRUE USE_SAMPLE_ENVUTILS=TRUE poetry run python -m pytest -vv -m static make lint -remote-test-indexing: - SQLALCHEMY_WARN_20=1 poetry run pytest ${GA_CICD_TESTING_OPTIONS} -m "indexing" +TEST_NAME ?= missing_TEST_NAME_parameter + +test-one: + SQLALCHEMY_WARN_20=1 poetry run python -m pytest ${LOCAL_MULTIFAIL_OPTIONS} -k ${TEST_NAME} + +remote-test: # Actually, we don't normally use this. Instead the GA workflow sets up two parallel tests. + make remote-test-indexing && make remote-test-unit remote-test-unit: + make remote-test-not-indexing + +remote-test-not-indexing: SQLALCHEMY_WARN_20=1 poetry run pytest ${GA_CICD_TESTING_OPTIONS} -m "not indexing" +remote-test-indexing: + make remote-test-indexing-not-es && make remote-test-indexing-es + +remote-test-indexing-es: + SQLALCHEMY_WARN_20=1 poetry run pytest ${GA_CICD_TESTING_OPTIONS} -m "indexing and es" + +remote-test-indexing-not-es: + SQLALCHEMY_WARN_20=1 poetry run pytest ${GA_CICD_TESTING_OPTIONS} -m "indexing and not es" + update: poetry update publish: - scripts/publish + poetry run publish-to-pypi publish-for-ga: - scripts/publish --noconfirm + poetry run publish-to-pypi --noconfirm -kill: +kill: # kills back-end processes associated with the application. Use with care. pkill -f postgres & - pkill -f elasticsearch & + pkill -f opensearch & + + +lint-full: + poetry run flake8 deploy/ || echo "flake8 failed for deploy/" + poetry run flake8 snovault/ || echo "flake8 failed for snovault/" + +lint: + poetry run flake8 deploy/ && poetry run flake8 snovault/ help: @make info info: @: $(info Here are some 'make' options:) + $(info - Use 'make aws-ip-ranges' to download latest ip range information. Invoked automatically when needed.) + $(info - Use 'make build' to build only application dependencies (or 'make macbuild' on OSX Catalina)) $(info - Use 'make clean' to clear out (non-python) dependencies) + $(info - Use 'make clear-poetry-cache' to clear the poetry pypi cache if in a bad state. (Safe, but later recaching can be slow.)) $(info - Use 'make configure' to install poetry, though 'make build' will do it automatically.) - $(info - Use 'make build' to build only application dependencies (or 'make macbuild' on OSX Catalina)) + $(info - Use 'make deploy1' to spin up postgres/elasticsearch and load inserts.) + $(info - Use 'make kill' to kill postgres and opensearch proccesses. Please use with care.) + $(info - Use 'make psql-dev' to start psql on data associated with an active 'make deploy1'.) + $(info - Use 'make psql-test' to start psql on data associated with an active test.) 
$(info - Use 'make test' to run tests with the normal options we use on travis) $(info - Use 'make update' to update dependencies (and the lock file)) diff --git a/base.ini b/base.ini new file mode 100644 index 000000000..51c5f8097 --- /dev/null +++ b/base.ini @@ -0,0 +1,62 @@ +[app:app] +use = egg:snovault +create_tables = true +sqlalchemy.url = postgresql:///encoded +retry.attempts = 3 +file_wfout_bucket = encoded-4dn-files +file_upload_profile_name = encoded-4dn-files +system_bucket = elasticbeanstalk-encoded-4dn-system +elasticsearch.server = 127.0.0.1:9200 +ontology_path = %(here)s/ontology.json +aws_ip_ranges_path = %(here)s/aws-ip-ranges.json +#this is to reroute downloads to something other than aws s3 +#download_proxy = https://download.encodeproject.org/ + +# # Only run ec2metadata on ec2 instances +# # XXX really need to reorganise ini files for more reuse +# hostname_command = command -v ec2metadata > /dev/null && ec2metadata --public-hostname || hostname +# +# multiauth.policies = auth0 session remoteuser accesskey +# multiauth.groupfinder = encoded.authorization.groupfinder +# +# multiauth.policy.session.namespace = mailto +# multiauth.policy.session.use = encoded.authentication.NamespacedAuthenticationPolicy +# multiauth.policy.session.base = pyramid.authentication.SessionAuthenticationPolicy +# +# multiauth.policy.remoteuser.namespace = remoteuser +# multiauth.policy.remoteuser.use = encoded.authentication.NamespacedAuthenticationPolicy +# multiauth.policy.remoteuser.base = pyramid.authentication.RemoteUserAuthenticationPolicy +# +# multiauth.policy.accesskey.namespace = accesskey +# multiauth.policy.accesskey.use = encoded.authentication.NamespacedAuthenticationPolicy +# multiauth.policy.accesskey.base = encoded.authentication.BasicAuthAuthenticationPolicy +# multiauth.policy.accesskey.check = encoded.authentication.basic_auth_check + +# multiauth.policy.auth0.use = encoded.authentication.NamespacedAuthenticationPolicy +# multiauth.policy.auth0.namespace = auth0 +# multiauth.policy.auth0.base = encoded.authentication.Auth0AuthenticationPolicy + +auth0.siteName = 4DN DCC Submission + +postgresql.statement_timeout = 120 +pyramid.default_locale_name = en +# Google analytics config +ga_config_location = ./src/encoded/static/ga_config.json + +# [composite:indexer] +# use = egg:encoded#indexer +# app = app +# path = /index +# timeout = 60 +# set embed_cache.capacity = 5000 +# set indexer = true + +# [composite:ingester] +# use = egg:encoded#ingester +# app = app +# path = /ingest +# timeout = 60 + +# [filter:memlimit] +# use = egg:encoded#memlimit +# rss_limit = 450MB diff --git a/bin/test b/bin/test index 110e4fcce..d3d02878a 100755 --- a/bin/test +++ b/bin/test @@ -10,14 +10,13 @@ while true; do done if [ "${do_moto_setup}" = 'yes' ]; then - make moto-setup -else - echo "NOTE: For efficiency, we're skipping the 'make moto-setup' step." - echo " You may need '--setup-moto' if moto server doesn't start." + echo "--setup-moto is no longer necessary." fi if [ "${TEST_JOB_ID}" = "" -a "${TRAVIS_JOB_ID}" != "" ]; then + echo "You've only set TRAVIS_JOB_ID, but should be setting only TEST_JOB_ID." + echo "PLEASE update your environment because that will soon break." 
export TEST_JOB_ID=${TRAVIS_JOB_ID} unset TRAVIS_JOB_ID fi diff --git a/development.ini.template b/development.ini.template new file mode 100644 index 000000000..fddce510f --- /dev/null +++ b/development.ini.template @@ -0,0 +1,94 @@ +### +# app configuration +# http://docs.pylonsproject.org/projects/pyramid/en/latest/narr/environment.html +### + +[app:app] +use = config:base.ini#app +sqlalchemy.url = postgresql://postgres@localhost:5441/postgres?host=/tmp/snovault/pgdata +blob_bucket = encoded-4dn-blobs +# metadata_bundles_bucket = ... not needed for snovault +load_test_only = true +create_tables = true +testing = true +postgresql.statement_timeout = 20 +mpindexer = true +indexer = true +elasticsearch.aws_auth = false +pyramid.reload_templates = true +pyramid.debug_authorization = false +pyramid.debug_notfound = true +pyramid.debug_routematch = false +pyramid.default_locale_name = en +# this line determines which load function is used in load_data +# most deployments use: "load_test_data = snovault.loadxl:load_test_data" +# but "load_test_data = snovault.loadxl:load_local_data" may also be appropriate. +load_test_data = snovault.loadxl:load_local_data +encoded_version = 100.200.300 +snovault_version = 200.300.400 +utils_version = 300.400.500 +eb_app_version = app-v-development-simulation +env.name = snovault-devlocal-${USER} + +[pipeline:debug] +pipeline = + egg:PasteDeploy#prefix + egg:repoze.debug#pdbpm + app +set pyramid.includes = + pyramid_translogger + +[composite:main] +use = egg:rutter#urlmap +/ = debug +/_indexer = indexer + +[composite:indexer] +use = config:base.ini#indexer + +### +# wsgi server configuration +### + +[server:main] +use = egg:waitress#main +host = 0.0.0.0 +port = 6543 +threads = 1 + +### +# logging configuration +# http://docs.pylonsproject.org/projects/pyramid/en/latest/narr/logging.html +### + +[loggers] +keys = root, wsgi, encoded + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = INFO +handlers = console + +[logger_wsgi] +level = DEBUG +handlers = +qualname = wsgi + +[logger_encoded] +level = DEBUG +handlers = +qualname = encoded + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s diff --git a/docs/source/invalidation.rst b/docs/source/invalidation.rst index 7543eca01..56d0bf125 100644 --- a/docs/source/invalidation.rst +++ b/docs/source/invalidation.rst @@ -7,7 +7,7 @@ Quick reference of important files (all under src/snovault/) * indexing_views.py - contains the function that builds the object model and other information that is actually indexed. 
This view is in charge of generating invalidation information * resource_views.py - contains functions for all commonly-used views, including those used in indexing_views * embed.py - key functions used in indexing_views.py and resource_views.py -* utils.py - contains functions used in the selective embedding process (used for @@embedded view) +* utils.py - contains functions used in the selective embedding process (used for the ``@@embedded`` view) * resources.py - contains class definitions for the basic items in Snovault and controls adding linked and rev_linked items * elasticsearch/indexer.py - coordinates the whole indexing/invalidation process = * elasticsearch/indexer_utils.py - holds function used to run invalidation @@ -16,7 +16,7 @@ Quick reference of important files (all under src/snovault/) Keeping elasticsearch in sync ----------------------------- -The /_indexer wsgi app (es_index_listener.py) drives the incremental indexing process. Previously in the ENCODE setup, the listener was driven by database transactions. We have moved this to a queue-based setup that does not operate on DB snapshot. At a fixed, short time interval (at time of writing: 3 seconds), the index listener calls the /index view (indexer.py) which works out what needs to be reindexed. The actual reindexing happens in parallel in multiprocessing subprocesses (mpindexer.py.) +The ``/_indexer`` wsgi app (``es_index_listener.py``) drives the incremental indexing process. Previously in the ENCODE setup, the listener was driven by database transactions. We have moved this to a queue-based setup that does not operate on a DB snapshot. At a fixed, short time interval (at time of writing: 3 seconds), the index listener calls the ``/index`` view (``indexer.py``) which works out what needs to be reindexed. The actual reindexing happens in parallel in multiprocessing subprocesses (``mpindexer.py``.) Keeping indexing on track requires a couple components: 1. keeping track of what needs to be invalidated @@ -25,40 +25,41 @@ Keeping indexing on track: Keeping track of what needs to be invalidated --------------------------------------------- -When rendering the view of an item to be indexed (@@index-data, see src/snovault/indexing_views.py), we record the set of uuids traversed when building the item. This is the _linked_uuids, which is stored as an reified attribute on the request used to build the item. These are the main source of information of what needs to be invalidated when items are updated. Whenever an item is changed, a search is performed to find all items that contain the changed item in their linked_uuids; these items are also reindexed. The function responsible for this is `find_uuids_for_indexing` in src/snovault/elasticsearch/indexer_utils.py. +When rendering the view of an item to be indexed (``@@index-data``, see ``src/snovault/indexing_views.py``), we record the set of uuids traversed when building the item. This is the ``_linked_uuids``, which is stored as a reified attribute on the request used to build the item. These are the main source of information of what needs to be invalidated when items are updated. Whenever an item is changed, a search is performed to find all items that contain the changed item in their ``linked_uuids``; these items are also reindexed. The function responsible for this is ``find_uuids_for_indexing`` in ``src/snovault/elasticsearch/indexer_utils.py``.
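Conceptually, that search amounts to a terms query against the ``linked_uuids`` field stored with each indexed document. Below is a minimal illustrative sketch of such a lookup using the ``elasticsearch`` Python client; the function name, index selection, and the assumption that a document's ``_id`` is the item uuid are illustrative only, and this is not the actual ``find_uuids_for_indexing`` implementation::

    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import scan

    def uuids_linking_to(es: Elasticsearch, changed_uuids, index='_all'):
        """Yield uuids of indexed items whose linked_uuids mention any changed uuid."""
        # Any document that recorded one of the changed uuids in its linked_uuids
        # at embedding time is stale and needs to be queued for reindexing.
        query = {'query': {'terms': {'linked_uuids': list(changed_uuids)}}}
        for hit in scan(es, index=index, query=query, _source=False):
            yield hit['_id']  # assumes the ES document _id is the item uuid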
-Items are added to the set of request._linked_uuids in the `item_with_links` function in src/snovault/resources.py. This is the function used to control the _linked_uuids because it is closely tied with the @@object view of an item (defined in resource_views.py). The embedding process traverses the `embedded_list` of an object and uses the @@object view to build the total embedded object by iteratively visiting all its component objects. See the `embedding-and-indexing.rst` document for more information. +Items are added to the set of ``request._linked_uuids`` in the ``item_with_links`` function in ``src/snovault/resources.py``. This is the function used to control the ``_linked_uuids`` because it is closely tied with the ``@@object`` view of an item (defined in ``resource_views.py``). The embedding process traverses the ``embedded_list`` of an object and uses the ``@@object`` view to build the total embedded object by iteratively visiting all its component objects. See the ``embedding-and-indexing.rst`` document for more information. -Reverse links (rev_links) must also be kept track of in the invalidation process. In our system, we represent rev_links as linkTos; the ENCODE concept of a linkFrom has been removed. rev_links are added to a request much the same as items are added to _linked_uuids. See the get_rev_links function in src/snovault/resources.py. This function keeps track of information of where the rev_link originates from and what item it targets, which is important information because many rev links could be visited in the process of building an embedded item. +Reverse links (``rev_links``) must also be kept track of in the invalidation process. In our system, we represent ``rev_links`` as ``linkTos``; the ENCODE concept of a ``linkFrom`` has been removed. ``rev_links`` are added to a request much the same as items are added to ``_linked_uuids``. See the ``get_rev_links`` function in ``src/snovault/resources.py``. This function keeps track of information of where the ``rev_link`` originates from and what item it targets, which is important information because many rev links could be visited in the process of building an embedded item. -Both _linked_uuids and rev_links are only kept track of if we are indexing. This is done by setting request._indexing_view to True in indexing_views.py. The information about the linked uuids and uuids that reverse link to an item are stored in the Elasticsearch document for the item in the `linked_uuids` and `uuids_that_rev_link_to_me` fields, respectively. +Both ``_linked_uuids`` and ``rev_links`` are only kept track of if we are indexing. This is done by setting ``request._indexing_view`` to ``True`` in ``indexing_views.py``. The information about the linked uuids and uuids that reverse link to an item are stored in the Elasticsearch document for the item in the ``linked_uuids`` and ``uuids_that_rev_link_to_me`` fields, respectively. Finding items to invalidate --------------------------- -This has already been somewhat covered, but it's worth reiterating. Whenever an item is indexed, the `find_uuids_for_indexing` function is run to find all items in Elasticsearch that contain the indexed item in their linked_uuids. In addition to this, any items added from the `uuids_rev_linked_to_me` list generated from the @@index-data view are also invalidated, since new reverse links may have been created and those items need to be updated as well. All of these items are added to the secondary queue after a primary item has been indexed. 
+This has already been somewhat covered, but it's worth reiterating. Whenever an item is indexed, the ``find_uuids_for_indexing`` function is run to find all items in Elasticsearch that contain the indexed item in their linked_uuids. In addition to this, any items added from the ``uuids_rev_linked_to_me`` list generated from the ``@@index-data`` view are also invalidated, since new reverse links may have been created and those items need to be updated as well. All of these items are added to the secondary queue after a primary item has been indexed. Total Reindexing ---------------- -Cases can arise where a total reindexing needs to be triggered. This should be done by using `bin/create-mapping`, which executes code in create_mapping.py. The point of this code is primarily to build the mappings needed to make the indices in Elasticsearch. Secondarily, create-mapping also takes care of queueing objects for indexing. Check out the code in that file for more information. A total re-creation of Elasticsearch indices followed by reindexing can be triggered using: +Cases can arise where a total reindexing needs to be triggered. This should be done by using ``bin/create-mapping``, which executes code in create_mapping.py. The point of this code is primarily to build the mappings needed to make the indices in Elasticsearch. Secondarily, create-mapping also takes care of queueing objects for indexing. Check out the code in that file for more information. A total re-creation of Elasticsearch indices followed by reindexing can be triggered using:: -`bin/create-mapping production.ini --app-name app` -NOTE: use `development.ini` locally + bin/create-mapping production.ini --app-name app + +NOTE: For local debugging, such as when you're running ``make deploy1`` or ``make deploy2``, use ``development.ini`` instead. Purging items ------------- -There is another spot `find_uuids_for_indexing` is used, and that is to find all linked items when attempting to "purge" an item (fully remove from postgresql and Elasticsearch). Before removing an item, it is crucial to ensure that all links to that item have been removed, which is why this function is used. +There is another spot ``find_uuids_for_indexing`` is used, and that is to find all linked items when attempting to "purge" an item (fully remove from postgresql and Elasticsearch). Before removing an item, it is crucial to ensure that all links to that item have been removed, which is why this function is used. Invalidation Scope ------------------------- -Previously, `find_uuids_for_indexing` would take the uuids from _linked_uuids as is. Now, if given a diff (passed from SQS on edit) the uuids returned will be pruned to determine whether or not they actually need to be invalidated. The indexer does this by examining the diff received from SQS and the embedded list of all invalidated item types. If it detects the diff modified something that is embedded in the invalidated item type, all uuids of this type are invalidated. If not, those uuids are not queued for reindexing since the edit does not change the embedded view of the item. The followind diagram serves as a visual aid. +Previously, ``find_uuids_for_indexing`` would take the uuids from ``_linked_uuids`` as is. Now, if given a diff (passed from SQS on edit) the uuids returned will be pruned to determine whether or not they actually need to be invalidated. The indexer does this by examining the diff received from SQS and the embedded list of all invalidated item types. 
If it detects the diff modified something that is embedded in the invalidated item type, all uuids of this type are invalidated. If not, those uuids are not queued for reindexing since the edit does not change the embedded view of the item. The following diagram serves as a visual aid. Note that the above behavior is ONLY activate upon receiving a diff, which is computed only on item edits. Upon item creation/deletion the process remains the same, since there is no diff. It is also very important to note that any additional fields used in calculated properties are embedded as well. If not, then a field could be modified that would affect an embedded field but such edit would be invisible because we did not know the field was used. diff --git a/poetry.lock b/poetry.lock index b71188f6d..204947874 100644 --- a/poetry.lock +++ b/poetry.lock @@ -12,25 +12,6 @@ files = [ {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, ] -[[package]] -name = "attrs" -version = "22.2.0" -description = "Classes Without Boilerplate" -category = "main" -optional = false -python-versions = ">=3.6" -files = [ - {file = "attrs-22.2.0-py3-none-any.whl", hash = "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836"}, - {file = "attrs-22.2.0.tar.gz", hash = "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"}, -] - -[package.extras] -cov = ["attrs[tests]", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] -dev = ["attrs[docs,tests]"] -docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope.interface"] -tests = ["attrs[tests-no-zope]", "zope.interface"] -tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] - [[package]] name = "aws-requests-auth" version = "0.4.3" @@ -67,18 +48,18 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.26.108" +version = "1.26.135" description = "The AWS SDK for Python" category = "main" optional = false python-versions = ">= 3.7" files = [ - {file = "boto3-1.26.108-py3-none-any.whl", hash = "sha256:27fd0fedfdcab0bbb672b0dd18844049b86de15dccf042488c6420f1101e8a10"}, - {file = "boto3-1.26.108.tar.gz", hash = "sha256:22bb45185eaf0e4548a08d35ec11b910d55fa14f5ccd1048d1b95c8615afcc53"}, + {file = "boto3-1.26.135-py3-none-any.whl", hash = "sha256:ba7ca9215a1026620741273da10d0d3cceb9f7649f7c101e616a287071826f9d"}, + {file = "boto3-1.26.135.tar.gz", hash = "sha256:23523d5d6aa51bba2461d67f6eb458d83b6a52d18e3d953b1ce71209b66462ec"}, ] [package.dependencies] -botocore = ">=1.29.108,<1.30.0" +botocore = ">=1.29.135,<1.30.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.6.0,<0.7.0" @@ -87,14 +68,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "boto3-stubs" -version = "1.26.108" -description = "Type annotations for boto3 1.26.108 generated with mypy-boto3-builder 7.14.5" +version = "1.26.135" +description = "Type annotations for boto3 1.26.135 generated with mypy-boto3-builder 7.14.5" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "boto3-stubs-1.26.108.tar.gz", hash = "sha256:c8a2a592d53f6986eac52ba43fced7701d312e18db66f7b4bd3306972d18e75a"}, - {file = "boto3_stubs-1.26.108-py3-none-any.whl", hash = 
"sha256:69f7d7f9b8c5c58b2d726374f1643a41a17bcf7789fed3491113a6ac3611e0bf"}, + {file = "boto3-stubs-1.26.135.tar.gz", hash = "sha256:d4be8288892056725a37d87ed07062d6acc34af59f2fead9eb30a8d214406d5a"}, + {file = "boto3_stubs-1.26.135-py3-none-any.whl", hash = "sha256:c3cfafeb34a6443dee8066924023f476a8919e3514343623b3d8de2a8f842a3b"}, ] [package.dependencies] @@ -108,7 +89,7 @@ account = ["mypy-boto3-account (>=1.26.0,<1.27.0)"] acm = ["mypy-boto3-acm (>=1.26.0,<1.27.0)"] acm-pca = ["mypy-boto3-acm-pca (>=1.26.0,<1.27.0)"] alexaforbusiness = ["mypy-boto3-alexaforbusiness (>=1.26.0,<1.27.0)"] -all = ["mypy-boto3-accessanalyzer (>=1.26.0,<1.27.0)", "mypy-boto3-account (>=1.26.0,<1.27.0)", "mypy-boto3-acm (>=1.26.0,<1.27.0)", "mypy-boto3-acm-pca (>=1.26.0,<1.27.0)", "mypy-boto3-alexaforbusiness (>=1.26.0,<1.27.0)", "mypy-boto3-amp (>=1.26.0,<1.27.0)", "mypy-boto3-amplify (>=1.26.0,<1.27.0)", "mypy-boto3-amplifybackend (>=1.26.0,<1.27.0)", "mypy-boto3-amplifyuibuilder (>=1.26.0,<1.27.0)", "mypy-boto3-apigateway (>=1.26.0,<1.27.0)", "mypy-boto3-apigatewaymanagementapi (>=1.26.0,<1.27.0)", "mypy-boto3-apigatewayv2 (>=1.26.0,<1.27.0)", "mypy-boto3-appconfig (>=1.26.0,<1.27.0)", "mypy-boto3-appconfigdata (>=1.26.0,<1.27.0)", "mypy-boto3-appflow (>=1.26.0,<1.27.0)", "mypy-boto3-appintegrations (>=1.26.0,<1.27.0)", "mypy-boto3-application-autoscaling (>=1.26.0,<1.27.0)", "mypy-boto3-application-insights (>=1.26.0,<1.27.0)", "mypy-boto3-applicationcostprofiler (>=1.26.0,<1.27.0)", "mypy-boto3-appmesh (>=1.26.0,<1.27.0)", "mypy-boto3-apprunner (>=1.26.0,<1.27.0)", "mypy-boto3-appstream (>=1.26.0,<1.27.0)", "mypy-boto3-appsync (>=1.26.0,<1.27.0)", "mypy-boto3-arc-zonal-shift (>=1.26.0,<1.27.0)", "mypy-boto3-athena (>=1.26.0,<1.27.0)", "mypy-boto3-auditmanager (>=1.26.0,<1.27.0)", "mypy-boto3-autoscaling (>=1.26.0,<1.27.0)", "mypy-boto3-autoscaling-plans (>=1.26.0,<1.27.0)", "mypy-boto3-backup (>=1.26.0,<1.27.0)", "mypy-boto3-backup-gateway (>=1.26.0,<1.27.0)", "mypy-boto3-backupstorage (>=1.26.0,<1.27.0)", "mypy-boto3-batch (>=1.26.0,<1.27.0)", "mypy-boto3-billingconductor (>=1.26.0,<1.27.0)", "mypy-boto3-braket (>=1.26.0,<1.27.0)", "mypy-boto3-budgets (>=1.26.0,<1.27.0)", "mypy-boto3-ce (>=1.26.0,<1.27.0)", "mypy-boto3-chime (>=1.26.0,<1.27.0)", "mypy-boto3-chime-sdk-identity (>=1.26.0,<1.27.0)", "mypy-boto3-chime-sdk-media-pipelines (>=1.26.0,<1.27.0)", "mypy-boto3-chime-sdk-meetings (>=1.26.0,<1.27.0)", "mypy-boto3-chime-sdk-messaging (>=1.26.0,<1.27.0)", "mypy-boto3-chime-sdk-voice (>=1.26.0,<1.27.0)", "mypy-boto3-cleanrooms (>=1.26.0,<1.27.0)", "mypy-boto3-cloud9 (>=1.26.0,<1.27.0)", "mypy-boto3-cloudcontrol (>=1.26.0,<1.27.0)", "mypy-boto3-clouddirectory (>=1.26.0,<1.27.0)", "mypy-boto3-cloudformation (>=1.26.0,<1.27.0)", "mypy-boto3-cloudfront (>=1.26.0,<1.27.0)", "mypy-boto3-cloudhsm (>=1.26.0,<1.27.0)", "mypy-boto3-cloudhsmv2 (>=1.26.0,<1.27.0)", "mypy-boto3-cloudsearch (>=1.26.0,<1.27.0)", "mypy-boto3-cloudsearchdomain (>=1.26.0,<1.27.0)", "mypy-boto3-cloudtrail (>=1.26.0,<1.27.0)", "mypy-boto3-cloudtrail-data (>=1.26.0,<1.27.0)", "mypy-boto3-cloudwatch (>=1.26.0,<1.27.0)", "mypy-boto3-codeartifact (>=1.26.0,<1.27.0)", "mypy-boto3-codebuild (>=1.26.0,<1.27.0)", "mypy-boto3-codecatalyst (>=1.26.0,<1.27.0)", "mypy-boto3-codecommit (>=1.26.0,<1.27.0)", "mypy-boto3-codedeploy (>=1.26.0,<1.27.0)", "mypy-boto3-codeguru-reviewer (>=1.26.0,<1.27.0)", "mypy-boto3-codeguruprofiler (>=1.26.0,<1.27.0)", "mypy-boto3-codepipeline (>=1.26.0,<1.27.0)", "mypy-boto3-codestar (>=1.26.0,<1.27.0)", 
"mypy-boto3-codestar-connections (>=1.26.0,<1.27.0)", "mypy-boto3-codestar-notifications (>=1.26.0,<1.27.0)", "mypy-boto3-cognito-identity (>=1.26.0,<1.27.0)", "mypy-boto3-cognito-idp (>=1.26.0,<1.27.0)", "mypy-boto3-cognito-sync (>=1.26.0,<1.27.0)", "mypy-boto3-comprehend (>=1.26.0,<1.27.0)", "mypy-boto3-comprehendmedical (>=1.26.0,<1.27.0)", "mypy-boto3-compute-optimizer (>=1.26.0,<1.27.0)", "mypy-boto3-config (>=1.26.0,<1.27.0)", "mypy-boto3-connect (>=1.26.0,<1.27.0)", "mypy-boto3-connect-contact-lens (>=1.26.0,<1.27.0)", "mypy-boto3-connectcampaigns (>=1.26.0,<1.27.0)", "mypy-boto3-connectcases (>=1.26.0,<1.27.0)", "mypy-boto3-connectparticipant (>=1.26.0,<1.27.0)", "mypy-boto3-controltower (>=1.26.0,<1.27.0)", "mypy-boto3-cur (>=1.26.0,<1.27.0)", "mypy-boto3-customer-profiles (>=1.26.0,<1.27.0)", "mypy-boto3-databrew (>=1.26.0,<1.27.0)", "mypy-boto3-dataexchange (>=1.26.0,<1.27.0)", "mypy-boto3-datapipeline (>=1.26.0,<1.27.0)", "mypy-boto3-datasync (>=1.26.0,<1.27.0)", "mypy-boto3-dax (>=1.26.0,<1.27.0)", "mypy-boto3-detective (>=1.26.0,<1.27.0)", "mypy-boto3-devicefarm (>=1.26.0,<1.27.0)", "mypy-boto3-devops-guru (>=1.26.0,<1.27.0)", "mypy-boto3-directconnect (>=1.26.0,<1.27.0)", "mypy-boto3-discovery (>=1.26.0,<1.27.0)", "mypy-boto3-dlm (>=1.26.0,<1.27.0)", "mypy-boto3-dms (>=1.26.0,<1.27.0)", "mypy-boto3-docdb (>=1.26.0,<1.27.0)", "mypy-boto3-docdb-elastic (>=1.26.0,<1.27.0)", "mypy-boto3-drs (>=1.26.0,<1.27.0)", "mypy-boto3-ds (>=1.26.0,<1.27.0)", "mypy-boto3-dynamodb (>=1.26.0,<1.27.0)", "mypy-boto3-dynamodbstreams (>=1.26.0,<1.27.0)", "mypy-boto3-ebs (>=1.26.0,<1.27.0)", "mypy-boto3-ec2 (>=1.26.0,<1.27.0)", "mypy-boto3-ec2-instance-connect (>=1.26.0,<1.27.0)", "mypy-boto3-ecr (>=1.26.0,<1.27.0)", "mypy-boto3-ecr-public (>=1.26.0,<1.27.0)", "mypy-boto3-ecs (>=1.26.0,<1.27.0)", "mypy-boto3-efs (>=1.26.0,<1.27.0)", "mypy-boto3-eks (>=1.26.0,<1.27.0)", "mypy-boto3-elastic-inference (>=1.26.0,<1.27.0)", "mypy-boto3-elasticache (>=1.26.0,<1.27.0)", "mypy-boto3-elasticbeanstalk (>=1.26.0,<1.27.0)", "mypy-boto3-elastictranscoder (>=1.26.0,<1.27.0)", "mypy-boto3-elb (>=1.26.0,<1.27.0)", "mypy-boto3-elbv2 (>=1.26.0,<1.27.0)", "mypy-boto3-emr (>=1.26.0,<1.27.0)", "mypy-boto3-emr-containers (>=1.26.0,<1.27.0)", "mypy-boto3-emr-serverless (>=1.26.0,<1.27.0)", "mypy-boto3-es (>=1.26.0,<1.27.0)", "mypy-boto3-events (>=1.26.0,<1.27.0)", "mypy-boto3-evidently (>=1.26.0,<1.27.0)", "mypy-boto3-finspace (>=1.26.0,<1.27.0)", "mypy-boto3-finspace-data (>=1.26.0,<1.27.0)", "mypy-boto3-firehose (>=1.26.0,<1.27.0)", "mypy-boto3-fis (>=1.26.0,<1.27.0)", "mypy-boto3-fms (>=1.26.0,<1.27.0)", "mypy-boto3-forecast (>=1.26.0,<1.27.0)", "mypy-boto3-forecastquery (>=1.26.0,<1.27.0)", "mypy-boto3-frauddetector (>=1.26.0,<1.27.0)", "mypy-boto3-fsx (>=1.26.0,<1.27.0)", "mypy-boto3-gamelift (>=1.26.0,<1.27.0)", "mypy-boto3-gamesparks (>=1.26.0,<1.27.0)", "mypy-boto3-glacier (>=1.26.0,<1.27.0)", "mypy-boto3-globalaccelerator (>=1.26.0,<1.27.0)", "mypy-boto3-glue (>=1.26.0,<1.27.0)", "mypy-boto3-grafana (>=1.26.0,<1.27.0)", "mypy-boto3-greengrass (>=1.26.0,<1.27.0)", "mypy-boto3-greengrassv2 (>=1.26.0,<1.27.0)", "mypy-boto3-groundstation (>=1.26.0,<1.27.0)", "mypy-boto3-guardduty (>=1.26.0,<1.27.0)", "mypy-boto3-health (>=1.26.0,<1.27.0)", "mypy-boto3-healthlake (>=1.26.0,<1.27.0)", "mypy-boto3-honeycode (>=1.26.0,<1.27.0)", "mypy-boto3-iam (>=1.26.0,<1.27.0)", "mypy-boto3-identitystore (>=1.26.0,<1.27.0)", "mypy-boto3-imagebuilder (>=1.26.0,<1.27.0)", "mypy-boto3-importexport (>=1.26.0,<1.27.0)", 
"mypy-boto3-inspector (>=1.26.0,<1.27.0)", "mypy-boto3-inspector2 (>=1.26.0,<1.27.0)", "mypy-boto3-internetmonitor (>=1.26.0,<1.27.0)", "mypy-boto3-iot (>=1.26.0,<1.27.0)", "mypy-boto3-iot-data (>=1.26.0,<1.27.0)", "mypy-boto3-iot-jobs-data (>=1.26.0,<1.27.0)", "mypy-boto3-iot-roborunner (>=1.26.0,<1.27.0)", "mypy-boto3-iot1click-devices (>=1.26.0,<1.27.0)", "mypy-boto3-iot1click-projects (>=1.26.0,<1.27.0)", "mypy-boto3-iotanalytics (>=1.26.0,<1.27.0)", "mypy-boto3-iotdeviceadvisor (>=1.26.0,<1.27.0)", "mypy-boto3-iotevents (>=1.26.0,<1.27.0)", "mypy-boto3-iotevents-data (>=1.26.0,<1.27.0)", "mypy-boto3-iotfleethub (>=1.26.0,<1.27.0)", "mypy-boto3-iotfleetwise (>=1.26.0,<1.27.0)", "mypy-boto3-iotsecuretunneling (>=1.26.0,<1.27.0)", "mypy-boto3-iotsitewise (>=1.26.0,<1.27.0)", "mypy-boto3-iotthingsgraph (>=1.26.0,<1.27.0)", "mypy-boto3-iottwinmaker (>=1.26.0,<1.27.0)", "mypy-boto3-iotwireless (>=1.26.0,<1.27.0)", "mypy-boto3-ivs (>=1.26.0,<1.27.0)", "mypy-boto3-ivs-realtime (>=1.26.0,<1.27.0)", "mypy-boto3-ivschat (>=1.26.0,<1.27.0)", "mypy-boto3-kafka (>=1.26.0,<1.27.0)", "mypy-boto3-kafkaconnect (>=1.26.0,<1.27.0)", "mypy-boto3-kendra (>=1.26.0,<1.27.0)", "mypy-boto3-kendra-ranking (>=1.26.0,<1.27.0)", "mypy-boto3-keyspaces (>=1.26.0,<1.27.0)", "mypy-boto3-kinesis (>=1.26.0,<1.27.0)", "mypy-boto3-kinesis-video-archived-media (>=1.26.0,<1.27.0)", "mypy-boto3-kinesis-video-media (>=1.26.0,<1.27.0)", "mypy-boto3-kinesis-video-signaling (>=1.26.0,<1.27.0)", "mypy-boto3-kinesis-video-webrtc-storage (>=1.26.0,<1.27.0)", "mypy-boto3-kinesisanalytics (>=1.26.0,<1.27.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.26.0,<1.27.0)", "mypy-boto3-kinesisvideo (>=1.26.0,<1.27.0)", "mypy-boto3-kms (>=1.26.0,<1.27.0)", "mypy-boto3-lakeformation (>=1.26.0,<1.27.0)", "mypy-boto3-lambda (>=1.26.0,<1.27.0)", "mypy-boto3-lex-models (>=1.26.0,<1.27.0)", "mypy-boto3-lex-runtime (>=1.26.0,<1.27.0)", "mypy-boto3-lexv2-models (>=1.26.0,<1.27.0)", "mypy-boto3-lexv2-runtime (>=1.26.0,<1.27.0)", "mypy-boto3-license-manager (>=1.26.0,<1.27.0)", "mypy-boto3-license-manager-linux-subscriptions (>=1.26.0,<1.27.0)", "mypy-boto3-license-manager-user-subscriptions (>=1.26.0,<1.27.0)", "mypy-boto3-lightsail (>=1.26.0,<1.27.0)", "mypy-boto3-location (>=1.26.0,<1.27.0)", "mypy-boto3-logs (>=1.26.0,<1.27.0)", "mypy-boto3-lookoutequipment (>=1.26.0,<1.27.0)", "mypy-boto3-lookoutmetrics (>=1.26.0,<1.27.0)", "mypy-boto3-lookoutvision (>=1.26.0,<1.27.0)", "mypy-boto3-m2 (>=1.26.0,<1.27.0)", "mypy-boto3-machinelearning (>=1.26.0,<1.27.0)", "mypy-boto3-macie (>=1.26.0,<1.27.0)", "mypy-boto3-macie2 (>=1.26.0,<1.27.0)", "mypy-boto3-managedblockchain (>=1.26.0,<1.27.0)", "mypy-boto3-marketplace-catalog (>=1.26.0,<1.27.0)", "mypy-boto3-marketplace-entitlement (>=1.26.0,<1.27.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.26.0,<1.27.0)", "mypy-boto3-mediaconnect (>=1.26.0,<1.27.0)", "mypy-boto3-mediaconvert (>=1.26.0,<1.27.0)", "mypy-boto3-medialive (>=1.26.0,<1.27.0)", "mypy-boto3-mediapackage (>=1.26.0,<1.27.0)", "mypy-boto3-mediapackage-vod (>=1.26.0,<1.27.0)", "mypy-boto3-mediastore (>=1.26.0,<1.27.0)", "mypy-boto3-mediastore-data (>=1.26.0,<1.27.0)", "mypy-boto3-mediatailor (>=1.26.0,<1.27.0)", "mypy-boto3-memorydb (>=1.26.0,<1.27.0)", "mypy-boto3-meteringmarketplace (>=1.26.0,<1.27.0)", "mypy-boto3-mgh (>=1.26.0,<1.27.0)", "mypy-boto3-mgn (>=1.26.0,<1.27.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.26.0,<1.27.0)", "mypy-boto3-migrationhub-config (>=1.26.0,<1.27.0)", "mypy-boto3-migrationhuborchestrator (>=1.26.0,<1.27.0)", 
"mypy-boto3-migrationhubstrategy (>=1.26.0,<1.27.0)", "mypy-boto3-mobile (>=1.26.0,<1.27.0)", "mypy-boto3-mq (>=1.26.0,<1.27.0)", "mypy-boto3-mturk (>=1.26.0,<1.27.0)", "mypy-boto3-mwaa (>=1.26.0,<1.27.0)", "mypy-boto3-neptune (>=1.26.0,<1.27.0)", "mypy-boto3-network-firewall (>=1.26.0,<1.27.0)", "mypy-boto3-networkmanager (>=1.26.0,<1.27.0)", "mypy-boto3-nimble (>=1.26.0,<1.27.0)", "mypy-boto3-oam (>=1.26.0,<1.27.0)", "mypy-boto3-omics (>=1.26.0,<1.27.0)", "mypy-boto3-opensearch (>=1.26.0,<1.27.0)", "mypy-boto3-opensearchserverless (>=1.26.0,<1.27.0)", "mypy-boto3-opsworks (>=1.26.0,<1.27.0)", "mypy-boto3-opsworkscm (>=1.26.0,<1.27.0)", "mypy-boto3-organizations (>=1.26.0,<1.27.0)", "mypy-boto3-outposts (>=1.26.0,<1.27.0)", "mypy-boto3-panorama (>=1.26.0,<1.27.0)", "mypy-boto3-personalize (>=1.26.0,<1.27.0)", "mypy-boto3-personalize-events (>=1.26.0,<1.27.0)", "mypy-boto3-personalize-runtime (>=1.26.0,<1.27.0)", "mypy-boto3-pi (>=1.26.0,<1.27.0)", "mypy-boto3-pinpoint (>=1.26.0,<1.27.0)", "mypy-boto3-pinpoint-email (>=1.26.0,<1.27.0)", "mypy-boto3-pinpoint-sms-voice (>=1.26.0,<1.27.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.26.0,<1.27.0)", "mypy-boto3-pipes (>=1.26.0,<1.27.0)", "mypy-boto3-polly (>=1.26.0,<1.27.0)", "mypy-boto3-pricing (>=1.26.0,<1.27.0)", "mypy-boto3-privatenetworks (>=1.26.0,<1.27.0)", "mypy-boto3-proton (>=1.26.0,<1.27.0)", "mypy-boto3-qldb (>=1.26.0,<1.27.0)", "mypy-boto3-qldb-session (>=1.26.0,<1.27.0)", "mypy-boto3-quicksight (>=1.26.0,<1.27.0)", "mypy-boto3-ram (>=1.26.0,<1.27.0)", "mypy-boto3-rbin (>=1.26.0,<1.27.0)", "mypy-boto3-rds (>=1.26.0,<1.27.0)", "mypy-boto3-rds-data (>=1.26.0,<1.27.0)", "mypy-boto3-redshift (>=1.26.0,<1.27.0)", "mypy-boto3-redshift-data (>=1.26.0,<1.27.0)", "mypy-boto3-redshift-serverless (>=1.26.0,<1.27.0)", "mypy-boto3-rekognition (>=1.26.0,<1.27.0)", "mypy-boto3-resiliencehub (>=1.26.0,<1.27.0)", "mypy-boto3-resource-explorer-2 (>=1.26.0,<1.27.0)", "mypy-boto3-resource-groups (>=1.26.0,<1.27.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.26.0,<1.27.0)", "mypy-boto3-robomaker (>=1.26.0,<1.27.0)", "mypy-boto3-rolesanywhere (>=1.26.0,<1.27.0)", "mypy-boto3-route53 (>=1.26.0,<1.27.0)", "mypy-boto3-route53-recovery-cluster (>=1.26.0,<1.27.0)", "mypy-boto3-route53-recovery-control-config (>=1.26.0,<1.27.0)", "mypy-boto3-route53-recovery-readiness (>=1.26.0,<1.27.0)", "mypy-boto3-route53domains (>=1.26.0,<1.27.0)", "mypy-boto3-route53resolver (>=1.26.0,<1.27.0)", "mypy-boto3-rum (>=1.26.0,<1.27.0)", "mypy-boto3-s3 (>=1.26.0,<1.27.0)", "mypy-boto3-s3control (>=1.26.0,<1.27.0)", "mypy-boto3-s3outposts (>=1.26.0,<1.27.0)", "mypy-boto3-sagemaker (>=1.26.0,<1.27.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.26.0,<1.27.0)", "mypy-boto3-sagemaker-edge (>=1.26.0,<1.27.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.26.0,<1.27.0)", "mypy-boto3-sagemaker-geospatial (>=1.26.0,<1.27.0)", "mypy-boto3-sagemaker-metrics (>=1.26.0,<1.27.0)", "mypy-boto3-sagemaker-runtime (>=1.26.0,<1.27.0)", "mypy-boto3-savingsplans (>=1.26.0,<1.27.0)", "mypy-boto3-scheduler (>=1.26.0,<1.27.0)", "mypy-boto3-schemas (>=1.26.0,<1.27.0)", "mypy-boto3-sdb (>=1.26.0,<1.27.0)", "mypy-boto3-secretsmanager (>=1.26.0,<1.27.0)", "mypy-boto3-securityhub (>=1.26.0,<1.27.0)", "mypy-boto3-securitylake (>=1.26.0,<1.27.0)", "mypy-boto3-serverlessrepo (>=1.26.0,<1.27.0)", "mypy-boto3-service-quotas (>=1.26.0,<1.27.0)", "mypy-boto3-servicecatalog (>=1.26.0,<1.27.0)", "mypy-boto3-servicecatalog-appregistry (>=1.26.0,<1.27.0)", "mypy-boto3-servicediscovery (>=1.26.0,<1.27.0)", 
"mypy-boto3-ses (>=1.26.0,<1.27.0)", "mypy-boto3-sesv2 (>=1.26.0,<1.27.0)", "mypy-boto3-shield (>=1.26.0,<1.27.0)", "mypy-boto3-signer (>=1.26.0,<1.27.0)", "mypy-boto3-simspaceweaver (>=1.26.0,<1.27.0)", "mypy-boto3-sms (>=1.26.0,<1.27.0)", "mypy-boto3-sms-voice (>=1.26.0,<1.27.0)", "mypy-boto3-snow-device-management (>=1.26.0,<1.27.0)", "mypy-boto3-snowball (>=1.26.0,<1.27.0)", "mypy-boto3-sns (>=1.26.0,<1.27.0)", "mypy-boto3-sqs (>=1.26.0,<1.27.0)", "mypy-boto3-ssm (>=1.26.0,<1.27.0)", "mypy-boto3-ssm-contacts (>=1.26.0,<1.27.0)", "mypy-boto3-ssm-incidents (>=1.26.0,<1.27.0)", "mypy-boto3-ssm-sap (>=1.26.0,<1.27.0)", "mypy-boto3-sso (>=1.26.0,<1.27.0)", "mypy-boto3-sso-admin (>=1.26.0,<1.27.0)", "mypy-boto3-sso-oidc (>=1.26.0,<1.27.0)", "mypy-boto3-stepfunctions (>=1.26.0,<1.27.0)", "mypy-boto3-storagegateway (>=1.26.0,<1.27.0)", "mypy-boto3-sts (>=1.26.0,<1.27.0)", "mypy-boto3-support (>=1.26.0,<1.27.0)", "mypy-boto3-support-app (>=1.26.0,<1.27.0)", "mypy-boto3-swf (>=1.26.0,<1.27.0)", "mypy-boto3-synthetics (>=1.26.0,<1.27.0)", "mypy-boto3-textract (>=1.26.0,<1.27.0)", "mypy-boto3-timestream-query (>=1.26.0,<1.27.0)", "mypy-boto3-timestream-write (>=1.26.0,<1.27.0)", "mypy-boto3-tnb (>=1.26.0,<1.27.0)", "mypy-boto3-transcribe (>=1.26.0,<1.27.0)", "mypy-boto3-transfer (>=1.26.0,<1.27.0)", "mypy-boto3-translate (>=1.26.0,<1.27.0)", "mypy-boto3-voice-id (>=1.26.0,<1.27.0)", "mypy-boto3-vpc-lattice (>=1.26.0,<1.27.0)", "mypy-boto3-waf (>=1.26.0,<1.27.0)", "mypy-boto3-waf-regional (>=1.26.0,<1.27.0)", "mypy-boto3-wafv2 (>=1.26.0,<1.27.0)", "mypy-boto3-wellarchitected (>=1.26.0,<1.27.0)", "mypy-boto3-wisdom (>=1.26.0,<1.27.0)", "mypy-boto3-workdocs (>=1.26.0,<1.27.0)", "mypy-boto3-worklink (>=1.26.0,<1.27.0)", "mypy-boto3-workmail (>=1.26.0,<1.27.0)", "mypy-boto3-workmailmessageflow (>=1.26.0,<1.27.0)", "mypy-boto3-workspaces (>=1.26.0,<1.27.0)", "mypy-boto3-workspaces-web (>=1.26.0,<1.27.0)", "mypy-boto3-xray (>=1.26.0,<1.27.0)"] +all = ["mypy-boto3-accessanalyzer (>=1.26.0,<1.27.0)", "mypy-boto3-account (>=1.26.0,<1.27.0)", "mypy-boto3-acm (>=1.26.0,<1.27.0)", "mypy-boto3-acm-pca (>=1.26.0,<1.27.0)", "mypy-boto3-alexaforbusiness (>=1.26.0,<1.27.0)", "mypy-boto3-amp (>=1.26.0,<1.27.0)", "mypy-boto3-amplify (>=1.26.0,<1.27.0)", "mypy-boto3-amplifybackend (>=1.26.0,<1.27.0)", "mypy-boto3-amplifyuibuilder (>=1.26.0,<1.27.0)", "mypy-boto3-apigateway (>=1.26.0,<1.27.0)", "mypy-boto3-apigatewaymanagementapi (>=1.26.0,<1.27.0)", "mypy-boto3-apigatewayv2 (>=1.26.0,<1.27.0)", "mypy-boto3-appconfig (>=1.26.0,<1.27.0)", "mypy-boto3-appconfigdata (>=1.26.0,<1.27.0)", "mypy-boto3-appflow (>=1.26.0,<1.27.0)", "mypy-boto3-appintegrations (>=1.26.0,<1.27.0)", "mypy-boto3-application-autoscaling (>=1.26.0,<1.27.0)", "mypy-boto3-application-insights (>=1.26.0,<1.27.0)", "mypy-boto3-applicationcostprofiler (>=1.26.0,<1.27.0)", "mypy-boto3-appmesh (>=1.26.0,<1.27.0)", "mypy-boto3-apprunner (>=1.26.0,<1.27.0)", "mypy-boto3-appstream (>=1.26.0,<1.27.0)", "mypy-boto3-appsync (>=1.26.0,<1.27.0)", "mypy-boto3-arc-zonal-shift (>=1.26.0,<1.27.0)", "mypy-boto3-athena (>=1.26.0,<1.27.0)", "mypy-boto3-auditmanager (>=1.26.0,<1.27.0)", "mypy-boto3-autoscaling (>=1.26.0,<1.27.0)", "mypy-boto3-autoscaling-plans (>=1.26.0,<1.27.0)", "mypy-boto3-backup (>=1.26.0,<1.27.0)", "mypy-boto3-backup-gateway (>=1.26.0,<1.27.0)", "mypy-boto3-backupstorage (>=1.26.0,<1.27.0)", "mypy-boto3-batch (>=1.26.0,<1.27.0)", "mypy-boto3-billingconductor (>=1.26.0,<1.27.0)", "mypy-boto3-braket (>=1.26.0,<1.27.0)", "mypy-boto3-budgets 
(>=1.26.0,<1.27.0)", "mypy-boto3-ce (>=1.26.0,<1.27.0)", "mypy-boto3-chime (>=1.26.0,<1.27.0)", "mypy-boto3-chime-sdk-identity (>=1.26.0,<1.27.0)", "mypy-boto3-chime-sdk-media-pipelines (>=1.26.0,<1.27.0)", "mypy-boto3-chime-sdk-meetings (>=1.26.0,<1.27.0)", "mypy-boto3-chime-sdk-messaging (>=1.26.0,<1.27.0)", "mypy-boto3-chime-sdk-voice (>=1.26.0,<1.27.0)", "mypy-boto3-cleanrooms (>=1.26.0,<1.27.0)", "mypy-boto3-cloud9 (>=1.26.0,<1.27.0)", "mypy-boto3-cloudcontrol (>=1.26.0,<1.27.0)", "mypy-boto3-clouddirectory (>=1.26.0,<1.27.0)", "mypy-boto3-cloudformation (>=1.26.0,<1.27.0)", "mypy-boto3-cloudfront (>=1.26.0,<1.27.0)", "mypy-boto3-cloudhsm (>=1.26.0,<1.27.0)", "mypy-boto3-cloudhsmv2 (>=1.26.0,<1.27.0)", "mypy-boto3-cloudsearch (>=1.26.0,<1.27.0)", "mypy-boto3-cloudsearchdomain (>=1.26.0,<1.27.0)", "mypy-boto3-cloudtrail (>=1.26.0,<1.27.0)", "mypy-boto3-cloudtrail-data (>=1.26.0,<1.27.0)", "mypy-boto3-cloudwatch (>=1.26.0,<1.27.0)", "mypy-boto3-codeartifact (>=1.26.0,<1.27.0)", "mypy-boto3-codebuild (>=1.26.0,<1.27.0)", "mypy-boto3-codecatalyst (>=1.26.0,<1.27.0)", "mypy-boto3-codecommit (>=1.26.0,<1.27.0)", "mypy-boto3-codedeploy (>=1.26.0,<1.27.0)", "mypy-boto3-codeguru-reviewer (>=1.26.0,<1.27.0)", "mypy-boto3-codeguruprofiler (>=1.26.0,<1.27.0)", "mypy-boto3-codepipeline (>=1.26.0,<1.27.0)", "mypy-boto3-codestar (>=1.26.0,<1.27.0)", "mypy-boto3-codestar-connections (>=1.26.0,<1.27.0)", "mypy-boto3-codestar-notifications (>=1.26.0,<1.27.0)", "mypy-boto3-cognito-identity (>=1.26.0,<1.27.0)", "mypy-boto3-cognito-idp (>=1.26.0,<1.27.0)", "mypy-boto3-cognito-sync (>=1.26.0,<1.27.0)", "mypy-boto3-comprehend (>=1.26.0,<1.27.0)", "mypy-boto3-comprehendmedical (>=1.26.0,<1.27.0)", "mypy-boto3-compute-optimizer (>=1.26.0,<1.27.0)", "mypy-boto3-config (>=1.26.0,<1.27.0)", "mypy-boto3-connect (>=1.26.0,<1.27.0)", "mypy-boto3-connect-contact-lens (>=1.26.0,<1.27.0)", "mypy-boto3-connectcampaigns (>=1.26.0,<1.27.0)", "mypy-boto3-connectcases (>=1.26.0,<1.27.0)", "mypy-boto3-connectparticipant (>=1.26.0,<1.27.0)", "mypy-boto3-controltower (>=1.26.0,<1.27.0)", "mypy-boto3-cur (>=1.26.0,<1.27.0)", "mypy-boto3-customer-profiles (>=1.26.0,<1.27.0)", "mypy-boto3-databrew (>=1.26.0,<1.27.0)", "mypy-boto3-dataexchange (>=1.26.0,<1.27.0)", "mypy-boto3-datapipeline (>=1.26.0,<1.27.0)", "mypy-boto3-datasync (>=1.26.0,<1.27.0)", "mypy-boto3-dax (>=1.26.0,<1.27.0)", "mypy-boto3-detective (>=1.26.0,<1.27.0)", "mypy-boto3-devicefarm (>=1.26.0,<1.27.0)", "mypy-boto3-devops-guru (>=1.26.0,<1.27.0)", "mypy-boto3-directconnect (>=1.26.0,<1.27.0)", "mypy-boto3-discovery (>=1.26.0,<1.27.0)", "mypy-boto3-dlm (>=1.26.0,<1.27.0)", "mypy-boto3-dms (>=1.26.0,<1.27.0)", "mypy-boto3-docdb (>=1.26.0,<1.27.0)", "mypy-boto3-docdb-elastic (>=1.26.0,<1.27.0)", "mypy-boto3-drs (>=1.26.0,<1.27.0)", "mypy-boto3-ds (>=1.26.0,<1.27.0)", "mypy-boto3-dynamodb (>=1.26.0,<1.27.0)", "mypy-boto3-dynamodbstreams (>=1.26.0,<1.27.0)", "mypy-boto3-ebs (>=1.26.0,<1.27.0)", "mypy-boto3-ec2 (>=1.26.0,<1.27.0)", "mypy-boto3-ec2-instance-connect (>=1.26.0,<1.27.0)", "mypy-boto3-ecr (>=1.26.0,<1.27.0)", "mypy-boto3-ecr-public (>=1.26.0,<1.27.0)", "mypy-boto3-ecs (>=1.26.0,<1.27.0)", "mypy-boto3-efs (>=1.26.0,<1.27.0)", "mypy-boto3-eks (>=1.26.0,<1.27.0)", "mypy-boto3-elastic-inference (>=1.26.0,<1.27.0)", "mypy-boto3-elasticache (>=1.26.0,<1.27.0)", "mypy-boto3-elasticbeanstalk (>=1.26.0,<1.27.0)", "mypy-boto3-elastictranscoder (>=1.26.0,<1.27.0)", "mypy-boto3-elb (>=1.26.0,<1.27.0)", "mypy-boto3-elbv2 (>=1.26.0,<1.27.0)", "mypy-boto3-emr 
(>=1.26.0,<1.27.0)", "mypy-boto3-emr-containers (>=1.26.0,<1.27.0)", "mypy-boto3-emr-serverless (>=1.26.0,<1.27.0)", "mypy-boto3-es (>=1.26.0,<1.27.0)", "mypy-boto3-events (>=1.26.0,<1.27.0)", "mypy-boto3-evidently (>=1.26.0,<1.27.0)", "mypy-boto3-finspace (>=1.26.0,<1.27.0)", "mypy-boto3-finspace-data (>=1.26.0,<1.27.0)", "mypy-boto3-firehose (>=1.26.0,<1.27.0)", "mypy-boto3-fis (>=1.26.0,<1.27.0)", "mypy-boto3-fms (>=1.26.0,<1.27.0)", "mypy-boto3-forecast (>=1.26.0,<1.27.0)", "mypy-boto3-forecastquery (>=1.26.0,<1.27.0)", "mypy-boto3-frauddetector (>=1.26.0,<1.27.0)", "mypy-boto3-fsx (>=1.26.0,<1.27.0)", "mypy-boto3-gamelift (>=1.26.0,<1.27.0)", "mypy-boto3-gamesparks (>=1.26.0,<1.27.0)", "mypy-boto3-glacier (>=1.26.0,<1.27.0)", "mypy-boto3-globalaccelerator (>=1.26.0,<1.27.0)", "mypy-boto3-glue (>=1.26.0,<1.27.0)", "mypy-boto3-grafana (>=1.26.0,<1.27.0)", "mypy-boto3-greengrass (>=1.26.0,<1.27.0)", "mypy-boto3-greengrassv2 (>=1.26.0,<1.27.0)", "mypy-boto3-groundstation (>=1.26.0,<1.27.0)", "mypy-boto3-guardduty (>=1.26.0,<1.27.0)", "mypy-boto3-health (>=1.26.0,<1.27.0)", "mypy-boto3-healthlake (>=1.26.0,<1.27.0)", "mypy-boto3-honeycode (>=1.26.0,<1.27.0)", "mypy-boto3-iam (>=1.26.0,<1.27.0)", "mypy-boto3-identitystore (>=1.26.0,<1.27.0)", "mypy-boto3-imagebuilder (>=1.26.0,<1.27.0)", "mypy-boto3-importexport (>=1.26.0,<1.27.0)", "mypy-boto3-inspector (>=1.26.0,<1.27.0)", "mypy-boto3-inspector2 (>=1.26.0,<1.27.0)", "mypy-boto3-internetmonitor (>=1.26.0,<1.27.0)", "mypy-boto3-iot (>=1.26.0,<1.27.0)", "mypy-boto3-iot-data (>=1.26.0,<1.27.0)", "mypy-boto3-iot-jobs-data (>=1.26.0,<1.27.0)", "mypy-boto3-iot-roborunner (>=1.26.0,<1.27.0)", "mypy-boto3-iot1click-devices (>=1.26.0,<1.27.0)", "mypy-boto3-iot1click-projects (>=1.26.0,<1.27.0)", "mypy-boto3-iotanalytics (>=1.26.0,<1.27.0)", "mypy-boto3-iotdeviceadvisor (>=1.26.0,<1.27.0)", "mypy-boto3-iotevents (>=1.26.0,<1.27.0)", "mypy-boto3-iotevents-data (>=1.26.0,<1.27.0)", "mypy-boto3-iotfleethub (>=1.26.0,<1.27.0)", "mypy-boto3-iotfleetwise (>=1.26.0,<1.27.0)", "mypy-boto3-iotsecuretunneling (>=1.26.0,<1.27.0)", "mypy-boto3-iotsitewise (>=1.26.0,<1.27.0)", "mypy-boto3-iotthingsgraph (>=1.26.0,<1.27.0)", "mypy-boto3-iottwinmaker (>=1.26.0,<1.27.0)", "mypy-boto3-iotwireless (>=1.26.0,<1.27.0)", "mypy-boto3-ivs (>=1.26.0,<1.27.0)", "mypy-boto3-ivs-realtime (>=1.26.0,<1.27.0)", "mypy-boto3-ivschat (>=1.26.0,<1.27.0)", "mypy-boto3-kafka (>=1.26.0,<1.27.0)", "mypy-boto3-kafkaconnect (>=1.26.0,<1.27.0)", "mypy-boto3-kendra (>=1.26.0,<1.27.0)", "mypy-boto3-kendra-ranking (>=1.26.0,<1.27.0)", "mypy-boto3-keyspaces (>=1.26.0,<1.27.0)", "mypy-boto3-kinesis (>=1.26.0,<1.27.0)", "mypy-boto3-kinesis-video-archived-media (>=1.26.0,<1.27.0)", "mypy-boto3-kinesis-video-media (>=1.26.0,<1.27.0)", "mypy-boto3-kinesis-video-signaling (>=1.26.0,<1.27.0)", "mypy-boto3-kinesis-video-webrtc-storage (>=1.26.0,<1.27.0)", "mypy-boto3-kinesisanalytics (>=1.26.0,<1.27.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.26.0,<1.27.0)", "mypy-boto3-kinesisvideo (>=1.26.0,<1.27.0)", "mypy-boto3-kms (>=1.26.0,<1.27.0)", "mypy-boto3-lakeformation (>=1.26.0,<1.27.0)", "mypy-boto3-lambda (>=1.26.0,<1.27.0)", "mypy-boto3-lex-models (>=1.26.0,<1.27.0)", "mypy-boto3-lex-runtime (>=1.26.0,<1.27.0)", "mypy-boto3-lexv2-models (>=1.26.0,<1.27.0)", "mypy-boto3-lexv2-runtime (>=1.26.0,<1.27.0)", "mypy-boto3-license-manager (>=1.26.0,<1.27.0)", "mypy-boto3-license-manager-linux-subscriptions (>=1.26.0,<1.27.0)", "mypy-boto3-license-manager-user-subscriptions (>=1.26.0,<1.27.0)", 
"mypy-boto3-lightsail (>=1.26.0,<1.27.0)", "mypy-boto3-location (>=1.26.0,<1.27.0)", "mypy-boto3-logs (>=1.26.0,<1.27.0)", "mypy-boto3-lookoutequipment (>=1.26.0,<1.27.0)", "mypy-boto3-lookoutmetrics (>=1.26.0,<1.27.0)", "mypy-boto3-lookoutvision (>=1.26.0,<1.27.0)", "mypy-boto3-m2 (>=1.26.0,<1.27.0)", "mypy-boto3-machinelearning (>=1.26.0,<1.27.0)", "mypy-boto3-macie (>=1.26.0,<1.27.0)", "mypy-boto3-macie2 (>=1.26.0,<1.27.0)", "mypy-boto3-managedblockchain (>=1.26.0,<1.27.0)", "mypy-boto3-marketplace-catalog (>=1.26.0,<1.27.0)", "mypy-boto3-marketplace-entitlement (>=1.26.0,<1.27.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.26.0,<1.27.0)", "mypy-boto3-mediaconnect (>=1.26.0,<1.27.0)", "mypy-boto3-mediaconvert (>=1.26.0,<1.27.0)", "mypy-boto3-medialive (>=1.26.0,<1.27.0)", "mypy-boto3-mediapackage (>=1.26.0,<1.27.0)", "mypy-boto3-mediapackage-vod (>=1.26.0,<1.27.0)", "mypy-boto3-mediastore (>=1.26.0,<1.27.0)", "mypy-boto3-mediastore-data (>=1.26.0,<1.27.0)", "mypy-boto3-mediatailor (>=1.26.0,<1.27.0)", "mypy-boto3-memorydb (>=1.26.0,<1.27.0)", "mypy-boto3-meteringmarketplace (>=1.26.0,<1.27.0)", "mypy-boto3-mgh (>=1.26.0,<1.27.0)", "mypy-boto3-mgn (>=1.26.0,<1.27.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.26.0,<1.27.0)", "mypy-boto3-migrationhub-config (>=1.26.0,<1.27.0)", "mypy-boto3-migrationhuborchestrator (>=1.26.0,<1.27.0)", "mypy-boto3-migrationhubstrategy (>=1.26.0,<1.27.0)", "mypy-boto3-mobile (>=1.26.0,<1.27.0)", "mypy-boto3-mq (>=1.26.0,<1.27.0)", "mypy-boto3-mturk (>=1.26.0,<1.27.0)", "mypy-boto3-mwaa (>=1.26.0,<1.27.0)", "mypy-boto3-neptune (>=1.26.0,<1.27.0)", "mypy-boto3-network-firewall (>=1.26.0,<1.27.0)", "mypy-boto3-networkmanager (>=1.26.0,<1.27.0)", "mypy-boto3-nimble (>=1.26.0,<1.27.0)", "mypy-boto3-oam (>=1.26.0,<1.27.0)", "mypy-boto3-omics (>=1.26.0,<1.27.0)", "mypy-boto3-opensearch (>=1.26.0,<1.27.0)", "mypy-boto3-opensearchserverless (>=1.26.0,<1.27.0)", "mypy-boto3-opsworks (>=1.26.0,<1.27.0)", "mypy-boto3-opsworkscm (>=1.26.0,<1.27.0)", "mypy-boto3-organizations (>=1.26.0,<1.27.0)", "mypy-boto3-osis (>=1.26.0,<1.27.0)", "mypy-boto3-outposts (>=1.26.0,<1.27.0)", "mypy-boto3-panorama (>=1.26.0,<1.27.0)", "mypy-boto3-personalize (>=1.26.0,<1.27.0)", "mypy-boto3-personalize-events (>=1.26.0,<1.27.0)", "mypy-boto3-personalize-runtime (>=1.26.0,<1.27.0)", "mypy-boto3-pi (>=1.26.0,<1.27.0)", "mypy-boto3-pinpoint (>=1.26.0,<1.27.0)", "mypy-boto3-pinpoint-email (>=1.26.0,<1.27.0)", "mypy-boto3-pinpoint-sms-voice (>=1.26.0,<1.27.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.26.0,<1.27.0)", "mypy-boto3-pipes (>=1.26.0,<1.27.0)", "mypy-boto3-polly (>=1.26.0,<1.27.0)", "mypy-boto3-pricing (>=1.26.0,<1.27.0)", "mypy-boto3-privatenetworks (>=1.26.0,<1.27.0)", "mypy-boto3-proton (>=1.26.0,<1.27.0)", "mypy-boto3-qldb (>=1.26.0,<1.27.0)", "mypy-boto3-qldb-session (>=1.26.0,<1.27.0)", "mypy-boto3-quicksight (>=1.26.0,<1.27.0)", "mypy-boto3-ram (>=1.26.0,<1.27.0)", "mypy-boto3-rbin (>=1.26.0,<1.27.0)", "mypy-boto3-rds (>=1.26.0,<1.27.0)", "mypy-boto3-rds-data (>=1.26.0,<1.27.0)", "mypy-boto3-redshift (>=1.26.0,<1.27.0)", "mypy-boto3-redshift-data (>=1.26.0,<1.27.0)", "mypy-boto3-redshift-serverless (>=1.26.0,<1.27.0)", "mypy-boto3-rekognition (>=1.26.0,<1.27.0)", "mypy-boto3-resiliencehub (>=1.26.0,<1.27.0)", "mypy-boto3-resource-explorer-2 (>=1.26.0,<1.27.0)", "mypy-boto3-resource-groups (>=1.26.0,<1.27.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.26.0,<1.27.0)", "mypy-boto3-robomaker (>=1.26.0,<1.27.0)", "mypy-boto3-rolesanywhere (>=1.26.0,<1.27.0)", 
"mypy-boto3-route53 (>=1.26.0,<1.27.0)", "mypy-boto3-route53-recovery-cluster (>=1.26.0,<1.27.0)", "mypy-boto3-route53-recovery-control-config (>=1.26.0,<1.27.0)", "mypy-boto3-route53-recovery-readiness (>=1.26.0,<1.27.0)", "mypy-boto3-route53domains (>=1.26.0,<1.27.0)", "mypy-boto3-route53resolver (>=1.26.0,<1.27.0)", "mypy-boto3-rum (>=1.26.0,<1.27.0)", "mypy-boto3-s3 (>=1.26.0,<1.27.0)", "mypy-boto3-s3control (>=1.26.0,<1.27.0)", "mypy-boto3-s3outposts (>=1.26.0,<1.27.0)", "mypy-boto3-sagemaker (>=1.26.0,<1.27.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.26.0,<1.27.0)", "mypy-boto3-sagemaker-edge (>=1.26.0,<1.27.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.26.0,<1.27.0)", "mypy-boto3-sagemaker-geospatial (>=1.26.0,<1.27.0)", "mypy-boto3-sagemaker-metrics (>=1.26.0,<1.27.0)", "mypy-boto3-sagemaker-runtime (>=1.26.0,<1.27.0)", "mypy-boto3-savingsplans (>=1.26.0,<1.27.0)", "mypy-boto3-scheduler (>=1.26.0,<1.27.0)", "mypy-boto3-schemas (>=1.26.0,<1.27.0)", "mypy-boto3-sdb (>=1.26.0,<1.27.0)", "mypy-boto3-secretsmanager (>=1.26.0,<1.27.0)", "mypy-boto3-securityhub (>=1.26.0,<1.27.0)", "mypy-boto3-securitylake (>=1.26.0,<1.27.0)", "mypy-boto3-serverlessrepo (>=1.26.0,<1.27.0)", "mypy-boto3-service-quotas (>=1.26.0,<1.27.0)", "mypy-boto3-servicecatalog (>=1.26.0,<1.27.0)", "mypy-boto3-servicecatalog-appregistry (>=1.26.0,<1.27.0)", "mypy-boto3-servicediscovery (>=1.26.0,<1.27.0)", "mypy-boto3-ses (>=1.26.0,<1.27.0)", "mypy-boto3-sesv2 (>=1.26.0,<1.27.0)", "mypy-boto3-shield (>=1.26.0,<1.27.0)", "mypy-boto3-signer (>=1.26.0,<1.27.0)", "mypy-boto3-simspaceweaver (>=1.26.0,<1.27.0)", "mypy-boto3-sms (>=1.26.0,<1.27.0)", "mypy-boto3-sms-voice (>=1.26.0,<1.27.0)", "mypy-boto3-snow-device-management (>=1.26.0,<1.27.0)", "mypy-boto3-snowball (>=1.26.0,<1.27.0)", "mypy-boto3-sns (>=1.26.0,<1.27.0)", "mypy-boto3-sqs (>=1.26.0,<1.27.0)", "mypy-boto3-ssm (>=1.26.0,<1.27.0)", "mypy-boto3-ssm-contacts (>=1.26.0,<1.27.0)", "mypy-boto3-ssm-incidents (>=1.26.0,<1.27.0)", "mypy-boto3-ssm-sap (>=1.26.0,<1.27.0)", "mypy-boto3-sso (>=1.26.0,<1.27.0)", "mypy-boto3-sso-admin (>=1.26.0,<1.27.0)", "mypy-boto3-sso-oidc (>=1.26.0,<1.27.0)", "mypy-boto3-stepfunctions (>=1.26.0,<1.27.0)", "mypy-boto3-storagegateway (>=1.26.0,<1.27.0)", "mypy-boto3-sts (>=1.26.0,<1.27.0)", "mypy-boto3-support (>=1.26.0,<1.27.0)", "mypy-boto3-support-app (>=1.26.0,<1.27.0)", "mypy-boto3-swf (>=1.26.0,<1.27.0)", "mypy-boto3-synthetics (>=1.26.0,<1.27.0)", "mypy-boto3-textract (>=1.26.0,<1.27.0)", "mypy-boto3-timestream-query (>=1.26.0,<1.27.0)", "mypy-boto3-timestream-write (>=1.26.0,<1.27.0)", "mypy-boto3-tnb (>=1.26.0,<1.27.0)", "mypy-boto3-transcribe (>=1.26.0,<1.27.0)", "mypy-boto3-transfer (>=1.26.0,<1.27.0)", "mypy-boto3-translate (>=1.26.0,<1.27.0)", "mypy-boto3-voice-id (>=1.26.0,<1.27.0)", "mypy-boto3-vpc-lattice (>=1.26.0,<1.27.0)", "mypy-boto3-waf (>=1.26.0,<1.27.0)", "mypy-boto3-waf-regional (>=1.26.0,<1.27.0)", "mypy-boto3-wafv2 (>=1.26.0,<1.27.0)", "mypy-boto3-wellarchitected (>=1.26.0,<1.27.0)", "mypy-boto3-wisdom (>=1.26.0,<1.27.0)", "mypy-boto3-workdocs (>=1.26.0,<1.27.0)", "mypy-boto3-worklink (>=1.26.0,<1.27.0)", "mypy-boto3-workmail (>=1.26.0,<1.27.0)", "mypy-boto3-workmailmessageflow (>=1.26.0,<1.27.0)", "mypy-boto3-workspaces (>=1.26.0,<1.27.0)", "mypy-boto3-workspaces-web (>=1.26.0,<1.27.0)", "mypy-boto3-xray (>=1.26.0,<1.27.0)"] amp = ["mypy-boto3-amp (>=1.26.0,<1.27.0)"] amplify = ["mypy-boto3-amplify (>=1.26.0,<1.27.0)"] amplifybackend = ["mypy-boto3-amplifybackend (>=1.26.0,<1.27.0)"] @@ -137,7 +118,7 @@ 
backup-gateway = ["mypy-boto3-backup-gateway (>=1.26.0,<1.27.0)"] backupstorage = ["mypy-boto3-backupstorage (>=1.26.0,<1.27.0)"] batch = ["mypy-boto3-batch (>=1.26.0,<1.27.0)"] billingconductor = ["mypy-boto3-billingconductor (>=1.26.0,<1.27.0)"] -boto3 = ["boto3 (==1.26.108)", "botocore (==1.29.108)"] +boto3 = ["boto3 (==1.26.135)", "botocore (==1.29.135)"] braket = ["mypy-boto3-braket (>=1.26.0,<1.27.0)"] budgets = ["mypy-boto3-budgets (>=1.26.0,<1.27.0)"] ce = ["mypy-boto3-ce (>=1.26.0,<1.27.0)"] @@ -342,6 +323,7 @@ opensearchserverless = ["mypy-boto3-opensearchserverless (>=1.26.0,<1.27.0)"] opsworks = ["mypy-boto3-opsworks (>=1.26.0,<1.27.0)"] opsworkscm = ["mypy-boto3-opsworkscm (>=1.26.0,<1.27.0)"] organizations = ["mypy-boto3-organizations (>=1.26.0,<1.27.0)"] +osis = ["mypy-boto3-osis (>=1.26.0,<1.27.0)"] outposts = ["mypy-boto3-outposts (>=1.26.0,<1.27.0)"] panorama = ["mypy-boto3-panorama (>=1.26.0,<1.27.0)"] personalize = ["mypy-boto3-personalize (>=1.26.0,<1.27.0)"] @@ -452,14 +434,14 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"] [[package]] name = "botocore" -version = "1.29.108" +version = "1.29.135" description = "Low-level, data-driven core of boto 3." category = "main" optional = false python-versions = ">= 3.7" files = [ - {file = "botocore-1.29.108-py3-none-any.whl", hash = "sha256:e5df8bd26971d6b257d9af2ec923548856decd9ff47bea1ab2736a4231bddac2"}, - {file = "botocore-1.29.108.tar.gz", hash = "sha256:e62154af6771690e4833f6102a5e31fcc3687449e6110ae32919d134394a29ea"}, + {file = "botocore-1.29.135-py3-none-any.whl", hash = "sha256:06502a4473924ef60ac0de908385a5afab9caee6c5b49cf6a330fab0d76ddf5f"}, + {file = "botocore-1.29.135.tar.gz", hash = "sha256:0c61d4e5e04fe5329fa65da6b31492ef9d0d5174d72fc2af69de2ed0f87804ca"}, ] [package.dependencies] @@ -472,14 +454,14 @@ crt = ["awscrt (==0.16.9)"] [[package]] name = "botocore-stubs" -version = "1.29.108" +version = "1.29.130" description = "Type annotations and code completion for botocore" category = "dev" optional = false python-versions = ">=3.7,<4.0" files = [ - {file = "botocore_stubs-1.29.108-py3-none-any.whl", hash = "sha256:0b2990b2f4c80e6336e4461554152436221e41aad3b810d3b86deefa76bad02d"}, - {file = "botocore_stubs-1.29.108.tar.gz", hash = "sha256:83c4307beb3767131a5f1f97aee7270ba25773d2227ced90cc687f2a01187cb6"}, + {file = "botocore_stubs-1.29.130-py3-none-any.whl", hash = "sha256:622c4a5cd740498439008d81c5ded612146f4f0d575341c12591f978edbbe733"}, + {file = "botocore_stubs-1.29.130.tar.gz", hash = "sha256:5f6f1967d23c45834858a055cbf65b66863f9f28d05f32f57bf52864a13512d9"}, ] [package.dependencies] @@ -488,21 +470,21 @@ typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.9\""} [[package]] name = "certifi" -version = "2022.12.7" +version = "2023.5.7" description = "Python package for providing Mozilla's CA Bundle." category = "main" optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"}, - {file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"}, + {file = "certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"}, + {file = "certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"}, ] [[package]] name = "cffi" version = "1.15.1" description = "Foreign Function Interface for Python calling C code." 
-category = "dev" +category = "main" optional = false python-versions = "*" files = [ @@ -693,62 +675,63 @@ files = [ [[package]] name = "coverage" -version = "6.5.0" +version = "7.2.6" description = "Code coverage measurement for Python" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "coverage-6.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ef8674b0ee8cc11e2d574e3e2998aea5df5ab242e012286824ea3c6970580e53"}, - {file = "coverage-6.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:784f53ebc9f3fd0e2a3f6a78b2be1bd1f5575d7863e10c6e12504f240fd06660"}, - {file = "coverage-6.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b4a5be1748d538a710f87542f22c2cad22f80545a847ad91ce45e77417293eb4"}, - {file = "coverage-6.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83516205e254a0cb77d2d7bb3632ee019d93d9f4005de31dca0a8c3667d5bc04"}, - {file = "coverage-6.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af4fffaffc4067232253715065e30c5a7ec6faac36f8fc8d6f64263b15f74db0"}, - {file = "coverage-6.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:97117225cdd992a9c2a5515db1f66b59db634f59d0679ca1fa3fe8da32749cae"}, - {file = "coverage-6.5.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1170fa54185845505fbfa672f1c1ab175446c887cce8212c44149581cf2d466"}, - {file = "coverage-6.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:11b990d520ea75e7ee8dcab5bc908072aaada194a794db9f6d7d5cfd19661e5a"}, - {file = "coverage-6.5.0-cp310-cp310-win32.whl", hash = "sha256:5dbec3b9095749390c09ab7c89d314727f18800060d8d24e87f01fb9cfb40b32"}, - {file = "coverage-6.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:59f53f1dc5b656cafb1badd0feb428c1e7bc19b867479ff72f7a9dd9b479f10e"}, - {file = "coverage-6.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4a5375e28c5191ac38cca59b38edd33ef4cc914732c916f2929029b4bfb50795"}, - {file = "coverage-6.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4ed2820d919351f4167e52425e096af41bfabacb1857186c1ea32ff9983ed75"}, - {file = "coverage-6.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:33a7da4376d5977fbf0a8ed91c4dffaaa8dbf0ddbf4c8eea500a2486d8bc4d7b"}, - {file = "coverage-6.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8fb6cf131ac4070c9c5a3e21de0f7dc5a0fbe8bc77c9456ced896c12fcdad91"}, - {file = "coverage-6.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a6b7d95969b8845250586f269e81e5dfdd8ff828ddeb8567a4a2eaa7313460c4"}, - {file = "coverage-6.5.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:1ef221513e6f68b69ee9e159506d583d31aa3567e0ae84eaad9d6ec1107dddaa"}, - {file = "coverage-6.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cca4435eebea7962a52bdb216dec27215d0df64cf27fc1dd538415f5d2b9da6b"}, - {file = "coverage-6.5.0-cp311-cp311-win32.whl", hash = "sha256:98e8a10b7a314f454d9eff4216a9a94d143a7ee65018dd12442e898ee2310578"}, - {file = "coverage-6.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:bc8ef5e043a2af066fa8cbfc6e708d58017024dc4345a1f9757b329a249f041b"}, - {file = "coverage-6.5.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4433b90fae13f86fafff0b326453dd42fc9a639a0d9e4eec4d366436d1a41b6d"}, - {file = 
"coverage-6.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4f05d88d9a80ad3cac6244d36dd89a3c00abc16371769f1340101d3cb899fc3"}, - {file = "coverage-6.5.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:94e2565443291bd778421856bc975d351738963071e9b8839ca1fc08b42d4bef"}, - {file = "coverage-6.5.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:027018943386e7b942fa832372ebc120155fd970837489896099f5cfa2890f79"}, - {file = "coverage-6.5.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:255758a1e3b61db372ec2736c8e2a1fdfaf563977eedbdf131de003ca5779b7d"}, - {file = "coverage-6.5.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:851cf4ff24062c6aec510a454b2584f6e998cada52d4cb58c5e233d07172e50c"}, - {file = "coverage-6.5.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:12adf310e4aafddc58afdb04d686795f33f4d7a6fa67a7a9d4ce7d6ae24d949f"}, - {file = "coverage-6.5.0-cp37-cp37m-win32.whl", hash = "sha256:b5604380f3415ba69de87a289a2b56687faa4fe04dbee0754bfcae433489316b"}, - {file = "coverage-6.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:4a8dbc1f0fbb2ae3de73eb0bdbb914180c7abfbf258e90b311dcd4f585d44bd2"}, - {file = "coverage-6.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d900bb429fdfd7f511f868cedd03a6bbb142f3f9118c09b99ef8dc9bf9643c3c"}, - {file = "coverage-6.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2198ea6fc548de52adc826f62cb18554caedfb1d26548c1b7c88d8f7faa8f6ba"}, - {file = "coverage-6.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c4459b3de97b75e3bd6b7d4b7f0db13f17f504f3d13e2a7c623786289dd670e"}, - {file = "coverage-6.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:20c8ac5386253717e5ccc827caad43ed66fea0efe255727b1053a8154d952398"}, - {file = "coverage-6.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b07130585d54fe8dff3d97b93b0e20290de974dc8177c320aeaf23459219c0b"}, - {file = "coverage-6.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:dbdb91cd8c048c2b09eb17713b0c12a54fbd587d79adcebad543bc0cd9a3410b"}, - {file = "coverage-6.5.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:de3001a203182842a4630e7b8d1a2c7c07ec1b45d3084a83d5d227a3806f530f"}, - {file = "coverage-6.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e07f4a4a9b41583d6eabec04f8b68076ab3cd44c20bd29332c6572dda36f372e"}, - {file = "coverage-6.5.0-cp38-cp38-win32.whl", hash = "sha256:6d4817234349a80dbf03640cec6109cd90cba068330703fa65ddf56b60223a6d"}, - {file = "coverage-6.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:7ccf362abd726b0410bf8911c31fbf97f09f8f1061f8c1cf03dfc4b6372848f6"}, - {file = "coverage-6.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:633713d70ad6bfc49b34ead4060531658dc6dfc9b3eb7d8a716d5873377ab745"}, - {file = "coverage-6.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:95203854f974e07af96358c0b261f1048d8e1083f2de9b1c565e1be4a3a48cfc"}, - {file = "coverage-6.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9023e237f4c02ff739581ef35969c3739445fb059b060ca51771e69101efffe"}, - {file = "coverage-6.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:265de0fa6778d07de30bcf4d9dc471c3dc4314a23a3c6603d356a3c9abc2dfcf"}, - {file = 
"coverage-6.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f830ed581b45b82451a40faabb89c84e1a998124ee4212d440e9c6cf70083e5"}, - {file = "coverage-6.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7b6be138d61e458e18d8e6ddcddd36dd96215edfe5f1168de0b1b32635839b62"}, - {file = "coverage-6.5.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:42eafe6778551cf006a7c43153af1211c3aaab658d4d66fa5fcc021613d02518"}, - {file = "coverage-6.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:723e8130d4ecc8f56e9a611e73b31219595baa3bb252d539206f7bbbab6ffc1f"}, - {file = "coverage-6.5.0-cp39-cp39-win32.whl", hash = "sha256:d9ecf0829c6a62b9b573c7bb6d4dcd6ba8b6f80be9ba4fc7ed50bf4ac9aecd72"}, - {file = "coverage-6.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc2af30ed0d5ae0b1abdb4ebdce598eafd5b35397d4d75deb341a614d333d987"}, - {file = "coverage-6.5.0-pp36.pp37.pp38-none-any.whl", hash = "sha256:1431986dac3923c5945271f169f59c45b8802a114c8f548d611f2015133df77a"}, - {file = "coverage-6.5.0.tar.gz", hash = "sha256:f642e90754ee3e06b0e7e51bce3379590e76b7f76b708e1a71ff043f87025c84"}, + {file = "coverage-7.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:496b86f1fc9c81a1cd53d8842ef712e950a4611bba0c42d33366a7b91ba969ec"}, + {file = "coverage-7.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fbe6e8c0a9a7193ba10ee52977d4d5e7652957c1f56ccefed0701db8801a2a3b"}, + {file = "coverage-7.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76d06b721c2550c01a60e5d3093f417168658fb454e5dfd9a23570e9bffe39a1"}, + {file = "coverage-7.2.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:77a04b84d01f0e12c66f16e69e92616442dc675bbe51b90bfb074b1e5d1c7fbd"}, + {file = "coverage-7.2.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:35db06450272473eab4449e9c2ad9bc6a0a68dab8e81a0eae6b50d9c2838767e"}, + {file = "coverage-7.2.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6727a0d929ff0028b1ed8b3e7f8701670b1d7032f219110b55476bb60c390bfb"}, + {file = "coverage-7.2.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:aac1d5fdc5378f6bac2c0c7ebe7635a6809f5b4376f6cf5d43243c1917a67087"}, + {file = "coverage-7.2.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1c9e4a5eb1bbc3675ee57bc31f8eea4cd7fb0cbcbe4912cf1cb2bf3b754f4a80"}, + {file = "coverage-7.2.6-cp310-cp310-win32.whl", hash = "sha256:71f739f97f5f80627f1fee2331e63261355fd1e9a9cce0016394b6707ac3f4ec"}, + {file = "coverage-7.2.6-cp310-cp310-win_amd64.whl", hash = "sha256:fde5c7a9d9864d3e07992f66767a9817f24324f354caa3d8129735a3dc74f126"}, + {file = "coverage-7.2.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bc7b667f8654376e9353dd93e55e12ce2a59fb6d8e29fce40de682273425e044"}, + {file = "coverage-7.2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:697f4742aa3f26c107ddcb2b1784a74fe40180014edbd9adaa574eac0529914c"}, + {file = "coverage-7.2.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:541280dde49ce74a4262c5e395b48ea1207e78454788887118c421cb4ffbfcac"}, + {file = "coverage-7.2.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e7f1a8328eeec34c54f1d5968a708b50fc38d31e62ca8b0560e84a968fbf9a9"}, + {file = "coverage-7.2.6-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:4bbd58eb5a2371bf160590f4262109f66b6043b0b991930693134cb617bc0169"}, + {file = "coverage-7.2.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ae82c5f168d2a39a5d69a12a69d4dc23837a43cf2ca99be60dfe59996ea6b113"}, + {file = "coverage-7.2.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f5440cdaf3099e7ab17a5a7065aed59aff8c8b079597b61c1f8be6f32fe60636"}, + {file = "coverage-7.2.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a6f03f87fea579d55e0b690d28f5042ec1368650466520fbc400e7aeaf09e995"}, + {file = "coverage-7.2.6-cp311-cp311-win32.whl", hash = "sha256:dc4d5187ef4d53e0d4c8eaf530233685667844c5fb0b855fea71ae659017854b"}, + {file = "coverage-7.2.6-cp311-cp311-win_amd64.whl", hash = "sha256:c93d52c3dc7b9c65e39473704988602300e3cc1bad08b5ab5b03ca98bbbc68c1"}, + {file = "coverage-7.2.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:42c692b55a647a832025a4c048007034fe77b162b566ad537ce65ad824b12a84"}, + {file = "coverage-7.2.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7786b2fa7809bf835f830779ad285215a04da76293164bb6745796873f0942d"}, + {file = "coverage-7.2.6-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25bad4196104761bc26b1dae9b57383826542ec689ff0042f7f4f4dd7a815cba"}, + {file = "coverage-7.2.6-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2692306d3d4cb32d2cceed1e47cebd6b1d2565c993d6d2eda8e6e6adf53301e6"}, + {file = "coverage-7.2.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:392154d09bd4473b9d11351ab5d63391f3d5d24d752f27b3be7498b0ee2b5226"}, + {file = "coverage-7.2.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:fa079995432037b5e2ef5ddbb270bcd2ded9f52b8e191a5de11fe59a00ea30d8"}, + {file = "coverage-7.2.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d712cefff15c712329113b01088ba71bbcef0f7ea58478ca0bbec63a824844cb"}, + {file = "coverage-7.2.6-cp37-cp37m-win32.whl", hash = "sha256:004948e296149644d208964300cb3d98affc5211e9e490e9979af4030b0d6473"}, + {file = "coverage-7.2.6-cp37-cp37m-win_amd64.whl", hash = "sha256:c1d7a31603c3483ac49c1726723b0934f88f2c011c660e6471e7bd735c2fa110"}, + {file = "coverage-7.2.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3436927d1794fa6763b89b60c896f9e3bd53212001026ebc9080d23f0c2733c1"}, + {file = "coverage-7.2.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:44c9b9f1a245f3d0d202b1a8fa666a80b5ecbe4ad5d0859c0fb16a52d9763224"}, + {file = "coverage-7.2.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e3783a286d5a93a2921396d50ce45a909aa8f13eee964465012f110f0cbb611"}, + {file = "coverage-7.2.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3cff6980fe7100242170092bb40d2b1cdad79502cd532fd26b12a2b8a5f9aee0"}, + {file = "coverage-7.2.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c534431153caffc7c495c3eddf7e6a6033e7f81d78385b4e41611b51e8870446"}, + {file = "coverage-7.2.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:3062fd5c62df988cea9f2972c593f77fed1182bfddc5a3b12b1e606cb7aba99e"}, + {file = "coverage-7.2.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6284a2005e4f8061c58c814b1600ad0074ccb0289fe61ea709655c5969877b70"}, + {file = "coverage-7.2.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:97729e6828643f168a2a3f07848e1b1b94a366b13a9f5aba5484c2215724edc8"}, + {file = 
"coverage-7.2.6-cp38-cp38-win32.whl", hash = "sha256:dc11b42fa61ff1e788dd095726a0aed6aad9c03d5c5984b54cb9e1e67b276aa5"}, + {file = "coverage-7.2.6-cp38-cp38-win_amd64.whl", hash = "sha256:cbcc874f454ee51f158afd604a315f30c0e31dff1d5d5bf499fc529229d964dd"}, + {file = "coverage-7.2.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d3cacc6a665221108ecdf90517a8028d07a2783df3417d12dcfef1c517e67478"}, + {file = "coverage-7.2.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:272ab31228a9df857ab5df5d67936d8861464dc89c5d3fab35132626e9369379"}, + {file = "coverage-7.2.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a8723ccec4e564d4b9a79923246f7b9a8de4ec55fa03ec4ec804459dade3c4f"}, + {file = "coverage-7.2.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5906f6a84b47f995cd1bf0aca1c72d591c55ee955f98074e93660d64dfc66eb9"}, + {file = "coverage-7.2.6-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52c139b7ab3f0b15f9aad0a3fedef5a1f8c0b2bdc291d88639ca2c97d3682416"}, + {file = "coverage-7.2.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a5ffd45c6b93c23a8507e2f436983015c6457aa832496b6a095505ca2f63e8f1"}, + {file = "coverage-7.2.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:4f3c7c19581d471af0e9cb49d928172cd8492cd78a2b7a4e82345d33662929bb"}, + {file = "coverage-7.2.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2e8c0e79820cdd67978e1120983786422d279e07a381dbf89d03bbb23ec670a6"}, + {file = "coverage-7.2.6-cp39-cp39-win32.whl", hash = "sha256:13cde6bb0e58fb67d09e2f373de3899d1d1e866c5a9ff05d93615f2f54fbd2bb"}, + {file = "coverage-7.2.6-cp39-cp39-win_amd64.whl", hash = "sha256:6b9f64526286255735847aed0221b189486e0b9ed943446936e41b7e44b08783"}, + {file = "coverage-7.2.6-pp37.pp38.pp39-none-any.whl", hash = "sha256:6babcbf1e66e46052442f10833cfc4a0d3554d8276aa37af8531a83ed3c1a01d"}, + {file = "coverage-7.2.6.tar.gz", hash = "sha256:2025f913f2edb0272ef15d00b1f335ff8908c921c8eb2013536fcaf61f5a683d"}, ] [package.dependencies] @@ -757,53 +740,33 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli"] -[[package]] -name = "coveralls" -version = "3.3.1" -description = "Show coverage stats online via coveralls.io" -category = "dev" -optional = false -python-versions = ">= 3.5" -files = [ - {file = "coveralls-3.3.1-py2.py3-none-any.whl", hash = "sha256:f42015f31d386b351d4226389b387ae173207058832fbf5c8ec4b40e27b16026"}, - {file = "coveralls-3.3.1.tar.gz", hash = "sha256:b32a8bb5d2df585207c119d6c01567b81fba690c9c10a753bfe27a335bfc43ea"}, -] - -[package.dependencies] -coverage = ">=4.1,<6.0.0 || >6.1,<6.1.1 || >6.1.1,<7.0" -docopt = ">=0.6.1" -requests = ">=1.0.0" - -[package.extras] -yaml = ["PyYAML (>=3.10)"] - [[package]] name = "cryptography" -version = "40.0.1" +version = "40.0.2" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
-category = "dev" +category = "main" optional = false python-versions = ">=3.6" files = [ - {file = "cryptography-40.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:918cb89086c7d98b1b86b9fdb70c712e5a9325ba6f7d7cfb509e784e0cfc6917"}, - {file = "cryptography-40.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9618a87212cb5200500e304e43691111570e1f10ec3f35569fdfcd17e28fd797"}, - {file = "cryptography-40.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a4805a4ca729d65570a1b7cac84eac1e431085d40387b7d3bbaa47e39890b88"}, - {file = "cryptography-40.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63dac2d25c47f12a7b8aa60e528bfb3c51c5a6c5a9f7c86987909c6c79765554"}, - {file = "cryptography-40.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0a4e3406cfed6b1f6d6e87ed243363652b2586b2d917b0609ca4f97072994405"}, - {file = "cryptography-40.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1e0af458515d5e4028aad75f3bb3fe7a31e46ad920648cd59b64d3da842e4356"}, - {file = "cryptography-40.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d8aa3609d337ad85e4eb9bb0f8bcf6e4409bfb86e706efa9a027912169e89122"}, - {file = "cryptography-40.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:cf91e428c51ef692b82ce786583e214f58392399cf65c341bc7301d096fa3ba2"}, - {file = "cryptography-40.0.1-cp36-abi3-win32.whl", hash = "sha256:650883cc064297ef3676b1db1b7b1df6081794c4ada96fa457253c4cc40f97db"}, - {file = "cryptography-40.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:a805a7bce4a77d51696410005b3e85ae2839bad9aa38894afc0aa99d8e0c3160"}, - {file = "cryptography-40.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cd033d74067d8928ef00a6b1327c8ea0452523967ca4463666eeba65ca350d4c"}, - {file = "cryptography-40.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d36bbeb99704aabefdca5aee4eba04455d7a27ceabd16f3b3ba9bdcc31da86c4"}, - {file = "cryptography-40.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:32057d3d0ab7d4453778367ca43e99ddb711770477c4f072a51b3ca69602780a"}, - {file = "cryptography-40.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:f5d7b79fa56bc29580faafc2ff736ce05ba31feaa9d4735048b0de7d9ceb2b94"}, - {file = "cryptography-40.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7c872413353c70e0263a9368c4993710070e70ab3e5318d85510cc91cce77e7c"}, - {file = "cryptography-40.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:28d63d75bf7ae4045b10de5413fb1d6338616e79015999ad9cf6fc538f772d41"}, - {file = "cryptography-40.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6f2bbd72f717ce33100e6467572abaedc61f1acb87b8d546001328d7f466b778"}, - {file = "cryptography-40.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cc3a621076d824d75ab1e1e530e66e7e8564e357dd723f2533225d40fe35c60c"}, - {file = "cryptography-40.0.1.tar.gz", hash = "sha256:2803f2f8b1e95f614419926c7e6f55d828afc614ca5ed61543877ae668cc3472"}, + {file = "cryptography-40.0.2-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:8f79b5ff5ad9d3218afb1e7e20ea74da5f76943ee5edb7f76e56ec5161ec782b"}, + {file = "cryptography-40.0.2-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:05dc219433b14046c476f6f09d7636b92a1c3e5808b9a6536adf4932b3b2c440"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4df2af28d7bedc84fe45bd49bc35d710aede676e2a4cb7fc6d103a2adc8afe4d"}, + {file = 
"cryptography-40.0.2-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dcca15d3a19a66e63662dc8d30f8036b07be851a8680eda92d079868f106288"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:a04386fb7bc85fab9cd51b6308633a3c271e3d0d3eae917eebab2fac6219b6d2"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:adc0d980fd2760c9e5de537c28935cc32b9353baaf28e0814df417619c6c8c3b"}, + {file = "cryptography-40.0.2-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d5a1bd0e9e2031465761dfa920c16b0065ad77321d8a8c1f5ee331021fda65e9"}, + {file = "cryptography-40.0.2-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:a95f4802d49faa6a674242e25bfeea6fc2acd915b5e5e29ac90a32b1139cae1c"}, + {file = "cryptography-40.0.2-cp36-abi3-win32.whl", hash = "sha256:aecbb1592b0188e030cb01f82d12556cf72e218280f621deed7d806afd2113f9"}, + {file = "cryptography-40.0.2-cp36-abi3-win_amd64.whl", hash = "sha256:b12794f01d4cacfbd3177b9042198f3af1c856eedd0a98f10f141385c809a14b"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:142bae539ef28a1c76794cca7f49729e7c54423f615cfd9b0b1fa90ebe53244b"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:956ba8701b4ffe91ba59665ed170a2ebbdc6fc0e40de5f6059195d9f2b33ca0e"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4f01c9863da784558165f5d4d916093737a75203a5c5286fde60e503e4276c7a"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3daf9b114213f8ba460b829a02896789751626a2a4e7a43a28ee77c04b5e4958"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48f388d0d153350f378c7f7b41497a54ff1513c816bcbbcafe5b829e59b9ce5b"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c0764e72b36a3dc065c155e5b22f93df465da9c39af65516fe04ed3c68c92636"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:cbaba590180cba88cb99a5f76f90808a624f18b169b90a4abb40c1fd8c19420e"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7a38250f433cd41df7fcb763caa3ee9362777fdb4dc642b9a349721d2bf47404"}, + {file = "cryptography-40.0.2.tar.gz", hash = "sha256:c33c0d32b8594fa647d2e01dbccc303478e16fdd7cf98652d5b3ed11aa5e5c99"}, ] [package.dependencies] @@ -821,14 +784,14 @@ tox = ["tox"] [[package]] name = "dcicutils" -version = "7.0.0" +version = "7.5.1" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" category = "main" optional = false python-versions = ">=3.7,<3.10" files = [ - {file = "dcicutils-7.0.0-py3-none-any.whl", hash = "sha256:a18ec1685761bee73747dcfbf8633a2ff62d9af91ff56c510bacc202023fc3b9"}, - {file = "dcicutils-7.0.0.tar.gz", hash = "sha256:36eb1025b27f3f466df8e59e8a36260be547a1a46d6995d4fe8d954cc482fb82"}, + {file = "dcicutils-7.5.1-py3-none-any.whl", hash = "sha256:5c77b00116c3d1f9f413b30aa66865a9c50d88d02ea20349a4bb25e1e45eea85"}, + {file = "dcicutils-7.5.1.tar.gz", hash = "sha256:c4211161f48ce727e9cd3e2b52f66336949c5a6ce6d518c908dfee9153f7b121"}, ] [package.dependencies] @@ -840,6 +803,7 @@ elasticsearch = "7.13.4" gitpython = ">=3.1.2,<4.0.0" opensearch-py = ">=2.0.1,<3.0.0" PyJWT = ">=2.6.0,<3.0.0" +pyOpenSSL = ">=23.1.1,<24.0.0" pytz = ">=2020.4" PyYAML = ">=5.1,<5.5" redis = ">=4.5.1,<5.0.0" @@ -847,6 +811,7 @@ requests = ">=2.21.0,<3.0.0" rfc3986 = ">=1.4.0,<2.0.0" 
structlog = ">=19.2.0,<20.0.0" toml = ">=0.10.1,<1" +tqdm = ">=4.65.0,<5.0.0" typing-extensions = ">=3.8" urllib3 = ">=1.26.6,<2.0.0" webtest = ">=2.0.34,<3.0.0" @@ -873,27 +838,16 @@ websocket-client = ">=0.32.0" ssh = ["paramiko (>=2.4.2)"] tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] -[[package]] -name = "docopt" -version = "0.6.2" -description = "Pythonic argument parser, that will make you smile" -category = "dev" -optional = false -python-versions = "*" -files = [ - {file = "docopt-0.6.2.tar.gz", hash = "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491"}, -] - [[package]] name = "docutils" -version = "0.19" +version = "0.20.1" description = "Docutils -- Python Documentation Utilities" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "docutils-0.19-py3-none-any.whl", hash = "sha256:5e1de4d849fee02c63b040a4a3fd567f4ab104defd8a5511fbbc24a8a017efbc"}, - {file = "docutils-0.19.tar.gz", hash = "sha256:33995a6753c30b7f577febfc2c50411fec6aac7f7ffeb7c4cfe5991072dcf9e6"}, + {file = "docutils-0.20.1-py3-none-any.whl", hash = "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6"}, + {file = "docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b"}, ] [[package]] @@ -942,7 +896,7 @@ develop = ["coverage (<5.0.0)", "mock", "pytest (>=3.0.0)", "pytest-cov", "pytes name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1163,7 +1117,7 @@ files = [ name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1303,7 +1257,7 @@ files = [ name = "mirakuru" version = "2.5.1" description = "Process executor (not only) for tests." 
-category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1316,14 +1270,14 @@ psutil = {version = ">=4.0.0", markers = "sys_platform != \"cygwin\""} [[package]] name = "moto" -version = "4.1.6" +version = "4.1.9" description = "" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "moto-4.1.6-py2.py3-none-any.whl", hash = "sha256:cfe398a1f6e317d061c47c3d2dd8c6893f3eb49154984a7cbb8bcd4ba517d67d"}, - {file = "moto-4.1.6.tar.gz", hash = "sha256:fdcc2731212ca050a28b2bc83e87628294bcbd55cb4f4c4692f972023fb1e7e6"}, + {file = "moto-4.1.9-py2.py3-none-any.whl", hash = "sha256:d9f5d0e3d027df350ff3552da851644ce192cbf7e7a9e8766fca4b5b6b550df0"}, + {file = "moto-4.1.9.tar.gz", hash = "sha256:d4bb629686b8b92e480f9784316bd0f379b148a5caee7c07aecbde6033a885e1"}, ] [package.dependencies] @@ -1338,17 +1292,17 @@ werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "py-partiql-parser (==0.1.0)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "py-partiql-parser (==0.3.0)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] apigatewayv2 = ["PyYAML (>=5.1)"] appsync = ["graphql-core"] awslambda = ["docker (>=3.0.0)"] batch = ["docker (>=3.0.0)"] -cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "py-partiql-parser (==0.1.0)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "py-partiql-parser (==0.3.0)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] ds = ["sshpubkeys (>=3.1.0)"] -dynamodb = ["docker (>=3.0.0)"] -dynamodbstreams = ["docker (>=3.0.0)"] +dynamodb = ["docker (>=3.0.0)", "py-partiql-parser (==0.3.0)"] +dynamodbstreams = ["docker (>=3.0.0)", "py-partiql-parser (==0.3.0)"] ebs = ["sshpubkeys (>=3.1.0)"] ec2 = ["sshpubkeys (>=3.1.0)"] efs = ["sshpubkeys (>=3.1.0)"] @@ -1356,8 +1310,8 @@ eks = ["sshpubkeys (>=3.1.0)"] glue = ["pyparsing (>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] route53resolver = ["sshpubkeys (>=3.1.0)"] -s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.1.0)"] -server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "py-partiql-parser (==0.1.0)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys 
(>=3.1.0)"] +s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.3.0)"] +server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "py-partiql-parser (==0.3.0)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] ssm = ["PyYAML (>=5.1)"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] @@ -1400,14 +1354,14 @@ kerberos = ["requests-kerberos"] [[package]] name = "packaging" -version = "23.0" +version = "23.1" description = "Core utilities for Python packages" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, - {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, + {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, + {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] [[package]] @@ -1430,26 +1384,24 @@ totp = ["cryptography"] [[package]] name = "pastedeploy" -version = "3.0.1" +version = "1.5.2" description = "Load, configure, and compose WSGI applications and servers" category = "main" optional = false -python-versions = ">=3.7" +python-versions = "*" files = [ - {file = "PasteDeploy-3.0.1-py3-none-any.whl", hash = "sha256:6195c921b1c3ed9722e4e3e6aa29b70deebb2429b4ca3ff3d49185c8e80003bb"}, - {file = "PasteDeploy-3.0.1.tar.gz", hash = "sha256:5f4b4d5fddd39b8947ea727161e366bf55b90efc60a4d1dd7976b9031d0b4e5f"}, + {file = "PasteDeploy-1.5.2-py2.py3-none-any.whl", hash = "sha256:39973e73f391335fac8bc8a8a95f7d34a9f42e2775600ce2dc518d93b37ef943"}, + {file = "PasteDeploy-1.5.2.tar.gz", hash = "sha256:d5858f89a255e6294e63ed46b73613c56e3b9a2d82a42f1df4d06c8421a9e3cb"}, ] [package.extras] -docs = ["Sphinx (>=1.7.5)", "pylons-sphinx-themes"] paste = ["Paste"] -testing = ["Paste", "pytest", "pytest-cov"] [[package]] name = "pillow" version = "9.5.0" description = "Python Imaging Library (Fork)" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1525,70 +1477,55 @@ files = [ docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"] tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] -[[package]] -name = "pip-licenses" -version = "3.5.5" -description = "Dump the software license list of Python packages installed with pip." -category = "dev" -optional = false -python-versions = "~=3.7" -files = [ - {file = "pip-licenses-3.5.5.tar.gz", hash = "sha256:748cfd7aca6e05032f9fa85691301295f4d943e87955be6914ca49abe3c075a4"}, - {file = "pip_licenses-3.5.5-py3-none-any.whl", hash = "sha256:6129c116bab2b202d90d6e3a96092df4ad84c0c4d57bb70192fc03f8bf06d181"}, -] - -[package.dependencies] -PTable = "*" - -[package.extras] -test = ["docutils", "pytest-cov", "pytest-pycodestyle", "pytest-runner"] - [[package]] name = "pipdeptree" -version = "2.7.0" +version = "2.7.1" description = "Command line utility to show dependency tree of packages." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "pipdeptree-2.7.0-py3-none-any.whl", hash = "sha256:f1ed934abb3f5e561ae22118d93d45132d174b94a3664396a4a3f99494f79028"}, - {file = "pipdeptree-2.7.0.tar.gz", hash = "sha256:1c79e28267ddf90ea2293f982db4f5df7a76befca483c68da6c83c4370989e8d"}, + {file = "pipdeptree-2.7.1-py3-none-any.whl", hash = "sha256:bb0ffa98a49b0b4076364b367d1df37fcf6628ec3b5cbb61cf4bbaedc7502db0"}, + {file = "pipdeptree-2.7.1.tar.gz", hash = "sha256:550bd7679379e7290739384f3e9518835620e814cc29ba709513952b627da506"}, ] [package.extras] graphviz = ["graphviz (>=0.20.1)"] -test = ["covdefaults (>=2.3)", "diff-cover (>=7.5)", "pip (>=23.0.1)", "pytest (>=7.2.2)", "pytest-cov (>=4)", "pytest-mock (>=3.10)", "virtualenv (>=20.21,<21)"] +test = ["covdefaults (>=2.3)", "diff-cover (>=7.5)", "pip (>=23.1)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)", "virtualenv (>=20.21,<21)"] [[package]] name = "plaster" -version = "1.1.2" +version = "1.0" description = "A loader interface around multiple config file formats." category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*" files = [ - {file = "plaster-1.1.2-py2.py3-none-any.whl", hash = "sha256:42992ab1f4865f1278e2ad740e8ad145683bb4022e03534265528f0c23c0df2d"}, - {file = "plaster-1.1.2.tar.gz", hash = "sha256:f8befc54bf8c1147c10ab40297ec84c2676fa2d4ea5d6f524d9436a80074ef98"}, + {file = "plaster-1.0-py2.py3-none-any.whl", hash = "sha256:215c921a438b5349931fd7df9a5a11a3572947f20f4bc6dd622ac08f1c3ba249"}, + {file = "plaster-1.0.tar.gz", hash = "sha256:8351c7c7efdf33084c1de88dd0f422cbe7342534537b553c49b857b12d98c8c3"}, ] +[package.dependencies] +setuptools = "*" + [package.extras] docs = ["Sphinx", "pylons-sphinx-themes"] testing = ["pytest", "pytest-cov"] [[package]] name = "plaster-pastedeploy" -version = "1.0.1" +version = "0.6" description = "A loader implementing the PasteDeploy syntax to be used by plaster." category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*" files = [ - {file = "plaster_pastedeploy-1.0.1-py2.py3-none-any.whl", hash = "sha256:ad3550cc744648969ed3b810f33c9344f515ee8d8a8cec18e8f2c4a643c2181f"}, - {file = "plaster_pastedeploy-1.0.1.tar.gz", hash = "sha256:be262e6d2e41a7264875daa2fe2850cbb0615728bcdc92828fdc72736e381412"}, + {file = "plaster_pastedeploy-0.6-py2.py3-none-any.whl", hash = "sha256:71e29b0ab90df8343bca5f0debe4706f0f8147308a78922c8c26e8252809bce4"}, + {file = "plaster_pastedeploy-0.6.tar.gz", hash = "sha256:c231130cb86ae414084008fe1d1797db7e61dc5eaafb5e755de21387c27c6fae"}, ] [package.dependencies] -PasteDeploy = ">=2.0" +PasteDeploy = ">=1.5.0" plaster = ">=0.5" [package.extras] @@ -1598,7 +1535,7 @@ testing = ["pytest", "pytest-cov"] name = "pluggy" version = "1.0.0" description = "plugin and hook calling mechanisms for python" -category = "main" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1614,7 +1551,7 @@ testing = ["pytest", "pytest-benchmark"] name = "port-for" version = "0.6.3" description = "Utility that helps with local TCP ports management. It can find an unused TCP localhost port and remember the association." -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1624,26 +1561,26 @@ files = [ [[package]] name = "psutil" -version = "5.9.4" +version = "5.9.5" description = "Cross-platform lib for process and system monitoring in Python." 
category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ - {file = "psutil-5.9.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c1ca331af862803a42677c120aff8a814a804e09832f166f226bfd22b56feee8"}, - {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:68908971daf802203f3d37e78d3f8831b6d1014864d7a85937941bb35f09aefe"}, - {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:3ff89f9b835100a825b14c2808a106b6fdcc4b15483141482a12c725e7f78549"}, - {file = "psutil-5.9.4-cp27-cp27m-win32.whl", hash = "sha256:852dd5d9f8a47169fe62fd4a971aa07859476c2ba22c2254d4a1baa4e10b95ad"}, - {file = "psutil-5.9.4-cp27-cp27m-win_amd64.whl", hash = "sha256:9120cd39dca5c5e1c54b59a41d205023d436799b1c8c4d3ff71af18535728e94"}, - {file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6b92c532979bafc2df23ddc785ed116fced1f492ad90a6830cf24f4d1ea27d24"}, - {file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:efeae04f9516907be44904cc7ce08defb6b665128992a56957abc9b61dca94b7"}, - {file = "psutil-5.9.4-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:54d5b184728298f2ca8567bf83c422b706200bcbbfafdc06718264f9393cfeb7"}, - {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:16653106f3b59386ffe10e0bad3bb6299e169d5327d3f187614b1cb8f24cf2e1"}, - {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54c0d3d8e0078b7666984e11b12b88af2db11d11249a8ac8920dd5ef68a66e08"}, - {file = "psutil-5.9.4-cp36-abi3-win32.whl", hash = "sha256:149555f59a69b33f056ba1c4eb22bb7bf24332ce631c44a319cec09f876aaeff"}, - {file = "psutil-5.9.4-cp36-abi3-win_amd64.whl", hash = "sha256:fd8522436a6ada7b4aad6638662966de0d61d241cb821239b2ae7013d41a43d4"}, - {file = "psutil-5.9.4-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6001c809253a29599bc0dfd5179d9f8a5779f9dffea1da0f13c53ee568115e1e"}, - {file = "psutil-5.9.4.tar.gz", hash = "sha256:3d7f9739eb435d4b1338944abe23f49584bde5395f27487d2ee25ad9a8774a62"}, + {file = "psutil-5.9.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:be8929ce4313f9f8146caad4272f6abb8bf99fc6cf59344a3167ecd74f4f203f"}, + {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ab8ed1a1d77c95453db1ae00a3f9c50227ebd955437bcf2a574ba8adbf6a74d5"}, + {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:4aef137f3345082a3d3232187aeb4ac4ef959ba3d7c10c33dd73763fbc063da4"}, + {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:ea8518d152174e1249c4f2a1c89e3e6065941df2fa13a1ab45327716a23c2b48"}, + {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:acf2aef9391710afded549ff602b5887d7a2349831ae4c26be7c807c0a39fac4"}, + {file = "psutil-5.9.5-cp27-none-win32.whl", hash = "sha256:5b9b8cb93f507e8dbaf22af6a2fd0ccbe8244bf30b1baad6b3954e935157ae3f"}, + {file = "psutil-5.9.5-cp27-none-win_amd64.whl", hash = "sha256:8c5f7c5a052d1d567db4ddd231a9d27a74e8e4a9c3f44b1032762bd7b9fdcd42"}, + {file = "psutil-5.9.5-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3c6f686f4225553615612f6d9bc21f1c0e305f75d7d8454f9b46e901778e7217"}, + {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a7dd9997128a0d928ed4fb2c2d57e5102bb6089027939f3b722f3a210f9a8da"}, + {file = 
"psutil-5.9.5-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89518112647f1276b03ca97b65cc7f64ca587b1eb0278383017c2a0dcc26cbe4"}, + {file = "psutil-5.9.5-cp36-abi3-win32.whl", hash = "sha256:104a5cc0e31baa2bcf67900be36acde157756b9c44017b86b2c049f11957887d"}, + {file = "psutil-5.9.5-cp36-abi3-win_amd64.whl", hash = "sha256:b258c0c1c9d145a1d5ceffab1134441c4c5113b2417fafff7315a917a026c3c9"}, + {file = "psutil-5.9.5-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:c607bb3b57dc779d55e1554846352b4e358c10fff3abf3514a7a6601beebdb30"}, + {file = "psutil-5.9.5.tar.gz", hash = "sha256:5410638e4df39c54d957fc51ce03048acd8e6d60abc0f5107af51e5fb566eb3c"}, ] [package.extras] @@ -1721,17 +1658,6 @@ files = [ {file = "psycopg2_binary-2.9.6-cp39-cp39-win_amd64.whl", hash = "sha256:f6a88f384335bb27812293fdb11ac6aee2ca3f51d3c7820fe03de0a304ab6249"}, ] -[[package]] -name = "ptable" -version = "0.9.2" -description = "A simple Python library for easily displaying tabular data in a visually appealing ASCII table format" -category = "dev" -optional = false -python-versions = "*" -files = [ - {file = "PTable-0.9.2.tar.gz", hash = "sha256:aa7fc151cb40f2dabcd2275ba6f7fd0ff8577a86be3365cd3fb297cbe09cc292"}, -] - [[package]] name = "pybrowserid" version = "0.14.0" @@ -1763,7 +1689,7 @@ files = [ name = "pycparser" version = "2.21" description = "C parser in Python" -category = "dev" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1785,14 +1711,14 @@ files = [ [[package]] name = "pyjwt" -version = "2.6.0" +version = "2.7.0" description = "JSON Web Token implementation in Python" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "PyJWT-2.6.0-py3-none-any.whl", hash = "sha256:d83c3d892a77bbb74d3e1a2cfa90afaadb60945205d1095d9221f04466f64c14"}, - {file = "PyJWT-2.6.0.tar.gz", hash = "sha256:69285c7e31fc44f68a1feb309e948e0df53259d579295e6cfe2b1792329f05fd"}, + {file = "PyJWT-2.7.0-py3-none-any.whl", hash = "sha256:ba2b425b15ad5ef12f200dc67dd56af4e26de2331f965c5439994dad075876e1"}, + {file = "PyJWT-2.7.0.tar.gz", hash = "sha256:bd6ca4a3c4285c1a2d4349e5a035fdf8fb94e04ccd0fcbe6ba289dae9cc3e074"}, ] [package.extras] @@ -1801,6 +1727,25 @@ dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pyte docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] +[[package]] +name = "pyopenssl" +version = "23.1.1" +description = "Python wrapper module around the OpenSSL library" +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pyOpenSSL-23.1.1-py3-none-any.whl", hash = "sha256:9e0c526404a210df9d2b18cd33364beadb0dc858a739b885677bc65e105d4a4c"}, + {file = "pyOpenSSL-23.1.1.tar.gz", hash = "sha256:841498b9bec61623b1b6c47ebbc02367c07d60e0e195f19790817f10cc8db0b7"}, +] + +[package.dependencies] +cryptography = ">=38.0.0,<41" + +[package.extras] +docs = ["sphinx (!=5.2.0,!=5.2.0.post0)", "sphinx-rtd-theme"] +test = ["flaky", "pretend", "pytest (>=3.0.1)"] + [[package]] name = "pyparsing" version = "3.0.9" @@ -1843,20 +1788,6 @@ webob = ">=1.8.3" docs = ["Sphinx (>=1.8.1)", "docutils", "pylons-sphinx-latesturl", "pylons-sphinx-themes (>=1.0.8)", "repoze.sphinx.autointerface", "sphinxcontrib-autoprogram"] testing = ["coverage", "nose", "virtualenv", "webtest (>=1.3.1)", "zope.component (>=4.0)"] -[[package]] -name = "pyramid-localroles" -version = 
"0.1" -description = "Local roles authorization policy for Pyramid" -category = "main" -optional = false -python-versions = "*" -files = [ - {file = "pyramid_localroles-0.1.zip", hash = "sha256:4d297d06677d54471cbca9f2744e9224528204cf83045a2a19b6f84707d0ef1b"}, -] - -[package.dependencies] -pyramid = "*" - [[package]] name = "pyramid-multiauth" version = "0.9.0" @@ -1928,18 +1859,17 @@ setuptools = "*" [[package]] name = "pytest" -version = "7.2.2" +version = "7.3.1" description = "pytest: simple powerful testing with Python" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.2.2-py3-none-any.whl", hash = "sha256:130328f552dcfac0b1cec75c12e3f005619dc5f874f0a06e8ff7263f0ee6225e"}, - {file = "pytest-7.2.2.tar.gz", hash = "sha256:c99ab0c73aceb050f68929bc93af19ab6db0558791c6a0715723abe9d0ade9d4"}, + {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, + {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, ] [package.dependencies] -attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" @@ -1948,18 +1878,18 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] [[package]] name = "pytest-cov" -version = "4.0.0" +version = "4.1.0" description = "Pytest plugin for measuring coverage." category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "pytest-cov-4.0.0.tar.gz", hash = "sha256:996b79efde6433cdbd0088872dbc5fb3ed7fe1578b68cdbba634f14bb8dd0470"}, - {file = "pytest_cov-4.0.0-py3-none-any.whl", hash = "sha256:2feb1b751d66a8bd934e5edfa2e961d11309dc37b73b0eabe73b5945fee20f6b"}, + {file = "pytest-cov-4.1.0.tar.gz", hash = "sha256:3904b13dfbfec47f003b8e77fd5b589cd11904a21ddf1ab38a64f204d6a10ef6"}, + {file = "pytest_cov-4.1.0-py3-none-any.whl", hash = "sha256:6ba70b9e97e69fcc3fb45bfeab2d0a138fb65c4d0d6a41ef33983ad114be8c3a"}, ] [package.dependencies] @@ -2006,7 +1936,7 @@ dev = ["pre-commit", "pytest-asyncio", "tox"] name = "pytest-redis" version = "2.4.0" description = "Redis fixtures and fixture factories for Pytest." 
-category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -2191,18 +2121,18 @@ rdflib = "*" [[package]] name = "redis" -version = "4.5.4" +version = "4.5.5" description = "Python client for Redis database and key-value store" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "redis-4.5.4-py3-none-any.whl", hash = "sha256:2c19e6767c474f2e85167909061d525ed65bea9301c0770bb151e041b7ac89a2"}, - {file = "redis-4.5.4.tar.gz", hash = "sha256:73ec35da4da267d6847e47f68730fdd5f62e2ca69e3ef5885c6a78a9374c3893"}, + {file = "redis-4.5.5-py3-none-any.whl", hash = "sha256:77929bc7f5dab9adf3acba2d3bb7d7658f1e0c2f1cafe7eb36434e751c471119"}, + {file = "redis-4.5.5.tar.gz", hash = "sha256:dc87a0bdef6c8bfe1ef1e1c40be7034390c2ae02d92dcd0c7ca1729443899880"}, ] [package.dependencies] -async-timeout = {version = ">=4.0.2", markers = "python_version <= \"3.11.2\""} +async-timeout = {version = ">=4.0.2", markers = "python_full_version <= \"3.11.2\""} [package.extras] hiredis = ["hiredis (>=1.0.0)"] @@ -2229,21 +2159,21 @@ testing = ["WebOb", "coverage", "nose"] [[package]] name = "requests" -version = "2.28.2" +version = "2.31.0" description = "Python HTTP for Humans." category = "main" optional = false -python-versions = ">=3.7, <4" +python-versions = ">=3.7" files = [ - {file = "requests-2.28.2-py3-none-any.whl", hash = "sha256:64299f4909223da747622c030b781c0d7811e359c37124b4bd368fb8c6518baa"}, - {file = "requests-2.28.2.tar.gz", hash = "sha256:98b1b2782e3c6c4904938b84c0eb932721069dfdb9134313beff7c83c2df24bf"}, + {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, + {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, ] [package.dependencies] certifi = ">=2017.4.17" charset-normalizer = ">=2,<4" idna = ">=2.5,<4" -urllib3 = ">=1.21.1,<1.27" +urllib3 = ">=1.21.1,<3" [package.extras] socks = ["PySocks (>=1.5.6,!=1.5.7)"] @@ -2305,14 +2235,14 @@ testing = ["WebTest", "coverage", "pytest", "pytest-cov"] [[package]] name = "s3transfer" -version = "0.6.0" +version = "0.6.1" description = "An Amazon S3 Transfer Manager" category = "main" optional = false python-versions = ">= 3.7" files = [ - {file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"}, - {file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"}, + {file = "s3transfer-0.6.1-py3-none-any.whl", hash = "sha256:3c0da2d074bf35d6870ef157158641178a4204a6e689e82546083e31e0311346"}, + {file = "s3transfer-0.6.1.tar.gz", hash = "sha256:640bb492711f4c0c0905e1f62b6aaeb771881935ad27884852411f8e9cacbca9"}, ] [package.dependencies] @@ -2323,19 +2253,19 @@ crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] [[package]] name = "setuptools" -version = "67.6.1" +version = "67.8.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "setuptools-67.6.1-py3-none-any.whl", hash = "sha256:e728ca814a823bf7bf60162daf9db95b93d532948c4c0bea762ce62f60189078"}, - {file = "setuptools-67.6.1.tar.gz", hash = "sha256:257de92a9d50a60b8e22abfcbb771571fde0dbf3ec234463212027a4eeecbe9a"}, + {file = "setuptools-67.8.0-py3-none-any.whl", hash = "sha256:5df61bf30bb10c6f756eb19e7c9f3b473051f48db77fddbe06ff2ca307df9a6f"}, + {file = "setuptools-67.8.0.tar.gz", hash = 
"sha256:62642358adc77ffa87233bc4d2354c4b2682d214048f500964dbe760ccedf102"}, ] [package.extras] docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] [[package]] @@ -2459,14 +2389,14 @@ files = [ [[package]] name = "soupsieve" -version = "2.4" +version = "2.4.1" description = "A modern CSS selector implementation for Beautiful Soup." category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "soupsieve-2.4-py3-none-any.whl", hash = "sha256:49e5368c2cda80ee7e84da9dbe3e110b70a4575f196efb74e51b94549d921955"}, - {file = "soupsieve-2.4.tar.gz", hash = "sha256:e28dba9ca6c7c00173e34e4ba57448f0688bb681b7c5e8bf4971daafc093d69a"}, + {file = "soupsieve-2.4.1-py3-none-any.whl", hash = "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8"}, + {file = "soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"}, ] [[package]] @@ -2490,53 +2420,53 @@ keepalive = ["keepalive (>=0.5)"] [[package]] name = "sqlalchemy" -version = "1.4.47" +version = "1.4.48" description = "Database Abstraction Library" category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ - {file = "SQLAlchemy-1.4.47-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:dcfb480bfc9e1fab726003ae00a6bfc67a29bad275b63a4e36d17fe7f13a624e"}, - {file = "SQLAlchemy-1.4.47-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:28fda5a69d6182589892422c5a9b02a8fd1125787aab1d83f1392aa955bf8d0a"}, - {file = "SQLAlchemy-1.4.47-cp27-cp27m-win32.whl", hash = "sha256:45e799c1a41822eba6bee4e59b0e38764e1a1ee69873ab2889079865e9ea0e23"}, - {file = "SQLAlchemy-1.4.47-cp27-cp27m-win_amd64.whl", hash = "sha256:10edbb92a9ef611f01b086e271a9f6c1c3e5157c3b0c5ff62310fb2187acbd4a"}, - {file = "SQLAlchemy-1.4.47-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7a4df53472c9030a8ddb1cce517757ba38a7a25699bbcabd57dcc8a5d53f324e"}, - {file = "SQLAlchemy-1.4.47-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:511d4abc823152dec49461209607bbfb2df60033c8c88a3f7c93293b8ecbb13d"}, - {file = "SQLAlchemy-1.4.47-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:dbe57f39f531c5d68d5594ea4613daa60aba33bb51a8cc42f96f17bbd6305e8d"}, - {file = "SQLAlchemy-1.4.47-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ca8ab6748e3ec66afccd8b23ec2f92787a58d5353ce9624dccd770427ee67c82"}, - {file = "SQLAlchemy-1.4.47-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:299b5c5c060b9fbe51808d0d40d8475f7b3873317640b9b7617c7f988cf59fda"}, - {file = "SQLAlchemy-1.4.47-cp310-cp310-win32.whl", hash = "sha256:684e5c773222781775c7f77231f412633d8af22493bf35b7fa1029fdf8066d10"}, - {file = "SQLAlchemy-1.4.47-cp310-cp310-win_amd64.whl", hash = "sha256:2bba39b12b879c7b35cde18b6e14119c5f1a16bd064a48dd2ac62d21366a5e17"}, - {file = "SQLAlchemy-1.4.47-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:795b5b9db573d3ed61fae74285d57d396829e3157642794d3a8f72ec2a5c719b"}, - {file = "SQLAlchemy-1.4.47-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:989c62b96596b7938cbc032e39431e6c2d81b635034571d6a43a13920852fb65"}, - {file = "SQLAlchemy-1.4.47-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3b67bda733da1dcdccaf354e71ef01b46db483a4f6236450d3f9a61efdba35a"}, - {file = "SQLAlchemy-1.4.47-cp311-cp311-win32.whl", hash = "sha256:9a198f690ac12a3a807e03a5a45df6a30cd215935f237a46f4248faed62e69c8"}, - {file = "SQLAlchemy-1.4.47-cp311-cp311-win_amd64.whl", hash = "sha256:03be6f3cb66e69fb3a09b5ea89d77e4bc942f3bf84b207dba84666a26799c166"}, - {file = "SQLAlchemy-1.4.47-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:16ee6fea316790980779268da47a9260d5dd665c96f225d28e7750b0bb2e2a04"}, - {file = "SQLAlchemy-1.4.47-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:557675e0befafa08d36d7a9284e8761c97490a248474d778373fb96b0d7fd8de"}, - {file = "SQLAlchemy-1.4.47-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:bb2797fee8a7914fb2c3dc7de404d3f96eb77f20fc60e9ee38dc6b0ca720f2c2"}, - {file = "SQLAlchemy-1.4.47-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28297aa29e035f29cba6b16aacd3680fbc6a9db682258d5f2e7b49ec215dbe40"}, - {file = "SQLAlchemy-1.4.47-cp36-cp36m-win32.whl", hash = "sha256:998e782c8d9fd57fa8704d149ccd52acf03db30d7dd76f467fd21c1c21b414fa"}, - {file = "SQLAlchemy-1.4.47-cp36-cp36m-win_amd64.whl", hash = "sha256:dde4d02213f1deb49eaaf8be8a6425948963a7af84983b3f22772c63826944de"}, - {file = "SQLAlchemy-1.4.47-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:e98ef1babe34f37f443b7211cd3ee004d9577a19766e2dbacf62fce73c76245a"}, - {file = "SQLAlchemy-1.4.47-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14a3879853208a242b5913f3a17c6ac0eae9dc210ff99c8f10b19d4a1ed8ed9b"}, - {file = "SQLAlchemy-1.4.47-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:7120a2f72599d4fed7c001fa1cbbc5b4d14929436135768050e284f53e9fbe5e"}, - {file = "SQLAlchemy-1.4.47-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:048509d7f3ac27b83ad82fd96a1ab90a34c8e906e4e09c8d677fc531d12c23c5"}, - {file = "SQLAlchemy-1.4.47-cp37-cp37m-win32.whl", hash = "sha256:6572d7c96c2e3e126d0bb27bfb1d7e2a195b68d951fcc64c146b94f088e5421a"}, - {file = "SQLAlchemy-1.4.47-cp37-cp37m-win_amd64.whl", hash = 
"sha256:a6c3929df5eeaf3867724003d5c19fed3f0c290f3edc7911616616684f200ecf"}, - {file = "SQLAlchemy-1.4.47-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:71d4bf7768169c4502f6c2b0709a02a33703544f611810fb0c75406a9c576ee1"}, - {file = "SQLAlchemy-1.4.47-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd45c60cc4f6d68c30d5179e2c2c8098f7112983532897566bb69c47d87127d3"}, - {file = "SQLAlchemy-1.4.47-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0fdbb8e9d4e9003f332a93d6a37bca48ba8095086c97a89826a136d8eddfc455"}, - {file = "SQLAlchemy-1.4.47-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f216a51451a0a0466e082e163591f6dcb2f9ec182adb3f1f4b1fd3688c7582c"}, - {file = "SQLAlchemy-1.4.47-cp38-cp38-win32.whl", hash = "sha256:bd988b3362d7e586ef581eb14771bbb48793a4edb6fcf62da75d3f0f3447060b"}, - {file = "SQLAlchemy-1.4.47-cp38-cp38-win_amd64.whl", hash = "sha256:32ab09f2863e3de51529aa84ff0e4fe89a2cb1bfbc11e225b6dbc60814e44c94"}, - {file = "SQLAlchemy-1.4.47-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:07764b240645627bc3e82596435bd1a1884646bfc0721642d24c26b12f1df194"}, - {file = "SQLAlchemy-1.4.47-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e2a42017984099ef6f56438a6b898ce0538f6fadddaa902870c5aa3e1d82583"}, - {file = "SQLAlchemy-1.4.47-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:6b6d807c76c20b4bc143a49ad47782228a2ac98bdcdcb069da54280e138847fc"}, - {file = "SQLAlchemy-1.4.47-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a94632ba26a666e7be0a7d7cc3f7acab622a04259a3aa0ee50ff6d44ba9df0d"}, - {file = "SQLAlchemy-1.4.47-cp39-cp39-win32.whl", hash = "sha256:f80915681ea9001f19b65aee715115f2ad310730c8043127cf3e19b3009892dd"}, - {file = "SQLAlchemy-1.4.47-cp39-cp39-win_amd64.whl", hash = "sha256:fc700b862e0a859a37faf85367e205e7acaecae5a098794aff52fdd8aea77b12"}, - {file = "SQLAlchemy-1.4.47.tar.gz", hash = "sha256:95fc02f7fc1f3199aaa47a8a757437134cf618e9d994c84effd53f530c38586f"}, + {file = "SQLAlchemy-1.4.48-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:4bac3aa3c3d8bc7408097e6fe8bf983caa6e9491c5d2e2488cfcfd8106f13b6a"}, + {file = "SQLAlchemy-1.4.48-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:dbcae0e528d755f4522cad5842f0942e54b578d79f21a692c44d91352ea6d64e"}, + {file = "SQLAlchemy-1.4.48-cp27-cp27m-win32.whl", hash = "sha256:cbbe8b8bffb199b225d2fe3804421b7b43a0d49983f81dc654d0431d2f855543"}, + {file = "SQLAlchemy-1.4.48-cp27-cp27m-win_amd64.whl", hash = "sha256:627e04a5d54bd50628fc8734d5fc6df2a1aa5962f219c44aad50b00a6cdcf965"}, + {file = "SQLAlchemy-1.4.48-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:9af1db7a287ef86e0f5cd990b38da6bd9328de739d17e8864f1817710da2d217"}, + {file = "SQLAlchemy-1.4.48-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:ce7915eecc9c14a93b73f4e1c9d779ca43e955b43ddf1e21df154184f39748e5"}, + {file = "SQLAlchemy-1.4.48-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5381ddd09a99638f429f4cbe1b71b025bed318f6a7b23e11d65f3eed5e181c33"}, + {file = "SQLAlchemy-1.4.48-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:87609f6d4e81a941a17e61a4c19fee57f795e96f834c4f0a30cee725fc3f81d9"}, + {file = 
"SQLAlchemy-1.4.48-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb0808ad34167f394fea21bd4587fc62f3bd81bba232a1e7fbdfa17e6cfa7cd7"}, + {file = "SQLAlchemy-1.4.48-cp310-cp310-win32.whl", hash = "sha256:d53cd8bc582da5c1c8c86b6acc4ef42e20985c57d0ebc906445989df566c5603"}, + {file = "SQLAlchemy-1.4.48-cp310-cp310-win_amd64.whl", hash = "sha256:4355e5915844afdc5cf22ec29fba1010166e35dd94a21305f49020022167556b"}, + {file = "SQLAlchemy-1.4.48-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:066c2b0413e8cb980e6d46bf9d35ca83be81c20af688fedaef01450b06e4aa5e"}, + {file = "SQLAlchemy-1.4.48-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c99bf13e07140601d111a7c6f1fc1519914dd4e5228315bbda255e08412f61a4"}, + {file = "SQLAlchemy-1.4.48-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ee26276f12614d47cc07bc85490a70f559cba965fb178b1c45d46ffa8d73fda"}, + {file = "SQLAlchemy-1.4.48-cp311-cp311-win32.whl", hash = "sha256:49c312bcff4728bffc6fb5e5318b8020ed5c8b958a06800f91859fe9633ca20e"}, + {file = "SQLAlchemy-1.4.48-cp311-cp311-win_amd64.whl", hash = "sha256:cef2e2abc06eab187a533ec3e1067a71d7bbec69e582401afdf6d8cad4ba3515"}, + {file = "SQLAlchemy-1.4.48-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:3509159e050bd6d24189ec7af373359f07aed690db91909c131e5068176c5a5d"}, + {file = "SQLAlchemy-1.4.48-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fc2ab4d9f6d9218a5caa4121bdcf1125303482a1cdcfcdbd8567be8518969c0"}, + {file = "SQLAlchemy-1.4.48-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e1ddbbcef9bcedaa370c03771ebec7e39e3944782bef49e69430383c376a250b"}, + {file = "SQLAlchemy-1.4.48-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f82d8efea1ca92b24f51d3aea1a82897ed2409868a0af04247c8c1e4fef5890"}, + {file = "SQLAlchemy-1.4.48-cp36-cp36m-win32.whl", hash = "sha256:e3e98d4907805b07743b583a99ecc58bf8807ecb6985576d82d5e8ae103b5272"}, + {file = "SQLAlchemy-1.4.48-cp36-cp36m-win_amd64.whl", hash = "sha256:25887b4f716e085a1c5162f130b852f84e18d2633942c8ca40dfb8519367c14f"}, + {file = "SQLAlchemy-1.4.48-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:0817c181271b0ce5df1aa20949f0a9e2426830fed5ecdcc8db449618f12c2730"}, + {file = "SQLAlchemy-1.4.48-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe1dd2562313dd9fe1778ed56739ad5d9aae10f9f43d9f4cf81d65b0c85168bb"}, + {file = "SQLAlchemy-1.4.48-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:68413aead943883b341b2b77acd7a7fe2377c34d82e64d1840860247cec7ff7c"}, + {file = "SQLAlchemy-1.4.48-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbde5642104ac6e95f96e8ad6d18d9382aa20672008cf26068fe36f3004491df"}, + {file = "SQLAlchemy-1.4.48-cp37-cp37m-win32.whl", hash = "sha256:11c6b1de720f816c22d6ad3bbfa2f026f89c7b78a5c4ffafb220e0183956a92a"}, + {file = "SQLAlchemy-1.4.48-cp37-cp37m-win_amd64.whl", hash = "sha256:eb5464ee8d4bb6549d368b578e9529d3c43265007193597ddca71c1bae6174e6"}, + {file = "SQLAlchemy-1.4.48-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:92e6133cf337c42bfee03ca08c62ba0f2d9695618c8abc14a564f47503157be9"}, + {file = 
"SQLAlchemy-1.4.48-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44d29a3fc6d9c45962476b470a81983dd8add6ad26fdbfae6d463b509d5adcda"}, + {file = "SQLAlchemy-1.4.48-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:005e942b451cad5285015481ae4e557ff4154dde327840ba91b9ac379be3b6ce"}, + {file = "SQLAlchemy-1.4.48-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c8cfe951ed074ba5e708ed29c45397a95c4143255b0d022c7c8331a75ae61f3"}, + {file = "SQLAlchemy-1.4.48-cp38-cp38-win32.whl", hash = "sha256:2b9af65cc58726129d8414fc1a1a650dcdd594ba12e9c97909f1f57d48e393d3"}, + {file = "SQLAlchemy-1.4.48-cp38-cp38-win_amd64.whl", hash = "sha256:2b562e9d1e59be7833edf28b0968f156683d57cabd2137d8121806f38a9d58f4"}, + {file = "SQLAlchemy-1.4.48-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:a1fc046756cf2a37d7277c93278566ddf8be135c6a58397b4c940abf837011f4"}, + {file = "SQLAlchemy-1.4.48-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d9b55252d2ca42a09bcd10a697fa041e696def9dfab0b78c0aaea1485551a08"}, + {file = "SQLAlchemy-1.4.48-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:6dab89874e72a9ab5462997846d4c760cdb957958be27b03b49cf0de5e5c327c"}, + {file = "SQLAlchemy-1.4.48-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fd8b5ee5a3acc4371f820934b36f8109ce604ee73cc668c724abb054cebcb6e"}, + {file = "SQLAlchemy-1.4.48-cp39-cp39-win32.whl", hash = "sha256:eee09350fd538e29cfe3a496ec6f148504d2da40dbf52adefb0d2f8e4d38ccc4"}, + {file = "SQLAlchemy-1.4.48-cp39-cp39-win_amd64.whl", hash = "sha256:7ad2b0f6520ed5038e795cc2852eb5c1f20fa6831d73301ced4aafbe3a10e1f6"}, + {file = "SQLAlchemy-1.4.48.tar.gz", hash = "sha256:b47bc287096d989a0838ce96f7d8e966914a24da877ed41a7531d44b55cdb8df"}, ] [package.dependencies] @@ -2617,7 +2547,7 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -2625,6 +2555,27 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +[[package]] +name = "tqdm" +version = "4.65.0" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.65.0-py3-none-any.whl", hash = "sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671"}, + {file = "tqdm-4.65.0.tar.gz", hash = "sha256:1871fb68a86b8fb3b59ca4cdd3dcccbc7e6d613eeed31f4c332531977b89beb5"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "transaction" version = "3.1.0" @@ -2662,38 +2613,38 @@ docs = ["Sphinx (>=1.3.1)", "docutils", "pylons-sphinx-themes"] [[package]] name = "types-awscrt" -version = "0.16.13.post1" +version = "0.16.17" description = "Type annotations and code completion for awscrt" category = "dev" optional = false python-versions = ">=3.7,<4.0" files = [ - {file = "types_awscrt-0.16.13.post1-py3-none-any.whl", hash = "sha256:697c52422bc3f24302402139ec4511723feb990b5a36a8505a941bbbee1322d5"}, - {file = 
"types_awscrt-0.16.13.post1.tar.gz", hash = "sha256:7f537fc433264a748145ae1148a7a61b33b6f5492d73ef51e5deb1ff8d5d1787"}, + {file = "types_awscrt-0.16.17-py3-none-any.whl", hash = "sha256:e28fb3f20568ce9e96e33e01e0b87b891822f36b8f368adb582553b016d4aa08"}, + {file = "types_awscrt-0.16.17.tar.gz", hash = "sha256:9e447df3ad46767887d14fa9c856df94f80e8a0a7f0169577ab23b52ee37bcdf"}, ] [[package]] name = "types-pyyaml" -version = "6.0.12.9" +version = "6.0.12.10" description = "Typing stubs for PyYAML" category = "dev" optional = false python-versions = "*" files = [ - {file = "types-PyYAML-6.0.12.9.tar.gz", hash = "sha256:c51b1bd6d99ddf0aa2884a7a328810ebf70a4262c292195d3f4f9a0005f9eeb6"}, - {file = "types_PyYAML-6.0.12.9-py3-none-any.whl", hash = "sha256:5aed5aa66bd2d2e158f75dda22b059570ede988559f030cf294871d3b647e3e8"}, + {file = "types-PyYAML-6.0.12.10.tar.gz", hash = "sha256:ebab3d0700b946553724ae6ca636ea932c1b0868701d4af121630e78d695fc97"}, + {file = "types_PyYAML-6.0.12.10-py3-none-any.whl", hash = "sha256:662fa444963eff9b68120d70cda1af5a5f2aa57900003c2006d7626450eaae5f"}, ] [[package]] name = "types-s3transfer" -version = "0.6.0.post7" +version = "0.6.1" description = "Type annotations and code completion for s3transfer" category = "dev" optional = false python-versions = ">=3.7,<4.0" files = [ - {file = "types_s3transfer-0.6.0.post7-py3-none-any.whl", hash = "sha256:d9c669b30fdd61347720434aacb8ecc4645d900712a70b10f495104f9039c07b"}, - {file = "types_s3transfer-0.6.0.post7.tar.gz", hash = "sha256:40e665643f0647832d51c4a26d8a8275cda9134b02bf22caf28198b79bcad382"}, + {file = "types_s3transfer-0.6.1-py3-none-any.whl", hash = "sha256:6d1ac1dedac750d570428362acdf60fdd4f277b0788855c3894d3226756b2bfb"}, + {file = "types_s3transfer-0.6.1.tar.gz", hash = "sha256:75ac1d7143d58c1e6af467cfd4a96c67ee058a3adf7c249d9309999e1f5f41e4"}, ] [package.dependencies] @@ -2701,26 +2652,26 @@ types-awscrt = "*" [[package]] name = "typing-extensions" -version = "4.5.0" +version = "4.6.1" description = "Backported and Experimental Type Hints for Python 3.7+" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "typing_extensions-4.5.0-py3-none-any.whl", hash = "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"}, - {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"}, + {file = "typing_extensions-4.6.1-py3-none-any.whl", hash = "sha256:6bac751f4789b135c43228e72de18637e9a6c29d12777023a703fd1a6858469f"}, + {file = "typing_extensions-4.6.1.tar.gz", hash = "sha256:558bc0c4145f01e6405f4a5fdbd82050bd221b119f4bf72a961a1cfd471349d6"}, ] [[package]] name = "urllib3" -version = "1.26.15" +version = "1.26.16" description = "HTTP library with thread-safe connection pooling, file post, and more." 
category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ - {file = "urllib3-1.26.15-py2.py3-none-any.whl", hash = "sha256:aa751d169e23c7479ce47a0cb0da579e3ede798f994f5816a74e4f4500dcea42"}, - {file = "urllib3-1.26.15.tar.gz", hash = "sha256:8a388717b9476f934a21484e8c8e61875ab60644d29b9b39e11e4b9dc1c6b305"}, + {file = "urllib3-1.26.16-py2.py3-none-any.whl", hash = "sha256:8d36afa7616d8ab714608411b4a3b13e58f463aee519024578e062e141dce20f"}, + {file = "urllib3-1.26.16.tar.gz", hash = "sha256:8f135f6502756bde6b2a9b28989df5fbe87c9970cecaa69041edcce7f0589b14"}, ] [package.extras] @@ -2790,14 +2741,14 @@ testing = ["coverage", "pytest (>=3.1.0)", "pytest-cov", "pytest-xdist"] [[package]] name = "websocket-client" -version = "1.5.1" +version = "1.5.2" description = "WebSocket client for Python with low level API options" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "websocket-client-1.5.1.tar.gz", hash = "sha256:3f09e6d8230892547132177f575a4e3e73cfdf06526e20cc02aa1c3b47184d40"}, - {file = "websocket_client-1.5.1-py3-none-any.whl", hash = "sha256:cdf5877568b7e83aa7cf2244ab56a3213de587bbe0ce9d8b9600fc77b455d89e"}, + {file = "websocket-client-1.5.2.tar.gz", hash = "sha256:c7d67c13b928645f259d9b847ab5b57fd2d127213ca41ebd880de1f553b7c23b"}, + {file = "websocket_client-1.5.2-py3-none-any.whl", hash = "sha256:f8c64e28cd700e7ba1f04350d66422b6833b82a796b525a51e740b8cc8dab4b1"}, ] [package.extras] @@ -2829,21 +2780,21 @@ tests = ["PasteDeploy", "WSGIProxy2", "coverage", "mock", "nose (<1.3.0)", "pyqu [[package]] name = "werkzeug" -version = "2.2.3" +version = "2.3.4" description = "The comprehensive WSGI web application library." category = "dev" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"}, - {file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"}, + {file = "Werkzeug-2.3.4-py3-none-any.whl", hash = "sha256:48e5e61472fee0ddee27ebad085614ebedb7af41e88f687aaf881afb723a162f"}, + {file = "Werkzeug-2.3.4.tar.gz", hash = "sha256:1d5a58e0377d1fe39d061a5de4469e414e78ccb1e1e59c0f5ad6fa1c36c52b76"}, ] [package.dependencies] MarkupSafe = ">=2.1.1" [package.extras] -watchdog = ["watchdog"] +watchdog = ["watchdog (>=2.3)"] [[package]] name = "wheel" @@ -2995,5 +2946,5 @@ test = ["zope.testing"] [metadata] lock-version = "2.0" -python-versions = ">=3.8.1,<3.9" -content-hash = "db79cdb884068977d9167b4f32c61d7acc29eb073799ba1635a5036e7db40e91" +python-versions = ">=3.8.1,<3.10" +content-hash = "7e2fc0b62f4f41c4abb2c5383b7e5d6abf05f6a9890578af5b3231ee8ddbff68" diff --git a/pyproject.toml b/pyproject.toml index aa60a92ef..1adbce250 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicsnovault" -version = "8.0.1" +version = "9.0.0" description = "Storage support for 4DN Data Portals." authors = ["4DN-DCIC Team "] license = "MIT" @@ -23,7 +23,6 @@ classifiers = [ 'Intended Audience :: Science/Research', 'Framework :: Pyramid', - # Pick your license as you wish (should match "license" above) 'License :: OSI Approved :: MIT License', 'Topic :: Database :: Database Engines/Servers', @@ -32,30 +31,30 @@ classifiers = [ # that you indicate whether you support Python 2, Python 3 or both. 
'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', ] [tool.poetry.dependencies] -python = ">=3.8.1,<3.9" +python = ">=3.8.1,<3.10" aws_requests_auth = "^0.4.1" -# TODO: This is a backport of Python's statistics library for versions earlier than Python 3.4, -# so may no longer be needed. Something to investigate later. -kmp 20-Feb-2020 -# "backports.statistics" = "0.1.0" -botocore = ">=1.27.36" # no particular version required, but this speeds up search -boto3 = ">=1.24.36" # no particular version required, but this speeds up search +botocore = ">=1.26.133" # no particular version required, but this speeds up search +boto3 = ">=1.26.133" # no particular version required, but this speeds up search elasticsearch = "7.13.4" # versions >= 7.14.0 lock out AWS ES elasticsearch_dsl = "^7.4.0" -dcicutils = "^7.0.0" +#dcicutils = "7.4.4.5b21" +dcicutils = "^7.5.0" future = ">=0.15.2,<1" html5lib = ">=1.1" # experimental, should be OK now that we're not using moto server humanfriendly = "^1.44.9" jsonschema_serialize_fork = "^2.1.1" netaddr = ">=0.8.0,<1" passlib = "^1.7.4" +pillow = "^9.5.0" psutil = "^5.9.0" psycopg2-binary = "^2.9.1" PyBrowserID = ">=0.10.0,<1" +pyjwt = "^2.6.0" pyramid = "1.10.4" -pyramid_localroles = ">=0.1,<1" pyramid-multiauth = ">=0.9.0,<1" pyramid-retry = "^1.0" pyramid-tm = "^2.5" @@ -65,6 +64,7 @@ python_magic = ">=0.4.27" pytz = ">=2021.3" rdflib = "^4.2.2" rdflib-jsonld = ">=0.5.0,<1.0.0" +redis = "^4.5.1" rutter = ">=0.3,<1" simplejson = "^3.17.6" SPARQLWrapper = "^1.8.5" @@ -83,28 +83,24 @@ xlrd = "^1.0.0" "zope.deprecation" = "^4.4.0" "zope.interface" = ">=4.7.2,<6" "zope.sqlalchemy" = "1.6" -pytest-redis = "^2.0.0" -redis = "^4.5.1" [tool.poetry.dev-dependencies] -botocore-stubs = ">=1.27.36" # no particular version required, but this speeds up search -boto3-stubs = ">=1.24.36" # no particular version required, but this speeds up search +botocore-stubs = ">=1.29.119" # no particular version required, but this speeds up search +boto3-stubs = ">=1.26.119" # no particular version required, but this speeds up search coverage = ">=6.2" codacy-coverage = ">=1.3.11" -coveralls = ">=3.3.1" +# When we add coverage, this must be loaded manually in GA workflow for coverage because a dependency on 2to3 +# in its docopts dependency makes a problem for laoding it here in poetry. -kmp 25-Apr-2023 +# coveralls = ">=3.3.1" docutils = ">=0.16,<1" flake8 = ">=3.9.2" flaky = ">=3.7.0" -# moto 2.0.0 (see https://github.com/spulec/moto/blob/master/CHANGELOG.md) has incompatibilities in how it needs -# to be configured, so will require some adaptation. moto 1.3,14 will support python 3.8, but python 3.9 support -# needs moto 2.2.5. If we do eventually upgrade, there are some tests in test_storage.py that may be possible to -# simplify. -kmp 5-Feb-2022 -# -# We tried 1.3.14 but it adds stuff we don't need (until Python 3.8) and it doesn't work as well. -# See https://github.com/spulec/moto/blob/master/CHANGELOG.md moto = "^4.0.3" +PasteDeploy = "1.5.2" +plaster = "1.0" +plaster-pastedeploy = "0.6" pipdeptree = ">=2.3.3" -pip-licenses = "^3.5.3" +# pip-licenses = ">=3.5.3" # Not even sure we need an explicit dependence on Pillow, though it might help keep from searching older versions. # -kmp 22-Feb-2022 Pillow = ">=6.2.2" # later version known to work - Will 11/17/20 @@ -116,6 +112,7 @@ pytest-cov = ">=2.2.1" pytest-instafail = ">=0.3.0" # TODO: Investigate whether a major version upgrade is allowable for 'pytest-mock'. 
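As an editorial aside on the caret/tilde constraints used in the dependency list above (for example dcicutils = "^7.5.0"): the setup_eb.py script added later in this diff rewrites such poetry-style constraints into setuptools specifiers. A minimal Python sketch of that translation, paraphrasing the fix_requirement helper (simplified, and using `or ""` to tolerate a missing minor/patch part, which is an assumption beyond the original), is:

import re

_CARET = re.compile(r"\^([0-9]+)([.].*)?$")             # e.g. "^7.5.0"
_TILDE = re.compile(r"~([0-9]+[.])([0-9]+)([.].*)?$")   # e.g. "~1.2.3"

def fix_requirement(req):
    # Sketch: poetry constraint -> setuptools specifier, as setup_eb.py does.
    m = _CARET.match(req)
    if m:   # "^7.5.0" -> ">=7.5.0,<8"
        return ">=%s%s,<%s" % (m.group(1), m.group(2) or "", int(m.group(1)) + 1)
    m = _TILDE.match(req)
    if m:   # "~1.2.3" -> ">=1.2.3,<1.3"
        return ">=%s%s%s,<%s%s" % (m.group(1), m.group(2), m.group(3) or "",
                                   m.group(1), int(m.group(2)) + 1)
    # Bare versions become pins; anything else passes through unchanged.
    return "==" + req if req[0].isdigit() else req

assert fix_requirement("^7.5.0") == ">=7.5.0,<8"
assert fix_requirement("~1.2.3") == ">=1.2.3,<1.3"
assert fix_requirement(">=0.15.2,<1") == ">=0.15.2,<1"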
pytest-mock = ">=0.11.0" +pytest-redis = "^2.0.0" # TODO: Investigate whether a major version upgrade is allowable for 'pytest-runner'. pytest-runner = ">=4.0" pytest-timeout = ">=1.0.0" @@ -128,13 +125,25 @@ pytest-timeout = ">=1.0.0" # And we only need .safe_load in testing, so we're moving this to dev dependencies. -kmp 22-Feb-2022 PyYAML = ">=5.1,<5.5" "repoze.debug" = ">=1.0.2" -# Used only by moto, not explicitly by us. -# responses = "^0.17.0" # 0.17.0 is the last version compliant with Python 3.6 -wheel = ">=0.29.0" +wheel = ">=0.40.0" [tool.poetry.scripts] +dev-servers-snovault = "snovault.dev_servers:main" +list-db-tables = "snovault.commands.list_db_tables:main" +prepare-local-dev = "snovault.commands.prepare_template:prepare_local_dev_main" +publish-to-pypi = "dcicutils.scripts.publish_to_pypi:main" wipe-test-indices = "snovault.commands.wipe_test_indices:main" +[paste.app_factory] +main = "snovault:main" + +[paste.composite_factory] +indexer = "snovault.elasticsearch.es_index_listener:composite" +ingester = "snovault.ingestion.ingestion_listener:composite" + +# [paste.filter_app_factory] +# memlimit = "encoded.memlimit:filter_app" + [build-system] requires = ["poetry_core>=1.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/pytest.ini b/pytest.ini index e047a1b51..d47e76a3f 100644 --- a/pytest.ini +++ b/pytest.ini @@ -15,6 +15,7 @@ markers = integrated: an integration test integratedx: an excludable integration test, redundantly testing functionality also covered by a unit test performance: mark a test as a performance test (deselect with '-m "not performance"') + setone: (deprecated) instead of '-m "setone"', please use '-m "not indexing"', TODO: refs and scripts to be rewritten slow: mark a test as slow (deselect with '-m "not slow"') static: mark as a test that is testing the static form of code, not its runtime functionality storage: mark a test as about storage (deselect with '-m "not storage"') diff --git a/scripts/fix-dist-info b/scripts/fix-dist-info new file mode 100755 index 000000000..b82ea945e --- /dev/null +++ b/scripts/fix-dist-info @@ -0,0 +1,34 @@ +#!/bin/bash + +# This is a workaround for a problem where some versions of poetry, +# stray .dist-info files created by "python setup_eb.py develop" +# seem to break "make deploy1". With poetry 1.0.10 loaded, that +# doesn't seem to happen, but with later versions it does. So we've +# put this check in place just in case. Hopefully in the future we +# can remove this workaround, which is pretty ugly. -kmp 11-Nov-2020 + +# First we find out if there are any .dist-info files. +# If there are none, the 'ls' command will fail, hopefully quietly, +# and the value of dist_info_found will be the empty string. +# Otherwise, it will be the file listing. + +repo_name=`grep "url =" .git/config | sed -E 's|^.*/([^.]*)[.]git.*$|\1|g'` +dist_info_found=`ls -dal ${VIRTUAL_ENV}/lib/python3.[0-9]*/site-packages/${repo_name}-[0-9]*.[0-9]*.dist-info 2>/dev/null` + +if [ "${repo_name}" = "cgap-portal" -o "${repo_name}" = "fourfront" ]; then + dist_name=encoded +else + dist_name=${repo_name} +fi + +# Now we test whether there are files to delete, and if there are we do it. + +if [ -n "${dist_info_found}" ]; then + echo "Unwanted .dist_info files for the '${dist_name}' library were found:" + echo "${dist_info_found}" + echo "Cleaning up..." + rm -rf ${VIRTUAL_ENV}/lib/python3.[0-9]*/site-packages/${dist_name}-[0-9]*.[0-9]*.dist-info + echo "Done cleaning up." 
+else + echo "No unwanted .dist_info files for the '${dist_name}' library found." +fi diff --git a/scripts/macpoetry-install b/scripts/macpoetry-install index 675f1b155..d613df170 100755 --- a/scripts/macpoetry-install +++ b/scripts/macpoetry-install @@ -1,3 +1,6 @@ #!/bin/bash +# For some reason on Mac M1 (as of July 2023) pyyaml install via poetry is problematic. +pip install pyyaml==5.4.1 + CFLAGS="-I$(brew --prefix zlib)/include" LDFLAGS="-L$(brew --prefix zlib)/lib" poetry install diff --git a/scripts/psql-start b/scripts/psql-start new file mode 100755 index 000000000..ddc75949d --- /dev/null +++ b/scripts/psql-start @@ -0,0 +1,75 @@ +#!/bin/bash + +port=$1 + +if [ ! -f development.ini -o ! -f test.ini ]; then + echo "Correcting absence of one or more .ini files..." + prepare-local-dev +fi + +dev_url_line=`grep 'sqlalchemy[.]url =' development.ini` + +dev_url=`echo "${dev_url_line}" | sed -E 's/^.* = (.*)$/\1/'` +dev_port=`echo "${dev_url_line}" | sed -E 's|^.* = .*:([0-9]+)/postgres[?].*$|\1|'` + + +# echo "dev_url=${dev_url}" +# echo "dev_port=${dev_port}" + + +# There seem be two processes, one for postgres and one for postgres-engine. +# The relevant data can be obtained from either, but matching both +# the match for postgres[^-] excludes the matches on postgres-engine so we +# can assume the match is unique. + +if [ "$port" = 'test' ]; then + + test_process=`ps aux | grep '.*[p]ostgres -D.*/private[a-zA-Z0-9_/-]*/postgresql[^-]'` + + if [ -z "${test_process}" ]; then + + echo "No test process found." + exit 1 + + else + + test_url=`echo "$test_process" | sed -E 's|^.*postgres[ ]+-D[ ]+([/a-zA-Z0-9_-]+)[ ]+.*-p[ ]+([0-9]+)([^0-9].*)?$|postgresql://postgres@localhost:\2/postgres?host=\1|'` + psql "${test_url}" + + # psql `ps aux | grep '.*[p]ostgres -D.*/private[a-zA-Z0-9_/-]*/postgresql[^-]' | sed -E 's|^.*postgres[ ]+-D[ ]+([/a-zA-Z0-9_-]+)[ ]+.*-p[ ]+([0-9]+)([^0-9].*)?$|postgresql://postgres@localhost:\2/postgres?host=\1|'` + + fi + +elif [ "$port" = 'dev' -o "$port" = "$dev_port" ]; then + + dev_url=`grep 'sqlalchemy[.]url =' development.ini | sed -E 's/^.* = (.*)/\1/'` + psql "${dev_url}" + +elif [[ "${port}" =~ ^[0-9]+$ ]]; then + + port_process=`ps aux | grep ".*[p]ostgres -D.*/private[a-zA-Z0-9_/-]*/postgresql[^-].*-p[ ]+${port}.*"` + + if [ -z "${port_process}" ]; then + + echo "No postgres process found on port ${port}." + exit 1 + + else + + port_url=`echo "$test_process" | sed -E 's|^.*postgres[ ]+-D[ ]+([/a-zA-Z0-9_-]+)[ ]+.*-p[ ]+([0-9]+)([^0-9].*)?$|postgresql://postgres@localhost:\2/postgres?host=\1|'` + psql "${port_url}" + + # psql `ps aux | grep '.*[p]ostgres -D.*/private[a-zA-Z0-9_/-]*/postgresql[^-]' | sed -E 's|^.*postgres[ ]+-D[ ]+([/a-zA-Z0-9_-]+)[ ]+.*-p[ ]+([0-9]+)([^0-9].*)?$|postgresql://postgres@localhost:\2/postgres?host=\1|'` + + fi + +else + + echo "Syntax: $0 [ | test | dev ]" + echo "" + echo "Starts psql for debugging in a way that corresponds to the given port." + echo "The port can be an integer or one of the special tokens 'dev' or 'test'." + echo "If 'dev' is given, the port from development.ini (currently '${dev_port}') is used." + echo "If 'test' is given, the port will be found from data in 'ps aux'." 
+ +fi diff --git a/setup_eb.py b/setup_eb.py new file mode 100644 index 000000000..949da1423 --- /dev/null +++ b/setup_eb.py @@ -0,0 +1,97 @@ +import os +import re +import toml.decoder + +from setuptools import setup, find_packages + + +ROOT_DIR = os.path.abspath(os.path.dirname(__file__)) + +PYPROJECT_TOML = toml.decoder.load(os.path.join(ROOT_DIR, 'pyproject.toml')) +POETRY_DATA = PYPROJECT_TOML['tool']['poetry'] + +_CARET_MATCH = re.compile(r"[\^]([0-9]+)([.].*)?$") +_TILDE_MATCH = re.compile(r"[~]([0-9]+[.])([0-9]+)([.].*)?$") + + +def fix_requirement(requirement): + m = _CARET_MATCH.match(requirement) + if m: + return ">=%s%s,<%s" % (m.group(1), m.group(2), int(m.group(1)) + 1) + m = _TILDE_MATCH.match(requirement) + if m: + return ">=%s%s%s,<%s%s" % (m.group(1), m.group(2), m.group(3), m.group(1), int(m.group(2)) + 1) + if requirement[0].isdigit(): + return "==" + requirement + else: + return requirement + + +_EMAIL_MATCH = re.compile(r"^([^<]*)[<]([^>]*)[>]$") + + +def author_and_email(authorship_spec): + m = _EMAIL_MATCH.match(authorship_spec) + if m: + return m.group(1), m.group(2) + else: + raise ValueError("Expect authorship in format 'human_name ': %s" % authorship_spec) + + +def get_requirements(kind='dependencies'): + return [ + pkg + fix_requirement(requirement) + for pkg, requirement in POETRY_DATA[kind].items() + if pkg != "python" + ] + + +def flatten_config_data(key, dictionary): + return "%s\n%s\n\n" % (key, "\n".join([ + key + " = " + val + for key, val in dictionary.items() + ])) + + +def entry_points(): + result = flatten_config_data("[console_scripts]", POETRY_DATA['scripts']) + paste_dict = PYPROJECT_TOML['paste'] + for subkey in paste_dict: + result += flatten_config_data('[paste.%s]' % subkey, paste_dict[subkey]) + return result + + +ENTRY_POINTS = entry_points() + +PACKAGE_NAME = POETRY_DATA['name'].replace('dcic','') +SUBDIR = POETRY_DATA['packages'][0]['from'] +README = open(os.path.join(ROOT_DIR, 'README.rst')).read() +DESCRIPTION = POETRY_DATA['description'] +LONG_DESCRIPTION = README +AUTHOR, AUTHOR_EMAIL = author_and_email(POETRY_DATA['authors'][0]) +URL = 'http://data.4dnucleome.org' +LICENSE = 'MIT' +INSTALL_REQUIRES = get_requirements() +TESTS_REQUIRE = get_requirements('dev-dependencies') +VERSION = POETRY_DATA['version'] + +if __name__ == '__main__': + + setup( + name=PACKAGE_NAME, + version=VERSION, + description=DESCRIPTION, + long_description=LONG_DESCRIPTION, + packages=find_packages(SUBDIR), + package_dir={'': SUBDIR}, + include_package_data=True, + zip_safe=False, + author=AUTHOR, + author_email=AUTHOR_EMAIL, + url=URL, + license=LICENSE, + install_requires=INSTALL_REQUIRES, + tests_require=TESTS_REQUIRE, + extras_require={'test': TESTS_REQUIRE}, + entry_points=ENTRY_POINTS, + ) diff --git a/snovault/__init__.py b/snovault/__init__.py index 5e63fc33e..b08c05da2 100644 --- a/snovault/__init__.py +++ b/snovault/__init__.py @@ -5,7 +5,7 @@ from dcicutils.log_utils import set_logging from pyramid.config import Configurator from pyramid.settings import asbool -from pyramid_localroles import LocalRolesAuthorizationPolicy +from .local_roles import LocalRolesAuthorizationPolicy from .app import app_version, session, configure_dbsession, changelogs, json_from_path from .calculated import calculated_property # noqa @@ -21,30 +21,35 @@ def includeme(config): config.include('pyramid_retry') config.include('pyramid_tm') - config.include('.util') - config.include('.stats') - config.include('.batchupgrade') - config.include('.calculated') - 
config.include('.config') - config.include('.connection') - config.include('.embed') - config.include('.json_renderer') - config.include('.validation') - config.include('.predicates') - config.include('.invalidation') - config.include('.upgrader') - config.include('.aggregated_items') - config.include('.storage') - config.include('.typeinfo') - config.include('.resources') - config.include('.attachment') - config.include('.schema_graph') - config.include('.jsonld_context') - config.include('.schema_views') - config.include('.crud_views') - config.include('.indexing_views') - config.include('.resource_views') - config.include('.settings') + config.include('snovault.authentication') + config.include('snovault.util') + config.include('snovault.drs') + config.include('snovault.stats') + config.include('snovault.batchupgrade') + config.include('snovault.calculated') + config.include('snovault.config') + config.include('snovault.connection') + config.include('snovault.custom_embed') + config.include('snovault.embed') + config.include('snovault.json_renderer') + config.include('snovault.validation') + config.include('snovault.predicates') + config.include('snovault.invalidation') + config.include('snovault.upgrader') + config.include('snovault.aggregated_items') + config.include('snovault.storage') + config.include('snovault.typeinfo') + config.include('snovault.types') + config.include('snovault.resources') + config.include('snovault.attachment') + config.include('snovault.schema_graph') + config.include('snovault.jsonld_context') + config.include('snovault.schema_views') + config.include('snovault.crud_views') + config.include('snovault.indexing_views') + config.include('snovault.resource_views') + config.include('snovault.settings') + config.include('snovault.server_defaults') def main(global_config, **local_config): @@ -83,16 +88,19 @@ def main(global_config, **local_config): config.include('.renderers') + if settings.get('elasticsearch.server'): + config.include('snovault.search.search') + config.include('snovault.search.compound_search') + # only include this stuff if we're testing if asbool(settings.get('testing', False)): config.include('snovault.tests.testing_views') - config.include('snovault.tests.authentication') config.include('snovault.tests.root') - if settings.get('elasticsearch.server'): - config.include('snovault.tests.search') # in addition, enable invalidation scope for testing - but NOT by default settings[INVALIDATION_SCOPE_ENABLED] = True + else: + config.include('snovault.root') if 'elasticsearch.server' in config.registry.settings: config.include('snovault.elasticsearch') diff --git a/snovault/app.py b/snovault/app.py index 2e6ef36a5..91c75d202 100644 --- a/snovault/app.py +++ b/snovault/app.py @@ -4,6 +4,7 @@ import json import os import psycopg2 +import psycopg2.extensions import subprocess import zope.sqlalchemy @@ -12,8 +13,8 @@ from pyramid.path import AssetResolver, caller_package from pyramid.session import SignedCookieSessionFactory from pyramid.settings import asbool -from pyramid_localroles import LocalRolesAuthorizationPolicy -from sqlalchemy import engine_from_config, event, orm +from .local_roles import LocalRolesAuthorizationPolicy +from sqlalchemy import engine_from_config, event, orm # , text as psql_text from webob.cookies import JSONSerializer from .interfaces import DBSESSION @@ -82,7 +83,13 @@ def connect(dbapi_connection, connection_record): timeout_ms = int(timeout_ms) cursor = dbapi_connection.cursor() try: - cursor.execute("SET statement_timeout TO 
%d" % timeout_ms) + # cursor: psycopg2.extensions.cursor + # This call to psycopg2.extensions.cursor.execute expects a real string. Giving it an sqlalchemy.text + # object will fail because something will try to do a boolean test, probably "if thing_to_execute:..." + # and __bool__ is not defined on sqlalchemy.txt + # Bottom line: Cannot wrap this string with psql_text(...) like we do elsewhere. It's not ready. + # Might be we could do such a wrapper if we called execute on some other object. + cursor.execute("SET statement_timeout = %d;" % timeout_ms) except psycopg2.Error: dbapi_connection.rollback() finally: diff --git a/snovault/appdefs.py b/snovault/appdefs.py new file mode 100644 index 000000000..221d2e113 --- /dev/null +++ b/snovault/appdefs.py @@ -0,0 +1,7 @@ +APP_VERSION_REGISTRY_KEY = 'snovault.app_version' + +# This order determines order that items will be mapped + added to the queue +# Can use item type (e.g. file_fastq) or class name (e.g. FileFastq) +# This order is not meaningful for snovault as it is not a standalone +# app. This value should be overridden in the downstream application +ITEM_INDEX_ORDER = [] diff --git a/snovault/authentication.py b/snovault/authentication.py new file mode 100644 index 000000000..947f99c2d --- /dev/null +++ b/snovault/authentication.py @@ -0,0 +1,695 @@ +import base64 +import datetime +import jwt +import os +import requests +import structlog + +from dateutil.parser import isoparse +from dcicutils.lang_utils import conjoined_list +from dcicutils.misc_utils import remove_element, ignorable, ignored +from operator import itemgetter +from passlib.context import CryptContext +from pyramid.authentication import ( + BasicAuthAuthenticationPolicy as _BasicAuthAuthenticationPolicy, + CallbackAuthenticationPolicy +) +from pyramid.httpexceptions import HTTPForbidden, HTTPUnauthorized +from pyramid.path import DottedNameResolver, caller_package +from pyramid.security import NO_PERMISSION_REQUIRED +from pyramid.view import view_config +from snovault import ROOT, COLLECTIONS +from snovault.calculated import calculate_properties +from snovault.crud_views import collection_add as sno_collection_add +from snovault.project_app import app_project +from snovault.schema_utils import validate_request +from snovault.util import debug_log +from snovault.validation import ValidationFailure +from snovault.validators import no_validate_item_content_post +from urllib.parse import urlencode + + +log = structlog.getLogger(__name__) + + +CRYPT_CONTEXT = __name__ + ':crypt_context' + + +JWT_ENCODING_ALGORITHM = 'HS256' + +# Might need to keep a list of previously used algorithms here, not just the one we use now. +# Decryption algorithm used to default to a long list, but more recent versions of jwt library +# say we should stop assuming that. +# +# In case it goes away, as far as I can tell, the default for decoding from their +# default_algorithms() method used to be what we've got in JWT_ALL_ALGORITHMS here. +# -kmp 15-May-2020 + +JWT_ALL_ALGORITHMS = ['ES512', 'RS384', 'HS512', 'ES256', 'none', + 'RS256', 'PS512', 'ES384', 'HS384', 'ES521', + 'PS384', 'HS256', 'PS256', 'RS512'] + +# Probably we could get away with fewer, but I think not as few as just our own encoding algorithm, +# so for now I believe the above list was the default, and this just rearranges it to prefer the one +# we use for encoding. 
-kmp 19-Jan-2021 + +JWT_DECODING_ALGORITHMS = [JWT_ENCODING_ALGORITHM] + remove_element(JWT_ENCODING_ALGORITHM, JWT_ALL_ALGORITHMS) + + +# envs where the back-end will accept automated user registration +# TODO: move to dcicutils +AUTO_REGISTRATION_ENVS = ['cgap-training'] + + +def includeme(config): + config.include('.edw_hash') + setting_prefix = 'passlib.' + passlib_settings = { + k[len(setting_prefix):]: v + for k, v in config.registry.settings.items() + if k.startswith(setting_prefix) + } + if not passlib_settings: + passlib_settings = {'schemes': 'edw_hash, unix_disabled'} + crypt_context = CryptContext(**passlib_settings) + config.registry[CRYPT_CONTEXT] = crypt_context + + # basic login route + config.add_route('login', '/login') + config.add_route('logout', '/logout') + config.add_route('me', '/me') + config.add_route('impersonate-user', '/impersonate-user') + config.add_route('session-properties', '/session-properties') + config.add_route('create-unauthorized-user', '/create-unauthorized-user') + config.scan(__name__) + + +class NamespacedAuthenticationPolicy(object): + """ Wrapper for authentication policy classes + + As userids are included in the list of principals, it seems good practice + to namespace them to avoid clashes. + + Constructor Arguments + + ``namespace`` + + The namespace used (string). + + ``base`` + + The base authentication policy (class or dotted name). + + Remaining arguments are passed to the ``base`` constructor. + + Example + + To make a ``REMOTE_USER`` 'admin' be 'user.admin' + + .. code-block:: python + + policy = NamespacedAuthenticationPolicy('user', + 'pyramid.authentication.RemoteUserAuthenticationPolicy') + """ + + def __new__(cls, namespace, base, *args, **kw): + # Dotted name support makes it easy to configure with pyramid_multiauth + name_resolver = DottedNameResolver(caller_package()) + base = name_resolver.maybe_resolve(base) + # Dynamically create a subclass + name = 'Namespaced_%s_%s' % (namespace, base.__name__) + klass = type(name, (cls, base), {'_namespace_prefix': namespace + '.'}) + return super(NamespacedAuthenticationPolicy, klass).__new__(klass) + + def __init__(self, namespace, base, *args, **kw): + ignored(namespace, base) # TODO: SHOULD this be ignored? + super().__init__(*args, **kw) + + def unauthenticated_userid(self, request): + return app_project().namespaced_authentication_policy_unauthenticated_userid(self, request) + + def _unauthenticated_userid_implementation(self, request): + userid = super().unauthenticated_userid(request) + if userid is not None: + userid = self._namespace_prefix + userid + return userid + + def authenticated_userid(self, request, set_user_info_property=True): + # TODO: Maybe something like ... + # return app_project().login_policy.authenticated_userid(request, set_user_info_property) + return app_project().namespaced_authentication_policy_authenticated_userid(self, request, set_user_info_property) + + def _authenticated_userid_implementation(self, request, set_user_info_property=True): + """ + Adds `request.user_info` for all authentication types. + Fetches and returns some user details if called. + """ + namespaced_userid = super().authenticated_userid(request) + + if not set_user_info_property: + return namespaced_userid + + if namespaced_userid is not None: + # userid, if present, may be in form of UUID (if remoteuser) or an email (if Auth0). 
+ namespace, userid = namespaced_userid.split(".", 1) + + # Allow access basic user credentials from request obj after authenticating & saving request + def get_user_info(request): + user_props = request.embed('/session-properties', as_user=userid) # Performs an authentication against DB for user. + if not user_props.get('details'): + raise HTTPUnauthorized( + title="Could not find user info for {}".format(userid), + headers={ + 'WWW-Authenticate': + "Bearer realm=\"{}\"; Basic realm=\"{}\"".format(request.domain, request.domain) + } + ) + return user_props + + # If not authenticated (not in our DB), request.user_info will throw an HTTPUnauthorized error. + request.set_property(get_user_info, "user_info", True) + + return namespaced_userid + + def remember(self, request, principal, **kw): + if not principal.startswith(self._namespace_prefix): + return [] + principal = principal[len(self._namespace_prefix):] + return super().remember(request, principal, **kw) + + +class BasicAuthAuthenticationPolicy(_BasicAuthAuthenticationPolicy): + def __init__(self, check, *args, **kw): + # Dotted name support makes it easy to configure with pyramid_multiauth + name_resolver = DottedNameResolver(caller_package()) + check = name_resolver.maybe_resolve(check) + super().__init__(check, *args, **kw) + + +class LoginDenied(HTTPUnauthorized): + title = 'Login Failure' + + def __init__(self, domain=None, *args, **kwargs): + super().__init__(*args, **kwargs) + if not self.headers.get('WWW-Authenticate') and domain: + # headers['WWW-Authenticate'] might be set in constructor thru headers + self.headers['WWW-Authenticate'] = "Bearer realm=\"{}\"; Basic realm=\"{}\"".format(domain, domain) + + +_fake_user = object() + + +class Auth0AuthenticationPolicy(CallbackAuthenticationPolicy): + + login_path = '/login' + method = 'POST' + + def unauthenticated_userid(self, request): + """ + So basically this is used to do a login, instead of the actual + login view... not sure why, but yeah.. + """ + + # we will cache it for the life of this request, cause pyramids does traversal + cached = getattr(request, '_auth0_authenticated', _fake_user) + + if cached is not _fake_user: + return cached + + # try to find the token in the request (should be in the header) + id_token = get_jwt(request) + if not id_token: + # can I thrown an 403 here? + # print('Missing assertion.', 'unauthenticated_userid', request) + return None + + jwt_info = self.get_token_info(id_token, request) + if not jwt_info: + return None + + email = request._auth0_authenticated = jwt_info['email'].lower() + + # At this point, email has been authenticated with their Auth0 provider and via `get_token_info`, + # but we don't know yet if this email is in our database. `authenticated_userid` should take care of this. + + app_project().note_auth0_authentication_policy_unauthenticated_userid(self, request, email, id_token) + + return email + + @staticmethod + def email_is_partners_or_hms(payload): + """ + Checks that the given JWT payload belongs to a partners email. + """ + for identity in payload.get('identities', []): # if auth0 decoded + if identity.get('connection', '') in ['partners', 'hms-it']: + return True + + # XXX: Refactor to use regex? Also should potentially be data-driven? 
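As a hedged illustration of the payload shape the checks that follow expect (all values below are invented placeholders, only the field names come from this method):

# Hypothetical decoded Auth0 payload that email_is_partners_or_hms would
# accept via the identities branch, regardless of the later 'sub' checks.
payload = {
    "email": "someone@example.harvard.edu",
    "email_verified": True,
    "identities": [{"connection": "hms-it"}],   # matches the 'partners'/'hms-it' check
    "sub": "auth0|example-user-id",             # placeholder subject
}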
+ if 'partners' in payload.get('sub', ''): + return True + elif 'harvard.edu' in payload.get('sub', ''): + return True + elif payload.get('email_verified'): + return True + else: + return False + + @staticmethod + def get_token_info(token, request): + """ + Given a jwt get token info from auth0, handle retrying and whatnot. + This is only called if we receive a Bearer token in Authorization header. + """ + try: + # lets see if we have an auth0 token or our own + registry = request.registry + auth0_client = registry.settings.get('auth0.client') + auth0_secret = registry.settings.get('auth0.secret') + if auth0_client and auth0_secret: + # leeway accounts for clock drift between us and auth0 + payload = jwt.decode(token, auth0_secret, + algorithms=JWT_DECODING_ALGORITHMS, + audience=auth0_client, leeway=30) + if 'email' in payload and Auth0AuthenticationPolicy.email_is_partners_or_hms(payload): + request.set_property(lambda r: False, 'auth0_expired') + return payload + + else: # we don't have the key, let auth0 do the work for us + warn_msg = "No Auth0 keys present - falling back to making outbound network request to have Auth0 validate for us" + log.warning(warn_msg) + user_url = "https://{domain}/tokeninfo".format(domain='hms-dbmi.auth0.com') + resp = requests.post(user_url, {'id_token': token}) + payload = resp.json() + if 'email' in payload and Auth0AuthenticationPolicy.email_is_partners_or_hms(payload): + request.set_property(lambda r: False, 'auth0_expired') + return payload + + except jwt.exceptions.ExpiredSignatureError as e: + ignorable(e) + # Normal/expected expiration. + + # Allow us to return 403 code &or unset cookie in renderers.py + request.set_property(lambda r: True, 'auth0_expired') + + return None + + except (ValueError, jwt.exceptions.InvalidTokenError, jwt.exceptions.InvalidKeyError) as e: + # Catch errors from decoding JWT or unauthorized users. + print('Invalid JWT assertion : %s (%s)' % (e, type(e).__name__)) + log.error("Error with JWT token (now unset) - " + str(e)) + request.set_property(lambda r: True, 'auth0_expired') # Allow us to return 403 code &or unset cookie in renderers.py + return None + + print("didn't get email or email is not verified") + return None + + +def get_jwt_from_auth_header(request): + if "Authorization" in request.headers: + try: + # Ensure this is a JWT token, not basic auth. + # Per https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication and + # https://tools.ietf.org/html/rfc6750, JWT is introduced by 'bearer', as in + # Authorization: Bearer something.something.something + # rather than, for example, the 'basic' key information, which as discussed in + # https://tools.ietf.org/html/rfc7617 is base64 encoded and looks like: + # Authorization: Basic QWxhZGRpbjpvcGVuIHNlc2FtZQ== + # See also https://jwt.io/introduction/ for other info specific to JWT. 
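For reference, a minimal sketch of the HS256 encode/decode round trip that get_token_info (above) and impersonate_user (later in this file) rely on; the client id and secret here are placeholders, and pyjwt 2.x behavior (encode returns a str) is assumed:

import jwt  # pyjwt

AUTH0_CLIENT = "example-client-id"       # placeholder, not a real client id
AUTH0_SECRET = "example-client-secret"   # placeholder, not a real secret

token = jwt.encode(
    {"email": "user@example.edu", "email_verified": True, "aud": AUTH0_CLIENT},
    AUTH0_SECRET, algorithm="HS256",
)
payload = jwt.decode(
    token, AUTH0_SECRET, algorithms=["HS256"],
    audience=AUTH0_CLIENT, leeway=30,    # leeway tolerates clock drift, as in get_token_info
)
assert payload["email"] == "user@example.edu"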
+ [auth_type, auth_data] = request.headers['Authorization'].strip().split(' ', 1) + if auth_type.lower() == 'bearer': + return auth_data.strip() # The spec says exactly one space, but then a token, so spaces don't matter + except Exception: + return None + return None + + +def get_jwt(request): + + # First try to obtain JWT from headers (case: some REST API requests) + token = get_jwt_from_auth_header(request) + + # If the JWT is not in the headers, get it from cookies (case: AJAX requests from portal & other clients) + if not token: + token = request.cookies.get('jwtToken') + + return token + + +@view_config(route_name='login', request_method='POST', permission=NO_PERMISSION_REQUIRED) +@debug_log +def login_view(context, request, samesite: str = "strict"): + return app_project().login(context, request, samesite=samesite) + + +def login(context, request, *, samesite: str = "strict"): + """ + Save JWT as httpOnly cookie + """ + ignored(context) + + # Allow providing token thru Authorization header as well as POST request body. + # Should be about equally secure if using HTTPS. + request_token = get_jwt_from_auth_header(request) + if request_token is None: + request_token = request.json_body.get("id_token", None) + + is_https = (request.scheme == "https") + + request.response.set_cookie( + "jwtToken", + value=request_token, + domain=request.domain, + path="/", + httponly=True, + samesite=samesite, + overwrite=True, + secure=is_https + ) + + return {"saved_cookie": True} + + +@view_config(route_name='logout', + permission=NO_PERMISSION_REQUIRED, http_cache=0) +@debug_log +def logout_view(context, request): + return app_project().logout(context, request) + + +def logout(context, request): + """ + This endpoint proxies a request to Auth0 for it to remove its session cookies. + See https://auth0.com/docs/api/authentication#enterprise-saml-and-others- + + The Auth0 endpoint is meant to be navigated to by end-user as part of SSO logout (?) + So this endpoint may not be needed at moment. Kept for reference. + + The front-end handles logging out by discarding the locally-held JWT from + browser cookies and re-requesting the current 4DN URL. + """ + ignored(context) + + # Deletes the cookie + request.response.set_cookie( + name='jwtToken', + value=None, + domain=request.domain, + max_age=0, + path='/', + overwrite=True + ) + + request.response.status_code = 401 + request.response.headers['WWW-Authenticate'] = ( + "Bearer realm=\"{}\", title=\"Session Expired\"; Basic realm=\"{}\"" + .format(request.domain, request.domain) + ) + + return {"deleted_cookie": True} + + # TODO: NEED DO THIS CLIENTSIDE SO IT UNSETS USER'S COOKIE - MUST BE THRU REDIRECT NOT AJAX + # (we don't do this - i.e. 
we don't bother to log user out of all of Auth0 session, just out of + # own web app) + + # call auth0 to logout - + # auth0_logout_url = "https://{domain}/v2/logout" \ + # .format(domain='hms-dbmi.auth0.com') + + # requests.get(auth0_logout_url) + + # if asbool(request.params.get('redirect', True)): + # raise HTTPFound(location=request.resource_path(request.root)) + + # return {} + + +@view_config(route_name='me', request_method='GET', permission=NO_PERMISSION_REQUIRED) +@debug_log +def me(context, request): + """Alias /users/""" + ignored(context) + for principal in request.effective_principals: + if principal.startswith('userid.'): + break + else: + raise HTTPForbidden(title="Not logged in.") + + namespace, userid = principal.split('.', 1) + + # return { "uuid" : userid } # Uncomment and delete below code to just grab UUID. + + request.response.status_code = 307 # Prevent from creating 301 redirects that get cached permanently by browser + properties = request.embed('/users/' + userid, as_user=userid) + return properties + + +def get_basic_properties_for_user(request, userid): + user = request.registry[COLLECTIONS]['user'][userid] + user_dict = user.__json__(request) + + # Only include certain/applicable fields from profile + include_detail_fields = ['email', 'first_name', 'last_name', 'groups', 'timezone', 'status', 'project_roles'] + user_actions = calculate_properties(user, request, category='user_action') + + properties = { + # 'user': request.embed(request.resource_path(user)), + 'details': {p: v for p, v in user_dict.items() if p in include_detail_fields}, + 'user_actions': [v for k, v in sorted(user_actions.items(), key=itemgetter(0))] + } + + # add uuid to user details + properties['details']['uuid'] = userid + + return properties + + +@view_config(route_name='session-properties', request_method='GET', + permission=NO_PERMISSION_REQUIRED) +@debug_log +def session_properties(context, request): + ignored(context) + for principal in request.effective_principals: + if principal.startswith('userid.'): + break + else: + # NOTE: returning details below allows internal remoteuser (TEST for example) to run DELETE requests + # previously in downstream portal applications, the LoginDenied error was raised, preventing such + # DELETE requests from occurring within unit testing. This can be re-enabled if desired in downstream + # applications, but for now should stay like this so we can unit test DELETEs - Will April 6 2023 + if 'group.admin' in request.effective_principals: + return { + 'details': { + 'groups': [ + 'admin' + ] + } + } + else: + raise LoginDenied(domain=request.domain) + + namespace, userid = principal.split('.', 1) + properties = get_basic_properties_for_user(request, userid) + + # if 'auth.userid' in request.session: + # properties['auth.userid'] = request.session['auth.userid'] + + return properties + + +def basic_auth_check(username, password, request): + """ This function implements the functionality that does the actual checking of the + access key against what is in the database. It is thus very important. 
Access + key expiration is implemented here - auth will fail if it has expired + """ + # We may get called before the context is found and the root set + root = request.registry[ROOT] + collection = root['access-keys'] + try: + access_key = collection[username] + except KeyError: + return None + + # Check expiration first + # Note that access keys generated a while ago will remain valid (for now) - will 6/14/21 + properties = access_key.properties + expiration_date = properties.get('expiration_date') + if expiration_date: + dt = isoparse(expiration_date) # datetime.date.fromisoformat in Python3.7 + now = datetime.datetime.utcnow() + if now > dt: + return None + + # If expiration valid, check hash + hash = properties['secret_access_key_hash'] + crypt_context = request.registry[CRYPT_CONTEXT] + valid = crypt_context.verify(password, hash) + if not valid: + return None + + return [] # success + + +@view_config(route_name='impersonate-user', request_method='POST', + validators=[no_validate_item_content_post], + permission='impersonate') +@debug_log +def impersonate_user(context, request): + """As an admin, impersonate a different user.""" + ignored(context) + + userid = request.validated['userid'] + users = request.registry[COLLECTIONS]['user'] + + try: + user = users[userid] + except KeyError: + raise ValidationFailure('body', ['userid'], 'User not found.') + + if user.properties.get('status') != 'current': + raise ValidationFailure('body', ['userid'], 'User is not enabled.') + + user_properties = get_basic_properties_for_user(request, userid) + # pop off impersonate user action if not admin + user_properties['user_actions'] = [x for x in user_properties['user_actions'] if (x['id'] and x['id'] != 'impersonate')] + # make a key + registry = request.registry + auth0_client = registry.settings.get('auth0.client') + auth0_secret = registry.settings.get('auth0.secret') + if not (auth0_client and auth0_secret): + raise HTTPForbidden(title="No keys to impersonate user") + + jwt_contents = { + 'email': userid, + 'email_verified': True, + 'aud': auth0_client, + } + + id_token = jwt.encode( + jwt_contents, + auth0_secret, + algorithm=JWT_ENCODING_ALGORITHM + ) + + is_https = request.scheme == "https" + + request.response.set_cookie( + "jwtToken", + value=id_token.decode('utf-8'), + domain=request.domain, + path="/", + httponly=True, + samesite="strict", + overwrite=True, + secure=is_https + ) + + return user_properties + + +def generate_user(): + """ Generate a random user name with 40 bits of entropy + Used to generate access_key + """ + # Take a random 5 char binary string (40 bits of + # entropy) and encode it as upper cased base32 (8 chars) + random_bytes = os.urandom(5) + user = base64.b32encode(random_bytes).decode('ascii').rstrip('=').upper() + return user + + +def generate_password(): + """ Generate a password with 80 bits of entropy + """ + # Take a random 10 char binary string (80 bits of + # entropy) and encode it as lower cased base32 (16 chars) + random_bytes = os.urandom(10) + password = base64.b32encode(random_bytes).decode('ascii').rstrip('=').lower() + return password + + +@view_config(route_name='create-unauthorized-user', request_method='POST', + permission=NO_PERMISSION_REQUIRED) +@debug_log +def create_unauthorized_user(context, request): + """ + Endpoint that creates an unauthorized user - so we can distinguish between those added by admins + and through this API. + For CGAP, an "unauthorized user" has cgap-core project association and nothing else.
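+ A minimal illustrative request body (only 'email' and 'g-recaptcha-response' are read directly here; any other fields must satisfy the User schema): {"email": "jane.doe@example.org", "first_name": "Jane", "last_name": "Doe", "g-recaptcha-response": "<token from the reCAPTCHA widget>"}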
+ Requires a reCAPTCHA response, which is propagated from the front end + registration form. This is so the endpoint cannot be abused. + TODO: propagate key, secret from GAC + + Given user properties in the request body, will validate those and also + validate the reCAPTCHA response using the reCAPTCHA server. If all checks + are successful, POST a new user and login + + Args: + context: (ignored) + request: Request object + + Returns: + dictionary User creation response from collection_add + + Raises: + LoginDenied, HTTPForbidden, or ValidationFailure + """ + ignored(context) + # env check + env_name = request.registry.settings.get('env.name') + if env_name not in AUTO_REGISTRATION_ENVS: + raise LoginDenied(f'Tried to register on {env_name}. Self-registration is only enabled on ' + f'{conjoined_list(AUTO_REGISTRATION_ENVS)}') + + recaptcha_resp = request.json.get('g-recaptcha-response') + if not recaptcha_resp: + raise LoginDenied('Did not receive response from recaptcha!') + + email = request._auth0_authenticated # equal to: jwt_info['email'].lower() + user_props = request.json + user_props_email = user_props.get("email", "").lower() + if user_props_email != email: + raise HTTPUnauthorized( + title="Provided email {} not validated with Auth0. Try logging in again.".format(user_props_email), + headers={ + 'WWW-Authenticate': + "Bearer realm=\"{}\"; Basic realm=\"{}\"".format(request.domain, request.domain)} + ) + + # set user insert props + del user_props['g-recaptcha-response'] + user_props['was_unauthorized'] = True + user_props['email'] = user_props_email # lower-cased + user_coll = request.registry[COLLECTIONS]['User'] + request.remote_user = 'EMBED' # permission = restricted_fields + + # validate the User json + validate_request(user_coll.type_info.schema, request, user_props) + if request.errors: + raise ValidationFailure('body', 'create_unauthorized_user', 'Cannot validate request') + + # validate recaptcha_resp + # https://developers.google.com/recaptcha/docs/verify + recap_url = 'https://www.google.com/recaptcha/api/siteverify' + recap_values = { + 'secret': request.registry.settings['g.recaptcha.secret'], + 'response': recaptcha_resp + } + data = urlencode(recap_values).encode() + headers = {"Content-Type": "application/x-www-form-urlencoded; charset=utf-8"} + recap_res = requests.get(recap_url, params=data, headers=headers).json() + + if recap_res['success']: + sno_res = sno_collection_add(user_coll, request, False) # POST User + if sno_res.get('status') == 'success': + return sno_res + else: + raise HTTPForbidden(title="Could not create user. Try logging in again.") + else: + # error with re-captcha + raise HTTPUnauthorized( + title="Invalid reCAPTCHA. Try logging in again.", + headers={ + 'WWW-Authenticate': + "Bearer realm=\"{}\"; Basic realm=\"{}\"".format(request.domain, request.domain)} + ) diff --git a/snovault/authorization.py b/snovault/authorization.py new file mode 100644 index 000000000..4b8505ab3 --- /dev/null +++ b/snovault/authorization.py @@ -0,0 +1,126 @@ +import json + +from pyramid.security import Authenticated +from dcicutils.misc_utils import environ_bool, PRINT +from snovault import COLLECTIONS +from snovault.project_app import app_project + + +DEBUG_PERMISSIONS = environ_bool("DEBUG_PERMISSIONS", default=False) + + +def groupfinder(login, request): + if '.' not in login: + if DEBUG_PERMISSIONS: + PRINT("groupfinder sees no '.' 
in %s, returning None" % login) + return None + namespace, localname = login.split('.', 1) + user = None + + collections = request.registry[COLLECTIONS] + + """ At least part of this stanza seems mainly for testing purposes + should the testing bits be refactored elsewhere??? + 20-09-08 changed permission model requires import of Authenticated + is that kosher + """ + # TODO (C4-332): Consolidate permissions all in one perms.py file once this all stabilizes. + if namespace == 'remoteuser': + + # These names are used in testing or special service situations to force the permissions result + # to known values without any need to go through lookup of any particular user and process + # their groups or project_roles. + + synthetic_result = None + + if localname in ['EMBED', 'INDEXER']: + synthetic_result = [] + elif localname in ['TEST', 'IMPORT', 'UPGRADE', 'INGESTION']: + synthetic_result = ['group.admin'] + elif localname in ['TEST_SUBMITTER']: + synthetic_result = ['group.submitter'] + elif localname in ['TEST_AUTHENTICATED']: + synthetic_result = [Authenticated] + + if synthetic_result is not None: + if DEBUG_PERMISSIONS: + PRINT("groupfinder for", login, "returning synthetic result:", synthetic_result) + return synthetic_result + + # Note that the above 'if' has no final 'else', and the remainder of cases, + # having the form remoteuser., are processed in the next 'if' below. + + if namespace in ('mailto', 'remoteuser', 'auth0'): + users = collections.by_item_type['user'] + try: + user = users[localname] + if DEBUG_PERMISSIONS: + PRINT("groupfinder for", login, "found user", localname) + except KeyError: + if DEBUG_PERMISSIONS: + PRINT("groupfinder for", login, "failed to find user", localname) + return None + + elif namespace == 'accesskey': + + access_keys = collections.by_item_type['access_key'] + try: + access_key = access_keys[localname] + if DEBUG_PERMISSIONS: + PRINT("groupfinder for", login, "found access key", localname) + except KeyError: + if DEBUG_PERMISSIONS: + PRINT("groupfinder for", login, "failed to find access key", localname) + return None + + access_key_status = access_key.properties.get('status') + if access_key_status in ('deleted', 'revoked'): + if DEBUG_PERMISSIONS: + PRINT("groupfinder for", login, "found", access_key_status, "access key", localname) + return None + + userid = access_key.properties['user'] + user = collections.by_item_type['user'][userid] + + if DEBUG_PERMISSIONS: + PRINT("groupfinder for", login, "decoded access key", localname, "as user", user) + + if user is None: + PRINT("groupfinder for", login, "returning None because user is None") + return None + + user_properties = user.properties + + if user_properties.get('status') in ('deleted'): + if DEBUG_PERMISSIONS: + PRINT("groupfinder for %s found user %s, but that user has status deleted." 
% (login, user)) + return None + + return app_project().authorization_create_principals(login, user, collections) + + +def _create_principals(login, user, collections): + user_properties = user.properties + principals = ['userid.%s' % user.uuid] + if DEBUG_PERMISSIONS: + PRINT("groupfinder starting with principals", principals) + + def add_principal(principal): + if DEBUG_PERMISSIONS: + PRINT("groupfinder for", login, "adding", principal , "to principals.") + principals.append(principal) + + for group in user_properties.get('groups', []): + add_principal('group.%s' % group) + + if DEBUG_PERMISSIONS: + PRINT("groupfinder for", login, "returning principals", json.dumps(principals, indent=2)) + + return principals + + +def is_admin_request(request): + """ Checks for 'group.admin' in effective_principals on request - if present we know this + request was submitted by an admin + """ + return 'group.admin' in request.effective_principals diff --git a/snovault/commands/clear_db_es_contents.py b/snovault/commands/clear_db_es_contents.py new file mode 100644 index 000000000..7bc947d46 --- /dev/null +++ b/snovault/commands/clear_db_es_contents.py @@ -0,0 +1,170 @@ +import argparse +import logging +import structlog +import transaction + +from dcicutils.env_utils import is_stg_or_prd_env +from dcicutils.lang_utils import disjoined_list +from pyramid.paster import get_app +from snovault import DBSESSION +from sqlalchemy import text as psql_text +from ..storage import Base +from ..elasticsearch.create_mapping import run as run_create_mapping +from sqlalchemy import MetaData +from typing import Optional, List +from zope.sqlalchemy import mark_changed +from .. import configure_dbsession + + +log = structlog.getLogger(__name__) + + +EPILOG = __doc__ + + +def clear_db_tables(app): + """ + Given a pyramids app that has a configured DB session, will clear the + contents of all DB tables + + Args: + app: Pyramid application + + Returns: + bool: True if successful, False if error encountered + """ + success = False + session = app.registry[DBSESSION] + meta = MetaData(bind=session.connection()) + meta.reflect() + connection = session.connection().connect() + try: + # truncate tables by only deleting contents (sqlalchemy 1.4+ compliant) + table_names = ','.join(table.name for table in reversed(Base.metadata.sorted_tables)) + connection.execute(psql_text('SET statement_timeout = 300000;')) # give 5 mins for DB clear + connection.execute(psql_text(f'TRUNCATE {table_names} RESTART IDENTITY;')) + except Exception as e: + log.error(f"clear_db_es_contents: Error on DB drop_all/create_all. {type(e)}: {e}") + transaction.abort() + else: + # commit all changes to DB + session.flush() + mark_changed(session()) + transaction.commit() + success = True + return success + + +SKIPPING_CLEAR_ATTEMPT = 'Skipping the attempt to clear DB.' + + +def run_clear_db_es(app, only_envs: Optional[List[str]] = None, skip_es: bool = False, + allow_prod: bool = False) -> bool: + """ + This function actually clears DB/ES. Takes a Pyramid app as well as two flags. _Use with care!_ + + For safety, this function will return without side-effect if ... + - The current environment is any production system. + - The current environment is not a member of the `only_envs` argument (list). + + If `arg_skip_es` (default False) is True, this function will return after DB clear + and before running create_mapping. 
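+ For example (illustrative env name): run_clear_db_es(app, only_envs=['my-test-env']) will log an error and return False unless the app's env.name is 'my-test-env'.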
+ + Args: + app: Pyramid application + only_envs (list): a list of env names that are the only envs where this action will run + skip_es (bool): if True, do not run create_mapping after DB clear + allow_prod (bool): if True, allows running on envs that are set to the staging or prod + env in the GLOBAL_ENV_BUCKET (main.ecosystem) + + Returns: + bool: True if DB was cleared (regardless of ES) + """ + current_env = app.registry.settings.get('env.name', 'local') + + if is_stg_or_prd_env(current_env) and not allow_prod: + log.error(f"clear_db_es_contents: This action cannot be performed on env {current_env}" + f" because it is a production-class (stg or prd) environment." + f" {SKIPPING_CLEAR_ATTEMPT}") + return False + + if only_envs and current_env not in only_envs: + log.error(f"clear_db_es_contents: The current environment, {current_env}, is not {disjoined_list(only_envs)}." + f" {SKIPPING_CLEAR_ATTEMPT}") + return False + + log.info('clear_db_es_contents: Clearing DB tables...') + db_success = clear_db_tables(app) + if not db_success: + log.error("clear_db_es_contents: Clearing DB tables failed!" + " Such failures may happen, for example, when there are external DB connections." + " You might want to try running clear_db_es_contents again.") + return False + log.info("clear_db_es_contents: Successfully cleared DB.") + + # create mapping after clear DB to remove ES contents + if not skip_es: + log.info("clear_db_es_contents: Clearing ES with create_mapping...") + run_create_mapping(app, purge_queue=True) + log.info("clear_db_es_contents: Successfully cleared ES.") + + log.info("clear_db_es_contents: All done.") + return True + + +def main(simulated_args=None): + logging.basicConfig() + # Loading app will have configured from config file. Reconfigure here: + logging.getLogger('encoded').setLevel(logging.DEBUG) + + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. + description='Clear DB and ES Contents', epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument('config_uri', help='path to configfile') + parser.add_argument('--app-name', help='Pyramid app name in configfile') + parser.add_argument('--only-if-env', '--only-if-envs', dest='only_envs', default=None, + help=("A comma-separated list of envs where this action is allowed to run." 
+ " If omitted, any env is OK to run.")) + parser.add_argument("--confirm", action="store_true", dest="confirm", default=None, + help="Specify --confirm to require interactive confirmation.") + parser.add_argument("--no-confirm", action="store_false", dest="confirm", default=None, + help="Specify --no-confirm to suppress interactive confirmation.") + parser.add_argument('--skip-es', action='store_true', default=False, + help='If set, do not run create_mapping after DB drop') + parser.add_argument('--allow-prod', action='store_true', default=False, + help='DANGER: If set, will allow running this command on an env that is staging or prod') + args = parser.parse_args(simulated_args) + + confirm = args.confirm + app_name = args.app_name + config_uri = args.config_uri + only_envs = args.only_envs + skip_es = args.skip_es + allow_prod = args.allow_prod + + if confirm is None: + confirm = not only_envs # If only_envs is supplied, we have better protection so don't need to confirm + + # get the pyramids app + app = get_app(config_uri, app_name) + + # create db schema + configure_dbsession(app) + + only_envs = [x for x in (only_envs or "").split(',') if x] + + if confirm: + env_to_confirm = app.registry.settings.get('env.name', 'local') + env_confirmation = input(f'This will completely clear DB contents for environment {env_to_confirm}.\n' + f' Type the env name to confirm: ') + if env_confirmation != env_to_confirm: + print(f"NOT confirmed. {SKIPPING_CLEAR_ATTEMPT}") + return + + # actually run. split this out for easy testing + run_clear_db_es(app=app, only_envs=only_envs, skip_es=skip_es, allow_prod=allow_prod) + + +if __name__ == '__main__': + main() diff --git a/snovault/commands/create_mapping_on_deploy.py b/snovault/commands/create_mapping_on_deploy.py new file mode 100644 index 000000000..8d43f7d48 --- /dev/null +++ b/snovault/commands/create_mapping_on_deploy.py @@ -0,0 +1,104 @@ +import argparse +import structlog +import logging + +from pyramid.paster import get_app +from snovault.elasticsearch.create_mapping import run as run_create_mapping +from snovault.elasticsearch.create_mapping import reindex_by_type_staggered +from dcicutils.log_utils import set_logging +from dcicutils.env_utils import is_stg_or_prd_env, is_test_env + +# override this order in the downstream portal +from ..loadxl import ( + ORDER as ITEM_INDEX_ORDER +) + +log = structlog.getLogger(__name__) +EPILOG = __doc__ + + +def get_my_env(app): + """ + Gets the env name of the currently running environment + + :param app: handle to Pyramid app + :return: current env + """ + # Return value is presumably one of the above-declared environments + return app.registry.settings.get('env.name') + + +def get_deployment_config(app): + """ + Gets deployment configuration for the current environment. + + Sets ENV_NAME and WIPE_ES as side-effects. + + :param app: handle to Pyramid app + :return: dict of config options + """ + deploy_cfg = {} + my_env = get_my_env(app) + deploy_cfg['ENV_NAME'] = my_env + if is_stg_or_prd_env(my_env): + log.info('This looks like our production environment -- not wiping ES') + deploy_cfg['WIPE_ES'] = False + elif is_test_env(my_env): + log.info('This looks like a test environment -- wiping ES') + deploy_cfg['WIPE_ES'] = True + else: + log.info('This environment is not recognized -- not wiping ES') + deploy_cfg['WIPE_ES'] = False + return deploy_cfg + + +def _run_create_mapping(app, args): + """ + Runs create_mapping with deploy options and report errors. 
Allows args passed from argparse in main to override + the default deployment configuration + + :param app: pyramid application handle + :param args: args from argparse + :return: None + """ + try: + deploy_cfg = get_deployment_config(app) + log.info('Running create mapping on env: %s' % deploy_cfg['ENV_NAME']) + if args.wipe_es: # override deploy_cfg WIPE_ES option + log.info('Overriding deploy_cfg and wiping ES') + deploy_cfg['WIPE_ES'] = True + run_create_mapping(app, check_first=(not deploy_cfg['WIPE_ES']), purge_queue=args.clear_queue, + item_order=ITEM_INDEX_ORDER) + except Exception as e: + log.error("Exception encountered while gathering deployment information or running create_mapping") + log.error(str(e)) + exit(1) + + +def main(): + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is invalid + description="Create Elasticsearch mapping on deployment", epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument('config_uri', help="path to configfile") + parser.add_argument('--app-name', help="Pyramid app name in configfile") + parser.add_argument('--wipe-es', help="Specify to wipe ES", action='store_true', default=False) + parser.add_argument('--clear-queue', help="Specify to clear the SQS queue", action='store_true', default=False) + parser.add_argument('--staggered', default=False, action='store_true', + help='Pass to trigger staggered reindexing, a new mode that will go type-by-type') + + args = parser.parse_args() + app = get_app(args.config_uri, args.app_name) + # Loading app will have configured from config file. Reconfigure here: + set_logging(in_prod=app.registry.settings.get('production'), log_name=__name__, level=logging.DEBUG) + # set_logging(app.registry.settings.get('elasticsearch.server'), app.registry.settings.get('production'), + # level=logging.DEBUG) + if args.staggered: + reindex_by_type_staggered(app) # note that data flow from args is dropped, only 1 mode for staggered + else: + _run_create_mapping(app, args) + exit(0) + + +if __name__ == '__main__': + main() diff --git a/snovault/commands/list_db_tables.py b/snovault/commands/list_db_tables.py new file mode 100644 index 000000000..554b77305 --- /dev/null +++ b/snovault/commands/list_db_tables.py @@ -0,0 +1,150 @@ +import argparse +import logging +import structlog +# import transaction + +from dcicutils.env_utils import is_stg_or_prd_env +from dcicutils.lang_utils import disjoined_list +from dcicutils.misc_utils import PRINT, get_error_message +from pyramid.paster import get_app +# from snovault import DBSESSION +# from snovault.storage import Base +# from snovault.elasticsearch.create_mapping import run as run_create_mapping +# from sqlalchemy import MetaData +from typing import Optional, List +# from zope.sqlalchemy import mark_changed +from .. 
import configure_dbsession +from ..sqlalchemy_tools import PyramidAppManager +from ..project_app import app_project + + +logger = structlog.getLogger(__name__) + + +EPILOG = __doc__ + + +def list_db_tables(app): + """ + Given a pyramids app that has a configured DB session, will list the contents of all DB tables + + Args: + app: Pyramid application + + Returns: + bool: True if successful, False if error encountered + """ + + app_manager = PyramidAppManager(app) + + with app_manager.connection() as connection: + for table_name in app_manager.ordered_table_names: + n = connection.execute(f"SELECT COUNT(*) FROM {table_name};").one() + print(f" Table {table_name}: {n}") + + +SKIPPING_LIST_ATTEMPT = 'Skipping the attempt to list DB.' + + +def run_list_db_tables(app, only_envs: Optional[List[str]] = None, skip_es: bool = False, + allow_prod: bool = False) -> bool: + """ + This function lists information from the DB. + + For safety, this function will return without side-effect if ... + - The current environment is any production system (and allow_prod is not given). + - The current environment is not a member of the `only_envs` argument (list). + + Args: + app: Pyramid application + only_envs (list): a list of env names that are the only envs where this action will run + allow_prod (bool): if True, allows running on envs that are set to the staging or prod + env in the GLOBAL_ENV_BUCKET (main.ecosystem) + + Returns: + bool: True if DB was listed. + """ + current_env = app.registry.settings.get('env.name', 'local') + + if is_stg_or_prd_env(current_env) and not allow_prod: + logger.error(f"list-db-tables: This action cannot be performed on env {current_env}" + f" because it is a production-class (stg or prd) environment." + f" {SKIPPING_LIST_ATTEMPT}") + return False + + if only_envs and current_env not in only_envs: + logger.error(f"list-db-tables: The current environment, {current_env}, is not {disjoined_list(only_envs)}." + f" {SKIPPING_LIST_ATTEMPT}") + return False + + logger.info('list-db-tables: Listing DB tables...') + try: + list_db_tables(app) + except Exception as e: + logger.info(f"list-db-tables failed. {get_error_message(e)}") + return False + logger.info("list-db-tables succeeded.") + return True + + +def main(simulated_args=None): + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. + description='List DB Contents', epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument('--config_uri', help='path to configfile', default='development.ini') + parser.add_argument('--app-name', help='Pyramid app name in configfile', default='app') + parser.add_argument('--only-if-env', '--only-if-envs', dest='only_envs', default=None, + help=("A comma-separated list of envs where this action is allowed to run." + " If omitted, any env is OK to run.")) + parser.add_argument("--confirm", action="store_true", dest="confirm", default=None, + help="Specify --confirm to require interactive confirmation.") + parser.add_argument("--no-confirm", action="store_false", dest="confirm", default=None, + help="Specify --no-confirm to suppress interactive confirmation.") + parser.add_argument('--allow-prod', action='store_true', default=False, + help='DANGER: If set, will allow running this command on an env that is staging or prod') + parser.add_argument('--log', action='store_true', default=False, + help='Set loglevel to DEBUG. 
Otherwise it will be ERROR.') + args = parser.parse_args(simulated_args) + + confirm = args.confirm + app_name = args.app_name + config_uri = args.config_uri + only_envs = args.only_envs + allow_prod = args.allow_prod + log = args.log + + logging.basicConfig() + #project = app_project(initialize=True) + project = app_project() + # Loading app will have configured from config file. Reconfigure here: + if log: + logging.getLogger(project.NAME).setLevel(logging.DEBUG) + else: + logging.getLogger(project.NAME).setLevel(logging.ERROR) + + if confirm is None: + confirm = False # not only_envs # If only_envs is supplied, we have better protection so don't need to confirm + + # get the pyramids app + app = get_app(config_uri, app_name) + + # create db schema + configure_dbsession(app) + + only_envs = [x for x in (only_envs or "").split(',') if x] + + if confirm: + env_to_confirm = app.registry.settings.get('env.name', 'local') + env_confirmation = input(f'This will list DB contents for environment {env_to_confirm}.\n' + f' Type the env name to confirm: ') + if env_confirmation != env_to_confirm: + PRINT(f"NOT confirmed. {SKIPPING_LIST_ATTEMPT}") + return + + # actually run. split this out for easy testing + run_list_db_tables(app=app, only_envs=only_envs, allow_prod=allow_prod) + + +if __name__ == '__main__': + main() diff --git a/snovault/commands/load_access_keys.py b/snovault/commands/load_access_keys.py new file mode 100644 index 000000000..9d39124e9 --- /dev/null +++ b/snovault/commands/load_access_keys.py @@ -0,0 +1,156 @@ +import argparse +import logging +import structlog +import json +import os +import boto3 +from pyramid.paster import get_app +from webtest import AppError +from dcicutils.misc_utils import TestApp +from dcicutils.beanstalk_utils import get_beanstalk_real_url +from dcicutils.cloudformation_utils import get_ecs_real_url +from dcicutils.secrets_utils import assume_identity + +log = structlog.getLogger(__name__) +EPILOG = __doc__ + + +# This should be imported/overridden by downstream application +ADMIN_EMAIL = 'snovault.platform@gmail.com' + + +def get_existing_key_ids(testapp, user_uuid, key_desc): + """ + Search for an access key with given description and user uuid. + If successful return list of @id values of all found access keys. + This is expected to fail if the current server is not up. + Logs information on errors. + + Args: + testapp (webtest.TestApp): current TestApp + user_uuid (str): uuid of the user used to generate the key + key_desc (str): description of the access key to find + + Return: + list: of str access keys ids + """ + try: + search_res = testapp.get('/search/?type=AccessKey&description=%s&user.uuid=%s' + % (key_desc, user_uuid)).json + except Exception as exc: + log.error('load_access_keys: search failed for access key with desc' + ' %s. Exception: %s' % (key_desc, exc)) + return [] + if len(search_res['@graph']) > 1: + log.warning('load_access_keys: %s access keys found with ' + 'description %s and user.uuid %s.' + % (len(search_res['@graph']), key_desc, user_uuid)) + return [res['@id'] for res in search_res['@graph']] + + +def generate_access_key(testapp, env, user_uuid, description): + """ + Generate an access for given user on given environment. 
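+ For example (illustrative values), the returned dict looks like: {'key': 'ABCDEFGH', 'secret': '<secret access key>', 'server': 'https://my-env.example.org'}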
+ + Args: + testapp (webtest.TestApp): current TestApp + env (str): application environment used to find server + user_uuid (str): uuid of the user to gener + description (str): description to add to access key + + Returns: + dict: access key contents with server + """ + try: + server = get_ecs_real_url(env) # try to grab from Cfn, if we are ECS env + except Exception: + server = get_beanstalk_real_url(env) + if not server: + server = get_beanstalk_real_url(env) + access_key_req = {'user': user_uuid, 'description': description} + res = testapp.post_json('/access_key', access_key_req).json + return {'secret': res['secret_access_key'], + 'key': res['access_key_id'], + 'server': server} + + +def main(): + """ + Function to create and load access keys for multiple users to the system s3 + bucket. The description of the key is set to the s3 object name. Before + creating the keys, will attempt to delete pre-existing keys with the given + descriptions. This command is not meant to be run locally. + + Provide required `config_uri` and `--app-name` to the command. + Example usage: + `bin/load_access_keys production.ini --app-name app` + """ + logging.basicConfig() + # Loading app will have configured from config file. Reconfigure here: + logging.getLogger('encoded').setLevel(logging.INFO) + + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. + description="Load Access Keys", epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument('config_uri', help='path to configfile') + parser.add_argument('--app-name', help='Pyramid app name in configfile') + parser.add_argument('--secret-name', help='name of application identity stored in secrets manager within which' + 'to locate S3_ENCRYPT_KEY, for example: dev/beanstalk/cgap-dev') + args = parser.parse_args() + + app = get_app(args.config_uri, args.app_name) + environ = { + 'HTTP_ACCEPT': 'application/json', + 'REMOTE_USER': 'TEST', + } + testapp = TestApp(app, environ) + + env = app.registry.settings.get('env.name') + if not env: + raise RuntimeError('load_access_keys: cannot find env.name in settings') + + # Resolve secret from environment if one is not specified + encrypt_key = None + if args.secret_name is not None: + identity = assume_identity() # automatically detects GLOBAL_APPLICATION_CONFIGURATION + encrypt_key = identity.get('S3_ENCRYPT_KEY', None) # one of the secrets + if not encrypt_key: + encrypt_key = os.environ.get('S3_ENCRYPT_KEY') + + if not encrypt_key: + raise RuntimeError('load_access_keys: must define S3_ENCRYPT_KEY in env or in GAC') + + # will need to use a dynamic region at some point (not just here) + s3 = boto3.client('s3', region_name='us-east-1') + s3_bucket = app.registry.settings['system_bucket'] + + # we generate keys for the following accounts w/ corresponding descriptions + to_generate = [(ADMIN_EMAIL, 'access_key_admin'), + ('tibanna.app@gmail.com', 'access_key_tibanna'), + ('foursight.app@gmail.com', 'access_key_foursight')] + for email, key_name in to_generate: + try: + user_props = testapp.get('/users/%s?datastore=database' % email).follow().json + except Exception as exc: + log.error('load_access_keys: could not get user %s. 
Exception: %s' % (email, exc)) + continue + + key_ids = get_existing_key_ids(testapp, user_props['uuid'], key_name) + for key_id in key_ids: + try: + testapp.patch_json(key_id, {'status': 'deleted'}) + except AppError: + log.error('load_access_keys: key_id: %s does not exist in database but exists in ES' % key_id) + + key = generate_access_key(testapp, env, user_props['uuid'], key_name) + s3.put_object(Bucket=s3_bucket, + Key=key_name, + Body=json.dumps(key), + SSECustomerKey=encrypt_key, + SSECustomerAlgorithm='AES256') + log.info('load_access_keys: successfully generated access key %s' % key_name) + + +if __name__ == "__main__": + main() diff --git a/snovault/commands/load_data.py b/snovault/commands/load_data.py new file mode 100644 index 000000000..5d3384176 --- /dev/null +++ b/snovault/commands/load_data.py @@ -0,0 +1,69 @@ +import argparse +import logging +import structlog + +from dcicutils.env_utils import permit_load_data +from dcicutils.common import APP_CGAP +from dcicutils.misc_utils import PRINT + +from pyramid.paster import get_app +from pyramid.path import DottedNameResolver +from .. import configure_dbsession + + +log = structlog.getLogger(__name__) + + +EPILOG = __doc__ + + +# should be overridden in downstream application to pass a different app +def load_data_should_proceed(env, allow_prod, app=None): + """ Returns True on whether or not load_data should proceed. + + :param env: env we are on + :param allow_prod: prod argument from argparse, defaults to False + :param app: app type, one of cgap, fourfront (enums from dcicutils.common) + :return: True if load_data should continue, False otherwise + """ + if not app: + app = APP_CGAP # this fallback is somewhat reasonable + return permit_load_data(envname=env, allow_prod=allow_prod, orchestrated_app=app) # noqa + + +def main(simulated_args=None): + logging.basicConfig() + # Loading app will have configured from config file. Reconfigure here: + logging.getLogger('encoded').setLevel(logging.DEBUG) + + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. 
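+ # Illustrative invocation (the installed script name is an assumption, not shown in this diff): load-data development.ini --app-name app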
+ description="Load Data", epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument('--app-name', help="Pyramid app name in configfile") + parser.add_argument('config_uri', help="path to configfile") + parser.add_argument('--prod', action='store_true', + help="must be set to confirm this action is intended to happen on a production server") + parser.add_argument('--overwrite', action='store_true', + help="must be set to update existing uuids with patch") + args = parser.parse_args(simulated_args) + + # get the pyramids app + app = get_app(args.config_uri, args.app_name) + + # create db schema + configure_dbsession(app) + + env = app.registry.settings.get('env.name', '') + + load_test_data = app.registry.settings.get('load_test_data') + allow_prod = args.prod + PRINT("load_data: load_test_data function is %s" % (load_test_data)) + load_test_data = DottedNameResolver().resolve(load_test_data) + + if load_data_should_proceed(env, allow_prod): + load_test_data(app, args.overwrite) + + +if __name__ == "__main__": + main() diff --git a/snovault/commands/load_data_by_type.py b/snovault/commands/load_data_by_type.py new file mode 100644 index 000000000..04ac12018 --- /dev/null +++ b/snovault/commands/load_data_by_type.py @@ -0,0 +1,63 @@ +import argparse +import logging +import structlog + +from dcicutils.env_utils import permit_load_data +from pyramid.paster import get_app +from pyramid.path import DottedNameResolver +from .. import configure_dbsession + + +log = structlog.getLogger(__name__) + + +EPILOG = __doc__ + + +def load_data_should_proceed(env, allow_prod): + """ Returns True on whether or not load_data should proceed. + + :param env: env we are on + :param allow_prod: prod argument from argparse, defaults to False + :return: True if load_data should continue, False otherwise + """ + + return permit_load_data(envname=env, allow_prod=allow_prod, orchestrated_app='cgap') # TODO figure out app + + +def main(simulated_args=None): + logging.basicConfig() + # Loading app will have configured from config file. Reconfigure here: + logging.getLogger('encoded').setLevel(logging.DEBUG) + + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. 
+ description="Load Test Data", epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument('--app-name', help="Pyramid app name in configfile") + parser.add_argument('config_uri', help="path to configfile") + parser.add_argument('--prod', action='store_true', + help="must be set to confirm this action is intended to happen on a production server") + parser.add_argument('--overwrite', action='store_true', + help="must be set to update existing uuids with patch") + parser.add_argument('--indir', help="directory to load items from") + parser.add_argument('--itype', help="item type to load") + args = parser.parse_args(simulated_args) + + # get the pyramids app + app = get_app(args.config_uri, args.app_name) + + # create db schema + configure_dbsession(app) + + env = app.registry.settings.get('env.name', '') + + allow_prod = args.prod + load_data_by_type = DottedNameResolver().resolve("encoded.loadxl:load_data_by_type") + + if load_data_should_proceed(env, allow_prod): + load_data_by_type(app, args.indir, args.overwrite, args.itype) + + +if __name__ == "__main__": + main() diff --git a/snovault/commands/prepare_template.py b/snovault/commands/prepare_template.py new file mode 100644 index 000000000..529d6caf0 --- /dev/null +++ b/snovault/commands/prepare_template.py @@ -0,0 +1,119 @@ +import argparse +import os + +import logging +from dcicutils.deployment_utils import create_file_from_template +from dcicutils.misc_utils import ignored, PRINT, classproperty + +EPILOG = __doc__ + + +class PrepOptions: + + @classproperty + def ROOT_DIR(cls): # noQA - cls is the right var + # return os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + return os.path.abspath(os.curdir) + + @classproperty + def DOCKER_COMPOSE_FILE(cls): # noQA - cls is the right var + return os.path.join(cls.ROOT_DIR, 'docker-compose.yml') + + @classproperty + def DOCKER_DEVELOPMENT_INI_FILE(cls): # noQA - cls is the right var + return os.path.join(cls.ROOT_DIR, "deploy/docker/local/docker_development.ini") + + @classproperty + def DEVELOPMENT_INI_FILE(cls): # noQA - cls is the right var + return os.path.join(cls.ROOT_DIR, "development.ini") + + @classproperty + def TEST_INI_FILE(cls): # noQA - cls is the right var + return os.path.join(cls.ROOT_DIR, "test.ini") + + +DATA_SET_CHOICES = ['prod', 'test', 'local', 'deploy'] +DEFAULT_DATA_SET = 'local' + + +def empty_assignment(line, expanded): + ignored(line) + return expanded.strip().endswith(': ""') + + +def template_creator(extra_environment_variables): + def create_from_template(file, expect_change=False): + template_file = file + ".template" + if not os.path.exists(template_file): + raise ValueError(f"The template file {template_file} does not exist.") + warning = (f"The file {file} has unexpectedly changed. You may need to make build-docker-local-clean." 
+ if not expect_change + else None) + return create_file_from_template(template_file=template_file, + to_file=file, + extra_environment_variables=extra_environment_variables, + omittable=empty_assignment, + warn_if_changed=warning) + return create_from_template + + +def prepare_docker(data_set=DEFAULT_DATA_SET, load_inserts=False, run_tests=False, s3_encrypt_key_id=""): + extra_vars = { + "DATA_SET": data_set, + "LOAD_INSERTS": "true" if load_inserts else "", + "RUN_TESTS": "true" if run_tests else "", + "S3_ENCRYPT_KEY_ID": s3_encrypt_key_id or "", + } + prepare_from_template = template_creator(extra_vars) + prepare_from_template(PrepOptions.DOCKER_COMPOSE_FILE, expect_change=True) + prepare_from_template(PrepOptions.DOCKER_DEVELOPMENT_INI_FILE) + + +def prepare_docker_main(): + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. + description="Prepare docker files", epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("--data-set", default=DEFAULT_DATA_SET, choices=DATA_SET_CHOICES, + help=f"the data set to use (default: {DEFAULT_DATA_SET})") + parser.add_argument("--load-inserts", default=False, action="store_true", + help="if supplied, causes inserts to be loaded (default: not loaded)") + parser.add_argument("--run-tests", default=False, action="store_true", + help="if supplied, causes tests to be run in container (default: not tested)") + parser.add_argument('--s3-encrypt-key-id', default=None, + help="an encrypt key id (default: None)") + args = parser.parse_args() + + logging.basicConfig() + prepare_docker(data_set=args.data_set, + load_inserts=args.load_inserts, + run_tests=args.run_tests, + s3_encrypt_key_id=args.s3_encrypt_key_id) + + +def prepare_local_dev(force=False): + extra_vars = {} + prepare_from_template = template_creator(extra_vars) + for file in [PrepOptions.TEST_INI_FILE, PrepOptions.DEVELOPMENT_INI_FILE]: + exists = os.path.exists(file) + if not exists or force: + reason = "it doesn't exist" if not exists else "--force was given to prepare-local-dev" + PRINT(f"{'Recreating' if exists else 'Creating'} {file} because {reason} ...") + prepare_from_template(file, expect_change=True) + PRINT(f"{'Recreated' if exists else 'Created'} {file}.") + else: + PRINT(f"The file {file} already exists and will not be changed.") + + +def prepare_local_dev_main(): + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. + description="Prepare files used for local development (test.ini and development.ini)", epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("--force", default=False, action="store_true", + help="forces creation of a development.ini and test.ini even if they already exist." + " By default, creation is skipped for any file that already exists.") + args = parser.parse_args() + + logging.basicConfig() + prepare_local_dev(force=args.force) diff --git a/snovault/commands/purge_item_type.py b/snovault/commands/purge_item_type.py new file mode 100644 index 000000000..b282bcb90 --- /dev/null +++ b/snovault/commands/purge_item_type.py @@ -0,0 +1,83 @@ +import sys +import argparse +import logging +import structlog +import transaction +from pyramid.paster import get_app + +from dcicutils.env_utils import is_stg_or_prd_env +from .. import STORAGE +from ..elasticsearch.indexer_utils import get_uuids_for_types +from .. 
import configure_dbsession + + +logger = structlog.getLogger(__name__) +EPILOG = __doc__ + + +def purge_item_type_from_storage(app, item_types, prod=False): + """ + Purges all items with the given item_types from our storage. This function could partially purge + an item type if an error is encountered. Note that this will work no matter what resources are backing + 'PickStorage'. + + IMPORTANT: If an error occurs the DB transaction is rolled back, but the ES deletions will persist. + Re-running 'create_mapping' on this item type will reindex the items. + + :param app: app to access settings from, either a testapp (testapp.app.registry) or regular app(.registry) + :param item_types: list of types to purge from DB + :param prod: bool whether to allow run on prod, default False + :return: True in success, False otherwise + """ + if not hasattr(app, 'registry'): + app = app.app + if not hasattr(app, 'registry'): + raise RuntimeError('Passed app to purge_item_type_from_db does not contain a registry.') + + if 'env.name' in app.registry.settings: + env = app.registry.settings['env.name'] + if is_stg_or_prd_env(env) and not prod: + logger.error('Tried to run purge_item_type_from_storage on prod without specifying' + 'the prod options - exiting.') + return False + + # purge uuids directly from PickStorage, ignoring status=deleted checks + configure_dbsession(app) + # The use of set(...) is to de-duplicate. -kmp 28-Jan-2021 + uuids_to_purge = set(get_uuids_for_types(app.registry, item_types)) + pstorage = app.registry[STORAGE] + for uuid in uuids_to_purge: + try: + pstorage.purge_uuid(uuid) + transaction.commit() + except Exception as e: # XXX: handle recoverable exceptions? + logger.error('Encountered exception purging an item type (uuid: %s) from the DB: %s' + % (uuid, e)) + transaction.abort() + return False + + return True + + +def main(): + """ Entry point for this command """ + logging.basicConfig() + + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. + description='Clear an item type out of metadata storage', + epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument('config_uri', help='path to configfile') + parser.add_argument('item_type', help='item type to run on') + parser.add_argument('--app-name', help='Pyramid app name in configfile') + parser.add_argument('--prod', help='Whether or not to proceed if we are on a production server', + action='store_true', default=False) + args = parser.parse_args() + + app = get_app(args.config_uri, args.app_name) + sys.exit(purge_item_type_from_storage(app, [args.item_type], prod=args.prod)) + + +if __name__ == '__main__': + main() diff --git a/snovault/commands/run_upgrader_on_inserts.py b/snovault/commands/run_upgrader_on_inserts.py new file mode 100644 index 000000000..f78ce9d89 --- /dev/null +++ b/snovault/commands/run_upgrader_on_inserts.py @@ -0,0 +1,42 @@ +import argparse +import logging +import json +from pkg_resources import resource_filename + +logger = logging.getLogger(__name__) +EPILOG = __doc__ + + +def get_inserts(inserts_folder_name='inserts', inserts_file_name='workflow'): + folder_name = resource_filename('encoded', 'tests/data/' + inserts_folder_name + '/') + f_name = folder_name + inserts_file_name + '.json' + with open(f_name) as f: + items = json.loads(f.read()) + for insert_item in items: + yield insert_item + + +def main(): + logging.basicConfig() + # Loading app will have configured from config file. 
Reconfigure here: + logging.getLogger('encoded').setLevel(logging.DEBUG) + + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. + description="Run inserts through an upgrader", epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument('--inserts-folder', help="Folder to use to get the file of inserts from. E.g. 'master-inserts' or 'inserts'. Defaults to 'inserts'.") + parser.add_argument('item_type', help="Type of item or filename of inserts, in lowercase/schema-filename form, e.g. 'page', 'static_section'.") + parser.add_argument('upgrader_method_name', help="Name of upgrader method to use as it is defined in upgrade/ folder, e.g. 'workflow_3_4'.") + args = parser.parse_args() + upgrader_module = __import__('encoded.upgrade.' + args.item_type, fromlist=['']) + upgrader_fxn = getattr(upgrader_module, args.upgrader_method_name) + results = [] + for item in get_inserts(args.inserts_folder or 'inserts', args.item_type): + upgrader_fxn(item, None) # Modifies in place + results.append(item) + print(json.dumps(results, indent=4, sort_keys=True)) # Return instead of print? + + +if __name__ == "__main__": + main() diff --git a/snovault/commands/update_inserts_from_server.py b/snovault/commands/update_inserts_from_server.py new file mode 100644 index 000000000..f7d0ab10e --- /dev/null +++ b/snovault/commands/update_inserts_from_server.py @@ -0,0 +1,167 @@ +import structlog +import logging +import argparse +import json +from os import walk +# use ff_utils to find inserts and write data +from dcicutils.ff_utils import search_metadata, expand_es_metadata, dump_results_to_json +# use this function to read inserts +from .run_upgrader_on_inserts import get_inserts + +logger = structlog.getLogger(__name__) +EPILOG = __doc__ + + +def read_local_inserts_dir(dir_name, path, target_types=[]): + """ + Given path string path, read local inserts directory and return a + dictionary of all inserts keyed by item type, as well as a list of all + found uuids + + Args: + dir_name (str): string name of the inserts directory + path (str): string path to the inserts directory + target_types (list): list of item types to load. Empty means all types + + Returns: + dict of inserts, list of item uuids + """ + item_types = [] + item_uuids = [] + local_inserts = {} + # find item types that are represented in the given inserts path + for (dirpath, dirnames, filenames) in walk(path): + item_types = [it[:-5] for it in filenames if it.endswith('.json')] + if target_types: + bad_item_types = [it for it in target_types if it not in item_types] + if bad_item_types: + raise Exception('update_inserts: Specified item type(s) %s are not found in ' + 'the inserts dir. Found: %s' % (bad_item_types, item_types)) + # update item_types if user specified specific ones + fetch_item_types = target_types if target_types else item_types + # load current insert contents from json file + for item_type in item_types: + local_inserts[item_type] = {} # key these by uuid for now + for it_item in get_inserts(dir_name, item_type): + # only fetch items for specified fetch_item_types + if item_type in fetch_item_types: + item_uuids.append(it_item['uuid']) + local_inserts[item_type][it_item['uuid']] = it_item + return local_inserts, item_uuids + + +def main(): + """ + Use this command to update the inserts from a given fourfront env + """ + logging.basicConfig() + # Loading app will have configured from config file. 
Reconfigure here: + logging.getLogger('encoded').setLevel(logging.DEBUG) + + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. + description="Update Inserts", epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument('--env', default='data', + help='FF environment to update from. Defaults to data') + parser.add_argument('--dest', default='temp-local-inserts', + help="destination file in inserts dir to write to") + parser.add_argument('--item-type', action='append', default=[], + help="item type, e.g. file_fastq. Defaults to all types") + parser.add_argument('--ignore-field', action='append', default=["submitted_by", "date_created", "last_modified", "schema_version"], + help='field name to ignore when running expand_es_metadata') + parser.add_argument('--from-search', help='query passed to search_metadata to find uuids') + + args = parser.parse_args() + # this will work since bin/ commands are run from root FF directory + inserts_location = 'src/encoded/tests/data' + # hardcode these to prevent accidental creation of inserts files + inserts_files = ['inserts', 'master-inserts', 'perf-testing', 'workbook-inserts', + 'temp-local-inserts', 'deploy-inserts'] + if args.dest not in inserts_files: + raise Exception('Specified inserts destination %s must be one of: %s' + % (args.dest, inserts_files)) + inserts_path = '/'.join([inserts_location, args.dest]) + + local_inserts, item_uuids = read_local_inserts_dir(args.dest, inserts_path, args.item_type) + + # Used to preserve order of existing inserts in folder(s), if any. + local_inserts_ordering_map = {} + for item_type, local_inserts_for_type in local_inserts.items(): + for insrt_index, insrt_uuid in enumerate(local_inserts_for_type): + # Duplicate insrt_indx between different item types are OK and present. + # local_inserts_ordering_map is shallow. + local_inserts_ordering_map[insrt_uuid] = insrt_index + + # add uuids from the input search result, if present + if args.from_search: + use_search = args.from_search + # get frame=object search results to keep response small + if 'frame=' not in use_search: + use_search += '&frame=object' + search_res = search_metadata(use_search, ff_env=args.env) + search_uuids = [item['uuid'] for item in search_res] + logger.info('update_inserts: Will update using %s items from search' % len(search_uuids)) + item_uuids = list(set(item_uuids + search_uuids)) + + # now find uuids and all linked from the given server + svr_inserts, svr_uuids = expand_es_metadata(item_uuids, ff_env=args.env, + store_frame='raw', add_pc_wfr=True, + ignore_field=args.ignore_field) + + # if we are updating `inserts`, must make sure that items don't conflict + # with those in `master-inserts` + skip_uuids = set() + if args.dest == 'inserts': + master_path = '/'.join([inserts_location, 'master-inserts']) + master_inserts, master_uuids = read_local_inserts_dir('master-inserts', master_path) + item_conflict_report = {} + for item_type in svr_inserts: + itype_err = [] + itype_okay = [] + conflicting_items = [item for item in svr_inserts[item_type] if item['uuid'] in master_uuids] + for conflict in conflicting_items: + # compare inserts by loading json objects + svr_json = json.dumps(conflict, sort_keys=True) + mstr_json = json.dumps(master_inserts[item_type][conflict['uuid']], sort_keys=True) + if svr_json != mstr_json: + itype_err.append(conflict['uuid']) + else: + # the json is the same. 
Remove from the `inserts` update + skip_uuids.add(conflict['uuid']) + itype_okay.append(conflict['uuid']) + item_conflict_report[item_type] = {'error': itype_err, 'okay': itype_okay} + if any([it for it in item_conflict_report if item_conflict_report[it]['error']]): + error_report = {it: item_conflict_report[it]['error'] for it in item_conflict_report} + logger.error('update_inserts: Cannot update the following items in "inserts" directory,' + ' since there are conflicting items with different values' + 'in the master-inserts. Update those first. Conflicts:\n%s' % json.dumps(error_report, indent=4)) + raise Exception('Cannot load inserts as there are conflicting items in `master-inserts`') + elif any([it for it in item_conflict_report if item_conflict_report[it]['okay']]): + conflict_report = {it: item_conflict_report[it]['okay'] for it in item_conflict_report} + logger.warning('update_inserts: The following items are already in "master-inserts".' + ' Will not add to "inserts". Items:\n%s' % json.dumps(conflict_report, indent=4)) + + # now we need to update the server inserts with contents from local inserts + # so that existing information is not lost + for item_type in svr_inserts: + if skip_uuids: + # remove items specified by skip uuids + svr_inserts[item_type] = [ + insrt for insrt in svr_inserts[item_type] + if insrt['uuid'] not in skip_uuids + ] + svr_inserts[item_type].sort(key=lambda insrt: local_inserts_ordering_map.get(insrt["uuid"], 99999) ) + for item_uuid in local_inserts.get(item_type, {}): + if item_uuid not in svr_uuids and item_uuid not in skip_uuids: + svr_inserts[item_type].append(local_inserts[item_type][item_uuid]) + + dump_results_to_json(svr_inserts, inserts_path) + logger.info('update_inserts: Successfully wrote to %s' % inserts_path) + for item_type in svr_inserts: + logger.info('update_inserts: Wrote %s items to %s' % + (len(svr_inserts[item_type]), item_type + '.json')) + + +if __name__ == "__main__": + main() diff --git a/snovault/crud_views.py b/snovault/crud_views.py index f4e880a72..819ed83a8 100644 --- a/snovault/crud_views.py +++ b/snovault/crud_views.py @@ -267,8 +267,8 @@ def get_linking_items(context, request, render=None): result = { 'status': 'success', '@type': ['result'], - 'display_title': 'Links to %s' % item_uuid, - 'notification' : '%s has %s items linking to it. This may include rev_links if status != deleted' % (item_uuid, len(links)), + 'display_title': f'Links to {item_uuid}', + 'notification': f'{item_uuid} has {len(links)} items linking to it. This may include rev_links if status != deleted', 'uuids_linking_to': links } return result @@ -289,10 +289,9 @@ def item_delete_full(context, request, render=None): if hasattr(request, 'user_info'): user_details = request.user_info.get('details', {}) else: - if 'group.admin' in request.effective_principals: - user_details = {'groups': 'admin'} # you can do it - else: - user_details = {} # you cannot + # used to check for admin here, now done in user_info above + # note that hasattr() results in a function call when referring to an @property + user_details = {} if 'admin' not in user_details.get('groups', []): msg = u'Must be admin to fully delete items.' 
raise ValidationFailure('body', ['userid'], msg) @@ -307,7 +306,7 @@ def item_delete_full(context, request, render=None): return { 'status': 'success', '@type': ['result'], - 'notification' : 'Permanently deleted ' + uuid, + 'notification': f'Permanently deleted {uuid}', '@graph': [uuid] } else: @@ -316,14 +315,14 @@ def item_delete_full(context, request, render=None): return { 'status': 'success', '@type': ['result'], - 'notification' : 'Set status of ' + uuid + ' to deleted', - '@graph': [ render_item(request, context, render) ] + 'notification': f'Set status of {uuid} to deleted', + '@graph': [render_item(request, context, render)] } return { 'status': 'failure', '@type': ['result'], - 'notification' : 'Deletion failed', + 'notification': 'Deletion failed', '@graph': [uuid] } diff --git a/snovault/custom_embed.py b/snovault/custom_embed.py new file mode 100644 index 000000000..c8c01adfa --- /dev/null +++ b/snovault/custom_embed.py @@ -0,0 +1,394 @@ +import re +from uuid import UUID + +from dcicutils.misc_utils import ignored +from pyramid.httpexceptions import HTTPBadRequest, HTTPForbidden +from pyramid.security import Authenticated +from pyramid.traversal import find_resource +from pyramid.view import view_config +from .util import debug_log + +ATID_PATTERN = re.compile("/[a-zA-Z-]+/[a-zA-Z0-9-_:]+/") +GENELIST_ATID = re.compile("/gene-lists/[a-zA-Z0-9-]+/") # TODO: refactor +MINIMAL_EMBEDS = ["projects", "institutions", "users"] # TODO: refactor for setting in downstream repo +MINIMAL_EMBED_ATID = re.compile("/(" + "|".join(MINIMAL_EMBEDS) + ")/[a-zA-Z0-9-_:]+/") +KEYS_TO_IGNORE = [ + "@id", + "@type", + "principals_allowed", + "uuid", + "status", + "title", + "display_title", + "schema_version", + "date_created", + "actions", +] +FORBIDDEN_MSG = {"error": "no view permissions"} +DATABASE_ITEM_KEY = "@type" # Key specific to JSON objects that are CGAP items + + +def includeme(config): + config.add_route("embed", "/embed") + config.scan(__name__) + + +class CustomEmbed: + """ + Class to handle custom embedding for /embed API. + """ + + def __init__(self, request, item, embed_props): + self.request = request + self.ignored_embeds = embed_props.get("ignored_embeds", []) + self.desired_embeds = embed_props.get("desired_embeds", []) + self.embed_depth = embed_props.get("embed_depth", 4) + self.requested_fields = embed_props.get("requested_fields", []) + + self.cache = {} + self.invalid_ids = [] + if self.requested_fields: + self.nested_fields = self.fields_to_nested_dict() + item = self.user_embed(item, initial_item=True) + self.result = self.field_embed(item, self.nested_fields, initial_item=True) + else: + depth = -1 + self.result = self.embed(item, depth) + + def add_actions(self, item): + """ + Add the "actions" field to an item according to the request's + permissions, formatted identically to the calc props on items + for a GET page view. 
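+
+ For illustration only (the item and its path are hypothetical), an admin
+ with edit permission on an item at /labs/example-lab/ would get back an
+ entry like:
+ {"name": "edit", "title": "Edit", "profile": "/profiles/Lab.json",
+ "href": "/labs/example-lab/?currentAction=edit"}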
+ + :param item: dict item in object view + :return item: dict item in object view with "actions" + """ + actions = [] + root_resource = self.request.root + item_path = item["@id"] + item_resource = find_resource(root_resource, item_path) + for action in ["edit", "create"]: + if self.request.has_permission(action, item_resource): + actions.append({ + "name": action, + "title": action.capitalize(), + "profile": "/profiles/%s.json" % item_resource.type_info.name, + "href": "%s?currentAction=%s" % ( + self.request.resource_path(item_resource), action + ) + }) + item["actions"] = actions + return item + + def user_embed(self, item_id, initial_item=False): + """ + Use request's embed method to find given item in the database. + + If the user who made the call to the API does not have + permissions to view the item, the item will not be embedded. + Instead, if the item is the initial ID given to the API, nothing is + embedded; if the item is to be embedded at a subsequent depth, a + message stating the item cannot be embedded is inserted instead. + + Additionally, if the given ID is the initial item to embed, add + "actions" if item embedded. + + :param item_id: string uuid or @id + :param initial_item: bool indicative of embedding initial item ID + :return item: object to return for embedding + """ + item = None + given_id = item_id + if not item_id.startswith("/"): + item_id = "/" + item_id + try: + item = self.request.embed(item_id, "@@object", as_user=True) + except HTTPForbidden: + if not initial_item: + item = FORBIDDEN_MSG + except KeyError: + self.invalid_ids.append(given_id) + if item and initial_item: + item = self.add_actions(item) + return item + + def minimal_embed(self, item_id): + """ + Embed minimal item info. Helpful for preventing recursions for + items for which detailed info is commonly not needed. + + :param item_id: string uuid or @id + :return item_embed: dict with item title and @id + """ + item_object = self.user_embed(item_id) + if item_object == FORBIDDEN_MSG: + item_embed = item_object + elif isinstance(item_object, dict): + item_title = item_object.get("title", "") + item_atid = item_object.get("@id", "") + item_embed = {"title": item_title, "@id": item_atid} + else: + item_embed = item_object + return item_embed + + @staticmethod + def is_uuid(uuid_to_test, version=4): + """ + Determine if given string is a valid uuid. + + :param uuid_to_test: string to check + :param version: int for uuid version + :return: bool if given string is valid uuid + """ + try: + uuid_obj = UUID(uuid_to_test, version=version) + except ValueError: + return False + return str(uuid_obj) == uuid_to_test + + def embed(self, item, depth): + """ + Embed items recursively according to input parameters. Unpack + dictionaries and lists to find @ids, which are selectively embedded, + typically in object view. Store new embeds in cache for look-up. 
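+
+ For illustration (the @id value is hypothetical): with default settings, a
+ string value such as "/users/1234-abcd/" is replaced by a minimal embed of
+ the form {"title": ..., "@id": "/users/1234-abcd/"} because "users" is in
+ MINIMAL_EMBEDS, while other @ids are expanded to their full object view
+ until embed_depth is reached.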
+ + :param item: object of interest to expand + :param depth: int of current embed depth + :return item: object of interest processed + """ + while True: + if depth == self.embed_depth: + break + elif isinstance(item, dict) and item: + for key in item: + if key in KEYS_TO_IGNORE: + continue + item[key] = self.embed(item[key], depth) + break + elif isinstance(item, list) and item: + for idx in range(len(item)): + item[idx] = self.embed(item[idx], depth) + break + elif isinstance(item, str): + if ATID_PATTERN.match(item): + if depth == -1: + cache_item = item + item = self.user_embed(item, initial_item=True) + self.cache[cache_item] = item + depth += 1 + elif self.desired_embeds: + if item.split("/")[1] in self.desired_embeds: + if item in self.cache: + item = self.cache[item] + depth += 1 + else: + cache_item = item + item = self.user_embed(item) + self.cache[cache_item] = item + depth += 1 + else: + break + else: + if item.split("/")[1] in self.ignored_embeds: + break + elif item in self.cache: + item = self.cache[item] + depth += 1 + elif GENELIST_ATID.match(item): + # NOTE: Non-admins forbidden for raw view, so just skip + # attempt to embed gene lists for default settings. + break + elif MINIMAL_EMBED_ATID.match(item): + cache_item = item + item = self.minimal_embed(item) + self.cache[cache_item] = item + break + else: + cache_item = item + item = self.user_embed(item) + self.cache[cache_item] = item + depth += 1 + elif self.is_uuid(item) and depth == -1: + item = self.user_embed(item, initial_item=True) + depth += 1 + else: + break + else: + break + return item + + def fields_to_nested_dict(self): + """ + Convert list of requested fields into nested dictionary. Each + nested dictionary contains keys whose values are sub-dictionaries + to embed as well as a "fields_to_keep" key whose values are the + terminal fields requested. + + For example, if the requested fields are: + 'variant.gene.title' + 'variant.*' + the resulting nested dict will be: + + { + "variant": { + "gene": {"fields_to_keep": ["title"]}, + "fields_to_keep": ["*"] + } + } + + :return field_dict: nested dict of requested fields + """ + field_dict = {} + for field in self.requested_fields: + field_keys = field.split(".") + field_keys = [x for x in field_keys if x] + field_dict = self.build_nested_dict(field_dict, field_keys) + return field_dict + + def build_nested_dict(self, field_dict, field_keys): + """ + Recursively builds a nested dict for each requested field by + iterating through the keys of the requested field, adding + the keys if not present and building a nested dict for the + remaining keys. The terminal key of the requested field always + corresponds to a field of an embedded item that should be included + in the embedding. + + :param field_dict: existing dict of requested fields + :param field_keys: list of keys of a requested field + :return field_dict: existing dict updated with new field_keys + """ + key = field_keys.pop(0) + if not field_keys: + if "fields_to_keep" in field_dict: + field_dict["fields_to_keep"].append(key) + else: + field_dict["fields_to_keep"] = [key] + else: + if key not in field_dict: + field_dict[key] = {} + field_dict[key] = self.build_nested_dict(field_dict[key], field_keys) + return field_dict + + def field_embed(self, item, field_dict, initial_item=False): + """ + Embed items recursively according to requested fields. Follows + keys of the nested dict of requested fields, unpacking dictionaries + and lists and embedding @ids as required to reach the terminal + requested fields. 
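+
+ For example, given the nested dict shown above for fields_to_nested_dict
+ ('variant.gene.title' plus 'variant.*'), item["variant"] is embedded and
+ returned whole because of the "*", while item["variant"]["gene"] is
+ embedded and culled down to just its "title" field.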
+ + :param item: object of interest to expand + :param field_dict: nested dict of requested fields + :param initial_item: bool indicative of embedding initial item ID + :return item: object of interest processed + """ + while True: + if isinstance(item, dict): + if item == FORBIDDEN_MSG: + break + fields_to_keep = [] + for key in field_dict: + if key == "fields_to_keep": + fields_to_keep += field_dict[key] + continue + if key not in item: + continue + fields_to_keep.append(key) + item[key] = self.field_embed(item[key], field_dict[key]) + if initial_item and "actions" not in fields_to_keep: + fields_to_keep.append("actions") + if not initial_item and "actions" in fields_to_keep: + if DATABASE_ITEM_KEY in item: + item = self.add_actions(item) + else: + raise HTTPBadRequest( + "The 'actions' field was requested for a JSON object" + " that is not a database item." + ) + if "*" not in fields_to_keep: + culled_item = {} + for field in fields_to_keep: + try: + culled_item[field] = item[field] + except KeyError: + continue + item = culled_item + break + if isinstance(item, list): + for idx in range(len(item)): + item[idx] = self.field_embed(item[idx], field_dict) + break + elif isinstance(item, str): + if ATID_PATTERN.match(item): + if item in self.cache: + item = self.cache[item] + else: + cache_item = item + item = self.user_embed(item) + self.cache[cache_item] = item + else: + break + else: + break + return item + + +@view_config( + route_name="embed", request_method="POST", effective_principals=Authenticated +) +@debug_log +def embed(context, request): + """ + API to return custom-embedded view of object posted to endpoint. If no + parameters provided, attempt to return object with embedding done + per default parameters. + + :param context: pyramid request context + :param request: pyramid request object + :return results: list of dicts of custom-embedded views of items + """ + ids = [] + ignored_embeds = [] + desired_embeds = [] + requested_fields = [] + results = [] + invalid_ids = [] + embed_depth = 4 # Arbritary standard depth to search. + ignored(context) + if request.GET: + ids += request.GET.dict_of_lists().get("id", []) + embed_depth = int(request.GET.get("depth", embed_depth)) + ignored_embeds += request.GET.dict_of_lists().get("ignored", []) + desired_embeds += request.GET.dict_of_lists().get("desired", []) + requested_fields += request.GET.dict_of_lists().get("field", []) + elif request.json: + ids += request.json.get("ids", []) + ignored_embeds = request.json.get("ignored", []) + desired_embeds = request.json.get("desired", []) + embed_depth = request.json.get("depth", embed_depth) + requested_fields = request.json.get("fields", []) + ids = list(set(ids)) + if len(ids) > 5: + raise HTTPBadRequest( + "Too many items were given for embedding." + " Please limit to less than 5 items." 
+ ) + if not ids: + raise HTTPBadRequest("No item identifier was provided.") + embed_props = { + "ignored_embeds": ignored_embeds, + "desired_embeds": desired_embeds, + "embed_depth": embed_depth, + "requested_fields": requested_fields, + } + for item_id in ids: + item_embed = CustomEmbed(request, item_id, embed_props) + results.append(item_embed.result) + invalid_ids += item_embed.invalid_ids + invalid_ids += [item for item in results if isinstance(item, str)] + if invalid_ids: + raise HTTPBadRequest( + "The following IDs were invalid: %s" % ", ".join(invalid_ids) + ) + return results diff --git a/snovault/dev_servers.py b/snovault/dev_servers.py new file mode 100644 index 000000000..c63dbc5fe --- /dev/null +++ b/snovault/dev_servers.py @@ -0,0 +1,223 @@ +"""\ +Examples +For the development.ini you must supply the paster app name: + + %(prog)s development.ini --app-name app --init --clear + +""" + +import argparse +import atexit +import logging +import os.path +import select +import shutil +import subprocess +import sys + +from dcicutils.misc_utils import PRINT +from pyramid.paster import get_app, get_appsettings +from pyramid.path import DottedNameResolver +from .elasticsearch import create_mapping +from .project_app import app_project +from .tests import elasticsearch_fixture, postgresql_fixture + + +EPILOG = __doc__ + +logger = logging.getLogger(__name__) + + +def nginx_server_process(prefix='', echo=False): + args = [ + os.path.join(prefix, 'nginx'), + '-c', app_project().project_filename('nginx-dev.conf'), + '-g', 'daemon off;' + ] + process = subprocess.Popen( + args, + close_fds=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + + if not echo: + process.stdout.close() + + if echo: + PRINT('Started: http://localhost:8000') + + return process + + +def ingestion_listener_compute_command(config_uri, app_name): + return [ + 'poetry', 'run', 'ingestion-listener', config_uri, '--app-name', app_name + ] + + +def ingestion_listener_process(config_uri, app_name, echo=True): + """ Uses Popen to start up the ingestion-listener. """ + args = ingestion_listener_compute_command(config_uri, app_name) + + process = subprocess.Popen( + args, + close_fds=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + + if echo: + PRINT('Starting Ingestion Listener...') + + return process + + +def redis_server_process(echo=False): + """ Handler that spins up a Redis server on port 6379 (default)""" + args = [ + 'redis-server', + '--daemonize', + 'yes' + ] + process = subprocess.Popen( + args, + close_fds=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + if not echo: + process.stdout.close() + if echo: + print('Started Redis Server at redis://localhost:6379') + return process + + +def main(): + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. 
+ description="Run development servers", epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument('--app-name', help="Pyramid app name in configfile") + parser.add_argument('config_uri', help="path to configfile") + parser.add_argument('--clear', action="store_true", help="Clear existing data") + parser.add_argument('--init', action="store_true", help="Init database") + parser.add_argument('--load', action="store_true", help="Load test set") + parser.add_argument('--datadir', default='/tmp/snovault', help="path to datadir") + parser.add_argument('--no_ingest', action="store_true", default=False, help="Don't start the ingestion process.") + args = parser.parse_args() + + run(app_name=args.app_name, config_uri=args.config_uri, datadir=args.datadir, + # Ingestion is disabled. snovault has no such concept. -kmp 17-Feb-2023 + clear=args.clear, init=args.init, load=args.load, ingest=not args.no_ingest) + + +def run(app_name, config_uri, datadir, clear=False, init=False, load=False, ingest=True): + + #project = app_project(initialize=True) + project = app_project() + + logging.basicConfig(format='') + # Loading app will have configured from config file. Reconfigure here: + logging.getLogger(project.NAME).setLevel(logging.INFO) + + # get the config and see if we want to connect to non-local servers + # TODO: This variable seems to not get used? -kmp 25-Jul-2020 + config = get_appsettings(config_uri, app_name) + + datadir = os.path.abspath(datadir) + pgdata = os.path.join(datadir, 'pgdata') + esdata = os.path.join(datadir, 'esdata') + # ----- comment out from HERE... + if clear: + for dirname in [pgdata, esdata]: + if os.path.exists(dirname): + shutil.rmtree(dirname) + if init: + postgresql_fixture.initdb(pgdata, echo=True) + # ----- ... to HERE to disable recreation of test db + # ----- may have to `rm /tmp/snovault/pgdata/postmaster.pid` + + @atexit.register + def cleanup_process(): + for process in processes: + if process.poll() is None: + process.terminate() + for process in processes: + try: + for line in process.stdout: + sys.stdout.write(line.decode('utf-8')) + except IOError: + pass + process.wait() + + processes = [] + + # For now - required components + postgres = postgresql_fixture.server_process(pgdata, echo=True) + processes.append(postgres) + + es_server_url = config.get('elasticsearch.server', "localhost") + + if '127.0.0.1' in es_server_url or 'localhost' in es_server_url: + # Bootup local ES server subprocess. Else assume connecting to remote ES cluster. + elasticsearch = elasticsearch_fixture.server_process(esdata, echo=True) + processes.append(elasticsearch) + elif not config.get('indexer.namespace'): + raise Exception( + 'It looks like are connecting to remote elasticsearch.server but no indexer.namespace is defined.') + elif not config.get("elasticsearch.aws_auth", False): + # TODO detect if connecting to AWS or not before raising an Exception. + PRINT( + 'WARNING - elasticsearch.aws_auth is set to false.' + ' Connection will fail if connecting to remote ES cluster on AWS.') + + nginx = nginx_server_process(echo=True) + processes.append(nginx) + + app = get_app(config_uri, app_name) + settings = app.registry.settings + + # Optional components + if 'redis.server' in settings: + redis = redis_server_process(echo=True) + processes.append(redis) + + if ingest: + ingestion_listener = ingestion_listener_process(config_uri, app_name) + processes.append(ingestion_listener) + + # clear queues and initialize indices before loading data. 
No indexing yet. + # this is needed for items with properties stored in ES + if init: + create_mapping.run(app, skip_indexing=True, purge_queue=False) + + if init and load: + load_test_data = app.registry.settings.get('load_test_data') + load_test_data = DottedNameResolver().resolve(load_test_data) + load_res = load_test_data(app) + if load_res: # None if successful + raise load_res + + # now clear the queues and queue items for indexing + create_mapping.run(app, check_first=True, strict=True, purge_queue=False) + + PRINT('Started. ^C to exit.') + + stdouts = [p.stdout for p in processes] + + # Ugly should probably use threads instead + while True: + readable, writable, err = select.select(stdouts, [], stdouts, 5) + for stdout in readable: + for line in iter(stdout.readline, b''): + sys.stdout.write(line.decode('utf-8')) + if err: + for stdout in err: + for line in iter(stdout.readline, b''): + sys.stdout.write(line.decode('utf-8')) + break + + +if __name__ == '__main__': + main() diff --git a/snovault/drs.py b/snovault/drs.py new file mode 100644 index 000000000..028385ac6 --- /dev/null +++ b/snovault/drs.py @@ -0,0 +1,113 @@ +from pyramid.view import view_config +from pyramid.security import Authenticated +from pyramid.exceptions import HTTPNotFound +from .util import debug_log + + +DRS_VERSION_1 = 'v1' +DRS_PREFIX_V1 = f'ga4gh/drs/{DRS_VERSION_1}' +DRS_OBJECT_GET = DRS_PREFIX_V1 + '/objects/{object_id}' +DRS_OBJECT_GET_ACCESS_URL = DRS_PREFIX_V1 + '/objects/{object_id}/access/{access_id}' +DRS_OBJECT_GET_ACCESSS_URL_SLASH = DRS_PREFIX_V1 + '/objects/{object_id}/access/' +DRS_OBJECT_GET_ACCESSS_URL_NO_SLASH = DRS_PREFIX_V1 + '/objects/{object_id}/access' +REQUIRED_FIELDS = [ + 'id', + 'created_time', + 'drs_id', + 'self_uri', + 'size', + 'checksums' +] +ACCESS_METHOD_REQUIRED_FIELDS = [ + 'access_url', + 'type' +] + + +def includeme(config): + config.add_route('drs_objects', '/' + DRS_OBJECT_GET) + config.add_route('drs_download', '/' + DRS_OBJECT_GET_ACCESS_URL) + config.add_route('drs_download_slash', '/' + DRS_OBJECT_GET_ACCESSS_URL_SLASH) + config.add_route('drs_download_no_slash', '/' + DRS_OBJECT_GET_ACCESSS_URL_NO_SLASH) + config.scan(__name__) + + +def validate_drs_object(drs_object): + """ Validates the structure of a drs object (required fields) + Because we're not wrapping in any object-oriented structure, the internal API + will call this and throw a validation error if the returned DRS object + does not conform to structure. 
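+
+ An illustrative (hypothetical) object that passes validation carries all of
+ REQUIRED_FIELDS, e.g.:
+ {"id": "...", "drs_id": "...", "self_uri": "drs://host/id", "size": 1234,
+ "created_time": "2023-01-01T00:00:00", "checksums": [...]}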
+ """ + for required_key in REQUIRED_FIELDS: + assert required_key in drs_object + if 'access_methods' in drs_object: + for required_key in ACCESS_METHOD_REQUIRED_FIELDS: + for access_method in drs_object['access_methods']: + assert required_key in access_method + + +def get_and_format_drs_object(request, object_uri): + """ Call request.embed on the object_uri and reformats it such that it fits the DRS + format, returning access_methods etc as needed if it is a file + """ + try: + drs_object = request.embed(object_uri, '@@drs', as_user=True) + except Exception: + raise HTTPNotFound('You accessed a DRS object_uri that either does not exist' + ' or you do not have access to it.') + drs_object['self_uri'] = f'drs://{request.host}{request.path}' + return drs_object + + +def get_drs_url(request, object_uri): + """ Does 2 calls - one to verify the object_uri is in fact a valid DRS object and + another to get the bytes of the DRS object """ + try: + drs_obj = get_and_format_drs_object(request, object_uri) + access_methods = drs_obj.get('access_methods', []) + # in our system there is only 1 access method - HTTPS to S3 + return access_methods[0]['access_url'] + except Exception as e: + raise HTTPNotFound(f'You accessed a DRS object that either you do not have access to,' + f' did not pass valid access_id or does not exist {str(e)}') + + +@view_config( + route_name='drs_objects', request_method='GET' +) +@debug_log +def drs_objects(context, request): + """ Implements DRS GET as specified by the API description + https://ga4gh.github.io/data-repository-service-schemas/preview/release/drs-1.0.0/docs/#_getobject + """ + drs_object_uri = '/' + request.matchdict['object_id'] + formatted_drs_object = get_and_format_drs_object(request, drs_object_uri) + try: + validate_drs_object(formatted_drs_object) + except AssertionError as e: + raise ValueError(f'Formatted DRS object does not conform to spec - check your @@drs' + f' implementation: {str(e)}') + return formatted_drs_object + + +@view_config( + route_name='drs_download_no_slash', request_method='GET', + effective_principals=Authenticated +) +@view_config( + route_name='drs_download_slash', request_method='GET', + effective_principals=Authenticated +) +@view_config( + route_name='drs_download', request_method='GET', + effective_principals=Authenticated +) +@debug_log +def drs_objects_download(context, request): + """ Implements DRS GET bytes as specified by the API description + https://ga4gh.github.io/data-repository-service-schemas/preview/release/drs-1.0.0/docs/#_getaccessurl + + NOTE: access_id is discarded - permissions are validated by @@download when navigated + """ + drs_object_uri = '/' + request.matchdict['object_id'] + return get_drs_url(request, drs_object_uri) diff --git a/snovault/tests/snowflake_hash.py b/snovault/edw_hash.py similarity index 91% rename from snovault/tests/snowflake_hash.py rename to snovault/edw_hash.py index 96d00163f..900901fa9 100644 --- a/snovault/tests/snowflake_hash.py +++ b/snovault/edw_hash.py @@ -8,16 +8,16 @@ def includeme(config): - register_crypt_handler(SNOWHash) + register_crypt_handler(EDWHash) -class SNOWHash(uh.StaticHandler): - """ a special snowflake of a password hashing scheme +class EDWHash(uh.StaticHandler): + """ EDW's password hashing scheme Cryptographic strength of the hashing function is less of a concern for randomly generated passwords. 
""" - name = 'snowflake_hash' + name = 'edw_hash' checksum_chars = uh.PADDED_BASE64_CHARS checksum_size = 64 diff --git a/snovault/embed.py b/snovault/embed.py index 176438b4d..20d0c8ce5 100644 --- a/snovault/embed.py +++ b/snovault/embed.py @@ -1,14 +1,17 @@ import logging from copy import deepcopy from posixpath import join - from pyramid.compat import ( native_, unquote_bytes_to_wsgi, ) -from pyramid.httpexceptions import HTTPNotFound - -from .interfaces import CONNECTION +from pyramid.httpexceptions import HTTPNotFound, HTTPServerError +import pyramid.request +from .crud_views import collection_add as sno_collection_add +from .interfaces import COLLECTIONS, CONNECTION +from .resources import Collection +from .schema_utils import validate_request +from dcicutils.misc_utils import check_true log = logging.getLogger(__name__) @@ -190,6 +193,69 @@ def _embed(request, path, as_user='EMBED'): '_sid_cache': subreq._sid_cache} +def subrequest_object(request, object_id): + subreq = make_subrequest(request, "/" + object_id) + subreq.headers['Accept'] = 'application/json' + # Tweens are suppressed here because this is an internal call and doesn't need things like HTML processing. + # -kmp 2-Feb-2021 + response = request.invoke_subrequest(subreq, use_tweens=False) + if response.status_code >= 300: # alas, the response from a pyramid subrequest has no .raise_for_status() + raise HTTPServerError("Error obtaining object: %s" % object_id) + object_json = response.json + return object_json + + +def subrequest_item_creation(request: pyramid.request.Request, item_type: str, json_body: dict = None) -> dict: + """ + Acting as proxy on behalf of request, this creates a new item of the given item_type with attributes per json_body. + + For example, + + subrequest_item_creation(request=request, item_type='NobelPrize', + json_body={'category': 'peace', 'year': 2016)) + + Args: + request: the request on behalf of which this subrequest is done + item_type: the name of the item item type to be created + json_body: a python dictionary representing JSON containing data to use in initializing the newly created item + + Returns: + a python dictionary (JSON description) of the item created + + """ + + if json_body is None: + json_body = {} + collection_path = '/' + item_type + method = 'POST' + # json_utf8 = json.dumps(json_body).encode('utf-8') # Unused, but here just in case + check_true(not request.remote_user, "request.remote_user has %s before we set it." % request.remote_user) + request.remote_user = 'EMBED' + subrequest = make_subrequest(request=request, path=collection_path, method=method, json_body=json_body) + subrequest.remote_user = 'EMBED' + subrequest.registry = request.registry + # Maybe... 
+ # validated = json_body.copy() + # subrequest.validated = validated + registry: Registry = subrequest.registry # noQA - PyCharm can't tell subrequest.registry IS a Registry + collection: Collection = registry[COLLECTIONS][item_type] + check_true(subrequest.json_body, "subrequest.json_body is not properly initialized.") + check_true(not subrequest.validated, "subrequest was unexpectedly validated already.") + check_true(not subrequest.errors, "subrequest.errors already has errors before trying to validate.") + check_true(subrequest.remote_user == request.remote_user, + "Mismatch: subrequest.remote_user=%r request.remote_user=%r" + % (subrequest.remote_user, request.remote_user)) + validate_request(schema=collection.type_info.schema, request=subrequest, data=json_body) + if not subrequest.validated: + return { + "@type": ["Exception"], + "errors": subrequest.errors + } + else: + json_result: dict = sno_collection_add(context=collection, request=subrequest, render=False) + return json_result + + class NullRenderer: '''Sets result value directly as response. ''' diff --git a/snovault/ingestion/common.py b/snovault/ingestion/common.py new file mode 100644 index 000000000..5130be976 --- /dev/null +++ b/snovault/ingestion/common.py @@ -0,0 +1,184 @@ +""" +common.py - tools common to various parts of ingestion +""" + +from .exceptions import MissingParameter, BadParameter +from ..util import CONTENT_TYPE_SPECIAL_CASES + + +def metadata_bundles_bucket(registry): + return registry.settings.get('metadata_bundles_bucket') + + +# ================================================== + + +def register_path_content_type(*, path, content_type): + """ + Registers that endpoints that begin with the specified path use the indicated content_type. + + This is part of an inelegant workaround for an issue in renderers.py that maybe we can make go away in the future. + See the 'implementation note' in ingestion/common.py for more details. + """ + exceptions = CONTENT_TYPE_SPECIAL_CASES.get(content_type, None) + if exceptions is None: + CONTENT_TYPE_SPECIAL_CASES[content_type] = exceptions = [] + if path not in exceptions: + exceptions.append(path) + + +def content_type_allowed(request): + """ + Returns True if the current request allows the requested content type. + + This is part of an inelegant workaround for an issue in renderers.py that maybe we can make go away in the future. + See the 'implementation note' in ingestion/common.py for more details. + """ + if request.content_type == "application/json": + # For better or worse, we always allow this. + return True + + exceptions = CONTENT_TYPE_SPECIAL_CASES.get(request.content_type) + + if exceptions: + for text in exceptions: + if text in request.path: + return True + + return False + +# ================================================== + + +_NO_DEFAULT = object() + + +def get_parameter(parameter_block, parameter_name, as_type=None, default=_NO_DEFAULT, update=False): + """ + Returns the value of a given parameter from a dictionary of parameter values. + + If the parameter is not in the dictionary, the default will be returned if one is given. + If the parameter is not present but there is no default, an error of type MissingParameter will be raised. + + Args: + parameter_block (dict): a dictionary whose keys are parameter names and whose values are parameter values + parameter_name (str): the name of a parameter + as_type: if supplied, a type coercion to perform on the result + default (object): a default value to be used if the parameter_name is not present. 
+ update (bool): if as_type is applied, whether to update the parameter_block + """ + + if isinstance(parameter_block, dict): + if parameter_name in parameter_block: + parameter_value = parameter_block[parameter_name] + result = parameter_value + if as_type: + if isinstance(as_type, type) and isinstance(result, as_type): + return result + elif as_type is bool: + lower_value = str(result).lower() + if lower_value == "true": + result = True + elif lower_value in ("false", "none", "null", ""): + result = False + else: + raise BadParameter(parameter_name=parameter_name, parameter_value=parameter_value, + extra_detail=("Expected a string representing a boolean, such as" + " 'true' for True, or 'false' or the empty string for False.")) + else: + result = as_type(result) + elif default is _NO_DEFAULT: + raise MissingParameter(parameter_name=parameter_name) + else: + result = default + + if update: + parameter_block[parameter_name] = result + + return result + + else: + raise TypeError("Expected parameter_block to be a dict: %s", parameter_block) + + +class IngestionError: + """ + Holds info on an error that occurred in ingestion. Right now this consists of the + offending request body and the VCF row it occurred on. + + This class doesn't really do much except specify the structure. It must align with that of FileProcessed + (reproduced as of 12/2/2020 below): + + "file_ingestion_error": { + "title": "Ingestion Error Report", + "description": "This field is set when an error occurred in ingestion with all errors encountered", + "type": "array", + "items": { + "title": "Ingestion Error", + "type": "object", + "properties": { + "body": { + "type": "string", + "index": false # the intention is not to index this in the future + }, + "row": { + "type": "integer" + } + } + } + } + + """ + + def __init__(self, body, row): + self.body = body + self.row = row + + def to_dict(self): + return { + 'body': str(self.body), + 'row': self.row + } + + +class IngestionReport: + """ + A "virtual" item on file_processed that contains detailed information on the ingestion run. + Not creating an item for this is a design decision. The output of this process is more for + debugging and not for auditing, so it does not merit an item at this time. + """ + MAX_ERRORS = 100 # tune this to get more errors, 100 is a lot though and probably more than needed + + def __init__(self): + self.grand_total = 0 + self.errors = [] + + def brief_summary(self): + return ('INGESTION REPORT: There were %s total variants detected, of which %s were successful' + 'and %s failed. Check ProcessedFile for full error output.' % (self.grand_total, + self.total_successful(), + self.total_errors())) + + def total_successful(self): + return self.grand_total - len(self.errors) + + def total_errors(self): + return len(self.errors) + + def get_errors(self, limit=True): + """ Returns a limited number of errors, where limit can be True (self.MAX_ERRORS), False (no limit), + or an integer. """ + if limit is True: + limit = self.MAX_ERRORS + elif limit is False: + limit = None + return self.errors[:limit] + + def mark_success(self): + """ Marks the current row number as successful, which in this case just involves incrementing the total """ + self.grand_total += 1 + + def mark_failure(self, *, body, row): + """ Marks the current row as failed, creating an IngestionError holding the response body and row. 
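+
+ For example (the row number is illustrative), mark_failure(body=response_body, row=17)
+ increments grand_total and appends {'body': str(response_body), 'row': 17} to self.errors.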
""" + self.grand_total += 1 + self.errors.append(IngestionError(body, row).to_dict()) diff --git a/snovault/ingestion/exceptions.py b/snovault/ingestion/exceptions.py new file mode 100644 index 000000000..ed5972a97 --- /dev/null +++ b/snovault/ingestion/exceptions.py @@ -0,0 +1,41 @@ +""" +Exception definitions for ingestion +""" + +from pyramid.httpexceptions import HTTPBadRequest, HTTPServerError + + +class SubmissionFailure(HTTPServerError): + pass + + +class UndefinedIngestionProcessorType(Exception): + + def __init__(self, processor_type): + self.ingestion_type_name = processor_type + super().__init__("No ingestion processor type %r is defined." % processor_type) + + +class MissingParameter(HTTPBadRequest): + + def __init__(self, parameter_name): + self.parameter_name = parameter_name + super().__init__(detail="Missing parameter: %s" % parameter_name) + + +class BadParameter(HTTPBadRequest): + + def __init__(self, parameter_name, parameter_value, extra_detail=None): + self.parameter_name = parameter_name + self.parameter_value = parameter_value + self.extra_detail = extra_detail + suffix = " " + extra_detail if extra_detail else "" + super().__init__(detail="The value of the %s parameter, %r, is invalid.%s" + % (parameter_name, parameter_value, suffix)) + + +class UnspecifiedFormParameter(HTTPBadRequest): + + def __init__(self, parameter_name): + self.parameter_name = parameter_name + super().__init__(detail="A form parameter was not filled out: %s" % parameter_name) diff --git a/snovault/ingestion/ingestion_listener.py b/snovault/ingestion/ingestion_listener.py new file mode 100644 index 000000000..f3b75d263 --- /dev/null +++ b/snovault/ingestion/ingestion_listener.py @@ -0,0 +1,613 @@ +#from ..project_defs import SnovaultProject +from ..project_app import app_project +import argparse +import atexit +import botocore.exceptions +import cgi +import datetime +import elasticsearch +import io +import json +import os +import psycopg2 +import re +import signal +import structlog +import threading +import time +import webtest + +from dcicutils.env_utils import is_stg_or_prd_env +from dcicutils.misc_utils import VirtualApp, ignored, check_true, full_class_name, environ_bool, PRINT +from pyramid import paster +# Possibly still needed by some commented-out code. 
+# from pyramid.response import Response +from pyramid.view import view_config +from snovault.util import debug_log +from ..embed import subrequest_object +from ..types.ingestion import SubmissionFolio, IngestionSubmission +from ..util import ( + debuglog, get_trusted_email, beanstalk_env_from_request, + register_path_content_type, vapp_for_email, # vapp_for_ingestion, + SettingsKey, make_s3_client, extra_kwargs_for_s3_encrypt_key_id, +) +from .common import metadata_bundles_bucket, get_parameter, IngestionReport +from .exceptions import UnspecifiedFormParameter, SubmissionFailure # , BadParameter +from .ingestion_listener_base import ( + DEBUG_SUBMISSIONS, + IngestionListenerBase, +) +from .ingestion_message_handler_decorator import call_ingestion_message_handler +from .ingestion_processor_decorator import get_ingestion_processor +from .queue_utils import IngestionQueueManager + + +log = structlog.getLogger(__name__) +EPILOG = __doc__ +INGESTION_QUEUE = 'ingestion_queue' + + +def includeme(config): + # config.add_route('process_ingestion', '/process_ingestion') + config.add_route('queue_ingestion', '/queue_ingestion') + config.add_route('ingestion_status', '/ingestion_status') + config.add_route('submit_for_ingestion', '/submit_for_ingestion') + config.registry[INGESTION_QUEUE] = IngestionQueueManager(config.registry) + config.scan(__name__) + + +SUBMISSION_PATTERN = re.compile(r'^/ingestion-submissions/([0-9a-fA-F-]+)(|/.*)$') + +register_path_content_type(path='/submit_for_ingestion', content_type='multipart/form-data') + + +def extract_submission_info(request): + matched = SUBMISSION_PATTERN.match(request.path_info) + if matched: + submission_id = matched.group(1) + else: + raise SubmissionFailure("request.path_info is not in the expected form: %s" % request.path_info) + + instance = subrequest_object(request, submission_id) + return submission_id, instance + + +@view_config(name='submit_for_ingestion', request_method='POST', context=IngestionSubmission, + # Apparently adding this 'accept' causes discrimination on incoming requests not to find this method. + # We do want this type, and instead we check the request to make sure we got it, but we omit it here + # for practical reasons. -kmp 10-Sep-2020 + # accept='multipart/form-data', + permission='edit') +@debug_log +def submit_for_ingestion(context, request): + ignored(context) + + check_true(request.content_type == 'multipart/form-data', # even though we can't declare we accept this + "Expected request to have content_type 'multipart/form-data'.", error_class=SubmissionFailure) + + bs_env = beanstalk_env_from_request(request) + bundles_bucket = metadata_bundles_bucket(request.registry) + datafile = request.POST.get('datafile') + if datafile is None: + # S3 protocol; not uploading from here (SubmitCGAP uploads directly). + # Added circa March 2023 for Fourfront ontology ingestion. + filename = request.POST['datafile_source_filename'] + override_name = None + elif isinstance(datafile, cgi.FieldStorage): + filename = datafile.filename + override_name = request.POST.get('override_name', None) + else: + # e.g., specifically it might be b'' when no file is selected, + # but IMPORTANTLY, cgi.FieldStorage has no predefined boolean value, + # so we can't just ask to check 'not datafile'. Sigh. 
-kmp 5-Aug-2020 + raise UnspecifiedFormParameter('datafile') + parameters = dict(request.POST) # Convert to regular dictionary, which is also a copy + parameters['datafile'] = filename + + # Other parameters, like validate_only, will ride in on parameters via the manifest on s3 + + submission_id, instance = extract_submission_info(request) + + # The three arguments award, lab, and ingestion_type were needed in the old protocol + # but are not needed in the new protocol because someone will have set up the IngestionSubmission + # object already with the right values. We tolerate them here, but we insist they be consistent (redundant). + # Note, too, that we use the 'update=True' option that causes them to be added to our parameters if they are + # missing, defaulted from the previous item, so that they will be written to the parameter block stored on S3. + # (We could do that differently now, by looking them up dynamically, but rather than risk making a mistake, + # I just went with path of least resistance for now.) + # -kmp 2-Dec-2020 + # + # Same goes for award and lab which were brought from fourfront + # into this common snovault version of ingestion_listener.py. + # -dmichaels 12-May-2023 + + if instance.get("institution"): + institution = instance['institution']['@id'] + institution_arg = get_parameter(parameters, "institution", default=institution, update=True) + if institution_arg != institution: + # If the "institution" argument was passed, which we no longer require, make sure it's consistent. + raise SubmissionFailure("'institution' was supplied inconsistently for submit_for_ingestion.") + + if instance.get("project"): + project = instance['project']['@id'] + project_arg = get_parameter(parameters, "project", default=project, update=True) + if project_arg != project: + # If the "project" argument was passed, which we no longer require, make sure it's consistent. + raise SubmissionFailure("'project' was supplied inconsistently for submit_for_ingestion.") + + if instance.get("award"): + award = instance['award']['@id'] + award_arg = get_parameter(parameters, "award", default=award, update=True) + if award_arg != award: + # If the "award" argument was passed, which we no longer require, make sure it's consistent. + raise SubmissionFailure("'award' was supplied inconsistently for submit_for_ingestion.") + + if instance.get("lab"): + lab = instance['lab']['@id'] + lab_arg = get_parameter(parameters, "lab", default=lab, update=True) + if lab_arg != lab: + # If the "lab" argument was passed, which we no longer require, make sure it's consistent. + raise SubmissionFailure("'lab' was supplied inconsistently for submit_for_ingestion.") + + ingestion_type = instance['ingestion_type'] + ingestion_type_arg = get_parameter(parameters, "ingestion_type", default=ingestion_type, update=True) + if ingestion_type_arg != ingestion_type: + # If the "ingestion_type" argument was passed, which we no longer require, make sure it's consistent. + raise SubmissionFailure("'ingestion_type' was supplied inconsistently for submit_for_ingestion.") + + # ``input_file`` contains the actual file data which needs to be + # stored somewhere. 
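+
+ # The code below stores it in the metadata bundles bucket keyed by the
+ # submission id: the data file at "<submission_id>/datafile<ext>" and the
+ # manifest at "<submission_id>/manifest.json" (see object_name and
+ # manifest_name below).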
+ + if datafile is not None: + input_file_stream = datafile.file + input_file_stream.seek(0) + + # NOTE: Some reference information about uploading files to s3 is here: + # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html + + # submission.set_item_detail(object_name=manifest['object_name'], parameters=manifest['parameters'], + # institution=institution, project=project) + + # submission_id = str(uuid.uuid4()) + _, ext = os.path.splitext(filename) + object_name = "{id}/datafile{ext}".format(id=submission_id, ext=ext) if datafile is not None else submission_id + manifest_name = "{id}/manifest.json".format(id=submission_id) + + # We might need to extract some additional information from the GAC + s3_client = make_s3_client() + + upload_time = datetime.datetime.utcnow().isoformat() + success = True + message = "Uploaded successfully." + + # Set up potentially useful additional args + s3_encrypt_key_id = request.registry.settings.get(SettingsKey.S3_ENCRYPT_KEY_ID) + extra_kwargs = extra_kwargs_for_s3_encrypt_key_id(s3_encrypt_key_id=s3_encrypt_key_id, + client_name='submit_for_ingestion') + + if extra_kwargs: + additional_info = f" (with SSEKMSKeyId: {s3_encrypt_key_id})" + else: + additional_info = " (no SSEKMSKeyId)" + + if datafile is not None: + try: + # Make sure to pass any extra args. + s3_client.upload_fileobj(input_file_stream, Bucket=bundles_bucket, Key=object_name, **extra_kwargs) + except botocore.exceptions.ClientError as e: + log.error(e) + success = False + message = f"{full_class_name(e)}: {str(e)}{additional_info}" + + # This manifest will be stored in the manifest.json file on on s3 AND will be returned from this endpoint call. + manifest_content = { + "filename": filename, + "object_name": object_name, + "s3_encrypt_key_id": s3_encrypt_key_id, + "submission_id": submission_id, + "submission_uri": SubmissionFolio.make_submission_uri(submission_id), + "beanstalk_env_is_prd": is_stg_or_prd_env(bs_env), + "beanstalk_env": bs_env, + "bucket": bundles_bucket, + "authenticated_userid": request.authenticated_userid, + "email": get_trusted_email(request, context="Submission", raise_errors=False), + "success": success, + "message": message, + "upload_time": upload_time, + "parameters": parameters, + } + + manifest_content_formatted = json.dumps(manifest_content, indent=2) + + if success: + + try: + with io.BytesIO(manifest_content_formatted.encode('utf-8')) as fp: + s3_client.upload_fileobj(fp, Bucket=bundles_bucket, Key=manifest_name, **extra_kwargs) + except botocore.exceptions.ClientError as e: + log.error(e) + message = f"{full_class_name(e)} (while uploading metadata): {str(e)}{additional_info}" + raise SubmissionFailure(message) + + queue_manager = get_queue_manager(request, override_name=override_name) + _, failed = queue_manager.add_uuids([submission_id], ingestion_type=ingestion_type) + + if failed: + # If there's a failure, failed will be a list of one problem description since we only submitted one thing. + raise SubmissionFailure(failed[0]) + + if not success: + + raise SubmissionFailure(message) + + return manifest_content + + +@view_config(route_name='ingestion_status', request_method='GET', permission='index') +@debug_log +def ingestion_status(context, request): + """ Status route, essentially identical to indexing_status. 
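+
+ Returns a JSON body of the form (counts here are illustrative):
+ {"title": "Ingestion Status", "waiting": 2, "inflight": 1}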
""" + ignored(context) + queue_manager = request.registry[INGESTION_QUEUE] + n_waiting, n_inflight = queue_manager.get_counts() + return { + 'title': 'Ingestion Status', + 'waiting': n_waiting, + 'inflight': n_inflight + } + + + +def process_submission(*, submission_id, ingestion_type, app, bundles_bucket=None, s3_client=None): + ignored(s3_client) # we might want to restore the ability to pass this, but no one actually does. -kmp 6-Dec-2021 + bundles_bucket = bundles_bucket or metadata_bundles_bucket(app.registry) + s3_client = make_s3_client() + manifest_name = "{id}/manifest.json".format(id=submission_id) + log.warning(f'Processing submission {manifest_name}') + obj = s3_client.get_object(Bucket=bundles_bucket, Key=manifest_name) + # data = json.load(obj)['Body'] + data = json.load(obj['Body']) + email = None + try: + email = data['email'] + except KeyError as e: + ignored(e) + debuglog("Manifest data is missing 'email' field.") + if DEBUG_SUBMISSIONS: + pass + debuglog("processing submission %s with email %s" % (submission_id, email)) + with vapp_for_email(email=email, app=app) as vapp: + if DEBUG_SUBMISSIONS: + PRINT("PROCESSING FOR %s" % email) + submission = SubmissionFolio(vapp=vapp, ingestion_type=ingestion_type, submission_id=submission_id, log=None) + handler = get_ingestion_processor(ingestion_type) + result = handler(submission) + if DEBUG_SUBMISSIONS: + PRINT("DONE PROCESSING FOR %s" % email) + return { + "result": result, + "ingestion_type": ingestion_type, + "submission_id": submission_id, + } + + +@view_config(route_name='queue_ingestion', request_method='POST', permission='index') +@debug_log +def queue_ingestion(context, request): + """ Queues uuids as part of the request body for ingestion. Can batch as many as desired in a + single request. Note that you can also pass ingestion_type, which will apply to all uuids queued. + The default is (SNV) ontology. + """ + ignored(context) + uuids = request.json.get('uuids', []) + ingestion_type = request.json.get('ingestion_type', 'ontology') # note that this applies to all uuids + override_name = request.json.get('override_name', None) + return enqueue_uuids_for_request(request, uuids, ingestion_type=ingestion_type, override_name=override_name) + + +def enqueue_uuids_for_request(request, uuids, *, ingestion_type, override_name=None): + response = { + 'notification': 'Failure', + 'number_queued': 0, + 'detail': 'Nothing was queued. Make sure to past in a list of uuids in in "uuids" key.' 
+ } + if uuids is []: + return response + queue_manager = get_queue_manager(request, override_name=override_name) + _, failed = queue_manager.add_uuids(uuids, ingestion_type=ingestion_type) + if not failed: + response['notification'] = 'Success' + response['number_queued'] = len(uuids) + response['detail'] = 'Successfully queued the following uuids: %s' % uuids + app_project().note_ingestion_enqueue_uuids_for_request(ingestion_type, request, uuids) + else: + response['number_queued'] = len(uuids) - len(failed) + response['detail'] = 'Some uuids failed: %s' % failed + return response + + +def get_queue_manager(request, *, override_name): + return (request.registry[INGESTION_QUEUE] + if not override_name + else IngestionQueueManager(request.registry, override_name=override_name)) + + +class IngestionListener(IngestionListenerBase): + """ Organizes helper functions for the ingestion listener """ + POLL_INTERVAL = 10 # seconds between each poll + INGEST_AS_USER = environ_bool('INGEST_AS_USER', default=True) # The new way, but possible to disable for now + + def __init__(self, vapp, _queue_manager=None, _update_status=None): + self.vapp = vapp + + # Get queue_manager + registry = None + if isinstance(self.vapp, (webtest.TestApp, VirtualApp)): # TestApp in testing or VirtualApp in production + registry = self.vapp.app.registry + elif _queue_manager is None: # if we got here, we cannot succeed in starting + raise Exception('Bad arguments given to IngestionListener: %s, %s, %s' % + (self.vapp, _queue_manager, _update_status)) + self.queue_manager = IngestionQueueManager(registry) if not _queue_manager else _queue_manager + self.update_status = _update_status + + @staticmethod + def should_remain_online(override=None): + """ A function that says whether 'run' should continue. This is provided because it + can be mocked in testing. + + :param override: a lambda that will execute when evaluating if specified + :return: True if should stay running, False otherwise + """ + if not override: + return True + return override() + + def get_messages(self): + """ Sleeps (as to not hit SQS too frequently) then requests messages, + returning the result bodies. + + NOTE: THIS FUNCTION SHOULD NOT BE USED OUTSIDE OF THIS CODE SINCE + IT BLOCKS FOR RATE LIMITING REASONS + + :return: messages available on SQS + """ + time.sleep(self.POLL_INTERVAL) # sleep here before polling again + return self.queue_manager.receive_messages() + + def delete_messages(self, messages): + """ Deletes messages from SQS (after they have been processed). Does not return + anything but will log if messages fail deletion. 
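+ Failed deletions are retried a few times before an error is logged.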
+ + :param messages: messages to be deleted + """ + failed = self.queue_manager.delete_messages(messages) + while True: + debuglog("Trying to delete messages") + tries = 3 + if failed: + debuglog("Failed to delete messages") + if tries > 0: + failed = self.queue_manager.delete_messages(failed) # try again + tries -= 1 + else: + log.error('Failed to delete messages from SQS: %s' % failed) + break + else: + debuglog("Deleted messages") + break + + def _patch_value(self, uuid, field, value): + """ Patches field with value on item uuid """ + self.vapp.patch_json('/' + uuid, {field: value}) + + def patch_ingestion_report(self, report, uuid): + """ Sets the file_ingestion_error field of the given uuid """ + if isinstance(report, IngestionReport): # handle normal case + self._patch_value(uuid, 'file_ingestion_error', report.get_errors()) + elif isinstance(report, list): # handle when build_ingestion_error_report result is passed + self._patch_value(uuid, 'file_ingestion_error', report) + else: + raise TypeError('Got bad type for ingestion error report: %s' % report) + + def set_status(self, uuid, status): + """ Sets the file_ingestion_status of the given uuid """ + self._patch_value(uuid, 'file_ingestion_status', status) + + @staticmethod + def build_ingestion_error_report(msg): + """ Builds an ingestion error report in case an error is encountered that cannot be recovered from + in VCF ingestion - see file_processed.json for structure definition. """ + return [ + { + 'body': msg, + 'row': -1 # this exception may have occurred on a particular row but since it could not be recovered + } # from we assume the msg has sufficient info to work backwards from - Will 4/9/21 + ] + + def run(self): + """ Main process for this class. Runs forever doing ingestion as needed. + + HIGH LEVEL LOGIC: + while True: + while there are messages available: + for each message: + download, decompress, ingest, patch file status to "Ingested" + delete processed messages + """ + log.info('Ingestion listener successfully online.') + + debuglog("Ingestion listener started.") + + messages = [] # This'll get a better value below in each loop iteration. This is just a declaration of intent. + + def discard(msg): + self.delete_messages([msg]) + # Assuming we didn't get an error trying to remove it, + # it should also get removed from our to-do list. + messages.remove(msg) + + while self.should_remain_online(): + + debuglog("About to get messages.") + + messages = self.get_messages() # wait here + + debuglog("Got", len(messages), "messages.") + + # ingest each VCF file + for message in list(messages): + # C4-990/2023-02-09/dmichaels + # Added the list wrapper around messages in the above loop, + # i.e. list(messages), so that when we remove a message from + # the messages list via the discard function (above) the loop + # does not end up skipping the very next message. + + debuglog("Message:", message) + + # C4-990/2023-02-09/dmichaels + # This calls at most one our message handlers + # registered via the @ingestion_message_handler decorator. + if call_ingestion_message_handler(message, self): + # Here one of our message handlers was called and it processed this message. + discard(message) + + # This is just fallback cleanup in case messages weren't cleaned up within the loop. + # In normal operation, they will be. + self.delete_messages(messages) + + +def run(vapp=None, _queue_manager=None, _update_status=None): + """ Entry-point for the ingestion listener for waitress. 
""" + ingestion_listener = IngestionListener(vapp, _queue_manager=_queue_manager, _update_status=_update_status) + try: + ingestion_listener.run() + except Exception as e: + debuglog(str(e)) + raise + + +class ErrorHandlingThread(threading.Thread): + """ Must be duplicated here so logging is correct. """ + + def run(self): + # interval = self._kwargs.get('interval', DEFAULT_INTERVAL) + interval = 60 # DB polling can and should be slower + update_status = self._kwargs['_update_status'] # noQA - uses private instance variables of parent class + while True: + try: + self._target(*self._args, **self._kwargs) # noQA - uses private instance variables of parent class + except (psycopg2.OperationalError, elasticsearch.exceptions.ConnectionError) as e: + # Handle database restart + log.warning('Database not there, maybe starting up: %r', e) + update_status(msg=repr(e)) + log.debug('sleeping') + time.sleep(interval) + continue + except Exception as e: + # Unfortunately mod_wsgi does not restart immediately + log.exception('Exception in ingestion listener, restarting process at next request: %s' % e) + os.kill(os.getpid(), signal.SIGINT) + break + + +# Composite Application (for wsgi) +def composite(loader, global_conf, **settings): + """ This is a composite pyramid app, meant to run components of an application + or an application extension. In our case we are running the ingestion listener, + which requires executing a command with application context. This code lives + in encoded top-level as it is a wsgi entry-point. Note that the local deployment + does NOT run the listener this way, but runs the run method through main directly. + This code is heavily based off of the es_index_listener in snovault. + """ + listener = None + + # Register before app creation. + @atexit.register + def join_listener(): + if listener: + log.debug('joining listening thread') + listener.join() + + # Composite app is used so we can load the main app + app_name = settings.get('app', None) + app = loader.get_app(app_name, global_conf=global_conf) + username = settings.get('username', 'IMPORT') + environ = { + 'HTTP_ACCEPT': 'application/json', + 'REMOTE_USER': username, + } + vapp = VirtualApp(app, environ) + timestamp = datetime.datetime.utcnow().isoformat() + status_holder = { + 'status': { + 'status': 'starting listener', + 'started': timestamp, + 'msgs': [] + }, + } + + def update_status(msg=None, **kw): + """ Method passed to run to update "global" status. """ + # Setting a value in a dictionary is atomic + status = status_holder['status'].copy() + status.update(**kw) # can hold generic info + if msg is not None: + status['msgs'].append(msg) + status_holder['status'] = status + + kwargs = { + 'vapp': vapp, + '_update_status': update_status + } + + # daemon thread that actually executes `run` method to call /index + listener = ErrorHandlingThread(target=run, name='listener', kwargs=kwargs) + listener.daemon = True + log.debug('WSGI Ingestion Listener Started') + listener.start() + + # Register after virtualapp creation. + @atexit.register + def shutdown_listener(): + """ Echo a statement at shutdown """ + log.debug('shutting down listening thread') + + def status_app(environ, start_response): + """ Allows you to get the status of the ingestion "manager". This will be much + more useful once multi-processing is thrown at ingestion. 
+ """ + ignored(environ) + status = '200 OK' + response_headers = [('Content-type', 'application/json')] + start_response(status, response_headers) + return [json.dumps(status_holder['status'])] + + return status_app + + +# Command Application (for waitress) +def main(): + """ Entry point for the local deployment. """ + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. + description='Listen for VCF File uuids to ingest', + epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument('--app-name', help='Pyramid app name in configfile') + parser.add_argument('--username', '-u', default='IMPORT', help='Import username') + parser.add_argument('--dry-run', action='store_true', help='Do not post variants, just validate') + parser.add_argument('config_uri', help="path to configfile") + args = parser.parse_args() + + app = paster.get_app(args.config_uri, args.app_name) + config = { + 'HTTP_ACCEPT': 'application/json', + 'REMOTE_USER': args.username, + } + + vapp = VirtualApp(app, config) + return run(vapp) + + +if __name__ == '__main__': + main() diff --git a/snovault/ingestion/ingestion_listener_base.py b/snovault/ingestion/ingestion_listener_base.py new file mode 100644 index 000000000..35573a4bf --- /dev/null +++ b/snovault/ingestion/ingestion_listener_base.py @@ -0,0 +1,19 @@ +from dcicutils.misc_utils import environ_bool + +STATUS_QUEUED = 'Queued' +STATUS_INGESTED = 'Ingested' +STATUS_DISABLED = 'Ingestion disabled' +STATUS_ERROR = 'Error' +STATUS_IN_PROGRESS = 'In progress' +SHARED = 'shared' +DEBUG_SUBMISSIONS = environ_bool("DEBUG_SUBMISSIONS", default=False) + + +class IngestionListenerBase: + """ + This is the type (of the second argument) expected by the + ingestion_message_handler decorator. In separate file from + ingestion_listener.py to avoid recursive imports via + ingestion_message_handler_decorator.py where this is used. + """ + pass diff --git a/snovault/ingestion/ingestion_message.py b/snovault/ingestion/ingestion_message.py new file mode 100644 index 000000000..108e82b80 --- /dev/null +++ b/snovault/ingestion/ingestion_message.py @@ -0,0 +1,17 @@ +import json + + +class IngestionMessage: + """ + Wrapper for raw ingestion message from SQS. Extracts ingestion type and uuid + for easy/unified access. This is the type (of the first argument) expected by + the ingestion_message_handler decorator. Note that the ingestion type name + string is trimmed and treated as case-insensitive. + """ + def __init__(self, raw_message: dict) -> None: + self.body = json.loads(raw_message["Body"]) or {} + self.uuid = self.body["uuid"] or "" + self.type = self.body.get("ingestion_type", "vcf").strip().lower() + + def is_type(self, value: str) -> bool: + return isinstance(value, str) and self.type == value.lower() diff --git a/snovault/ingestion/ingestion_message_handler_decorator.py b/snovault/ingestion/ingestion_message_handler_decorator.py new file mode 100644 index 000000000..692a39894 --- /dev/null +++ b/snovault/ingestion/ingestion_message_handler_decorator.py @@ -0,0 +1,193 @@ +# Module containing the definition of the @ingestion_message_handler decorator used +# to globally register ingestion message handler functions for specific ingestion +# message types, or a default message handler to handle any message types for which +# no specific handler was registered. Only a single handler may be registered for +# a specified message type, and only a single default handler may be registered. 
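As a small illustration of the `IngestionMessage` wrapper defined above, here is a sketch with a made-up SQS payload (the uuid reuses an example value that appears elsewhere in this patch):

```python
# Minimal sketch: IngestionMessage parses the JSON Body, exposes the uuid,
# and normalizes the ingestion type (trimmed, lower-cased, default "vcf").
from snovault.ingestion.ingestion_message import IngestionMessage

raw_message = {
    "MessageId": "00000000-aaaa-bbbb-cccc-000000000000",   # made-up metadata
    "ReceiptHandle": "opaque-receipt-handle",
    "Body": '{"uuid": "15425d13-01ce-4e61-be5d-cd04401dff29", "ingestion_type": " VCF "}',
}
message = IngestionMessage(raw_message)
assert message.uuid == "15425d13-01ce-4e61-be5d-cd04401dff29"
assert message.type == "vcf"
assert message.is_type("VCF")   # type comparison is case-insensitive
```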
+# Also defined here is a function (call_ingestion_message_handler) to call the +# appropriate registered handler for a given message. + +import inspect +from typing import Union +from dcicutils.misc_utils import ignored, PRINT +from .ingestion_listener_base import IngestionListenerBase +from .ingestion_message import IngestionMessage + + +# Dictionary (by ingestion type) of globally registered ingestion message handlers. +_ingestion_message_handlers = {} + + +def ingestion_message_handler(f=None, *decorator_args, **decorator_kwargs): + """ + Decorator to globally register ingestion message handlers, to be used for example like this: + + @ingestion_message_handler + def your_ingester_message_handler(message: IngestionMessage, listener: IngestionListener): + # Handle your message here; return whatever you like; + # it will be returned in turn by call_ingestion_message_handler. + + Although any function may be annotated with this decorator, at this time and for our purposes + it is expected to have a signature as show in the example above; this IS enforced to some extent. + + In addition, you can pass an ingestion_type argument to the decorator to LIMIT the call of the + decorated handler function to messages with an ingestion type which matches the specified string value. + For example, to define a message handler to be called ONLY for message types which are "vcf": + + @ingestion_message_handler(ingestion_type="vcf") + def your_ingester_message_handler(message: IngestionMessage, listener: IngestionListener): + # Handle your message here; return whatever you like; + # it will be returned in turn by call_ingestion_message_handler. + + Note that ingestion type names are (space-trimmed and) treated as case-insenstive. + + If the ingestion_type is not specified in the decorator, then the registered handler is said + to be the DEFAULT handler and will handle any message type not covered by any other registered + handler. There MUST be exactly ONE handler registered able to handle any expected message type, + otherwise an exception will be thrown (either at handler registration time, i.e. startup; or + at runtime, i.e. if an incoming message is found not to have an associated handler and there is + no default handler; this latter bit is handled by the call_ingestion_message_handler function below). + """ + ignored(decorator_args) + has_decorator_args = True if not callable(f) or f.__name__ == "" else False + ingestion_type = None + + # Sanity check any decorator arguments; currently just the optional ingestion_type. + if has_decorator_args: + if f is not None: + decorator_args = (f, *decorator_args) + if len(decorator_args) + len(decorator_kwargs) > 1: + raise ValueError(f"Invalid @ingestion_message_handler decorator usage (takes at most one argument).") + if len(decorator_args) == 1: + ingestion_type = decorator_args[0] + else: + ingestion_type = decorator_kwargs.get("ingestion_type", decorator_kwargs.get("type")) + if not (ingestion_type is None or isinstance(ingestion_type, str)): + raise ValueError(f"Invalid @ingestion_message_handler decorator usage (argument must be ingestion type string).") + ingestion_type = ingestion_type.strip().lower() + # If ingestion_type is not specified or is "default" this we are registering a default handler. 
+ if ingestion_type == "default": + ingestion_type = None + + def ingestion_message_handler_wrapper(wrapped_function): + + if ingestion_type in _ingestion_message_handlers: + raise ValueError(f"Ingestion message handler already defined for " + f"ingestion message type: {ingestion_type if ingestion_type else ''}") + + # Sanity check the signature of the decorated ingestion message handler function. + # It should contain two arguments with either no type annotations or if present + # then they should be for IngestionMessage and IngestionListenerBase, respectively. + # Return value annotation is not checked. + wrapped_function_signature = inspect.signature(wrapped_function) + if len(wrapped_function_signature.parameters) < 2: + raise ValueError(f"Too few arguments (need two) " + f"for ingestion message handler function: {wrapped_function.__name__}") + if len(wrapped_function_signature.parameters) > 2: + raise ValueError(f"Too many arguments (need two) " + f"for ingestion message handler function: {wrapped_function.__name__}") + parameters = iter(wrapped_function_signature.parameters.items()) + first_parameter = next(parameters) + if first_parameter and len(first_parameter) >= 2: + first_parameter_annotation = first_parameter[1].annotation + if not first_parameter_annotation or (first_parameter_annotation.__name__ != "_empty" and + not issubclass(first_parameter_annotation, IngestionMessage)): + raise ValueError(f"Wrong first argument type (need unspecified or IngestionMessage) " + f"for ingestion message handler function: {wrapped_function.__name__}") + second_parameter = next(parameters) + if second_parameter and len(second_parameter) >= 2: + second_parameter_annotation = second_parameter[1].annotation + if not second_parameter_annotation or (second_parameter_annotation.__name__ != "_empty" and + not issubclass(second_parameter_annotation, IngestionListenerBase)): + raise ValueError(f"Wrong second argument type (need unspecified or IngestionListenerBase) " + f"for ingestion message handler function: {wrapped_function.__name__}") + PRINT(f"Registering ingestion message handler: " + f"{wrapped_function.__name__} (type: {ingestion_type if ingestion_type else ''})") + + def ingestion_message_handler_function(*args, **kwargs): + """ + This is the function called on each actual ingestion message handler call. + """ + ignored(kwargs) + # Check for two arguments of type IngestionMessage and IngestionListenerBase, respectively. + if len(args) != 2: + raise ValueError(f"Wrong number of arguments ({len(args)} passed to " + f"ingestion message handler (expecting two): {wrapped_function.__name__}") + message = args[0] + listener = args[1] + if not isinstance(message, IngestionMessage): + raise ValueError(f"First argument passed to ingestion message handler is " + f"not of type IngestionMessage: {wrapped_function.__name__}") + if not isinstance(listener, IngestionListenerBase): + raise ValueError(f"Second argument passed to ingestion message handler is " + f"not of type IngestionListenerBase: {wrapped_function.__name__}") + # Ensure we should call this handler based on any ingestion_type specified in the decorator. + # Given the current implementation and intended usage (i.e. handlers be specifically associated + # with a given message type, and calling via call_ingestion_message_handler) this should check + # should be unnecessary, though extra check will not hurt; it would only come up if calling a + # registered message handler directly (i.e. not via call_ingestion_message_handler). 
+ PRINT(f"Checking message ({message.uuid}) type ({message.type}) for handler: {wrapped_function.__name__}") + if ingestion_type: + # Here the decorator specified a NON-default ingestion type for this handler; + # check and only call this handler (the wrapped function) if the handler + # ingestion type matches the ingestion message type. + if not message.is_type(ingestion_type): + # Since the ingestion_type specified for the handler decorator does NOT match + # the type of the message, then this message is NOT intended to be processed by + # this handler, it will NOT be called. Again, as mentioned above, this should + # NOT come up if the handler is called via call_ingestion_message_handler. + PRINT(f"Message ({message.uuid}) type ({message.type}) " + f"NOT intended for handler: {wrapped_function.__name__}") + return False + # Here this handler decorator either had no ingestion_type specifier, or it does + # and it matches the ingestion message type, indicating this message IS intended + # to be processed by this handler; we will call it here, returning its value. + PRINT(f"Calling message ({message.uuid}) type ({message.type}) " + f"handler: {wrapped_function.__name__}") + handler_result = wrapped_function(message, listener) + PRINT(f"Called message ({message.uuid}) type ({message.type}) " + f"handler: {wrapped_function.__name__} -> {handler_result}") + return handler_result + + # Register this handler for the ingestion type in our global dictionary; + # already checked above if a handler is already registered for this type. + _ingestion_message_handlers[ingestion_type] = ingestion_message_handler_function + + return ingestion_message_handler_function + + return ingestion_message_handler_wrapper(f) if not has_decorator_args else ingestion_message_handler_wrapper + + +def call_ingestion_message_handler(message: Union[IngestionMessage, dict], listener) -> bool: + """ + Calls the ingestion message handler function globally registered via the + @ingestion_message_handler decorator which corresponding to the TYPE of the given + IngestionMessage, passing it the given IngestionMessage and IngestionListenerBase + as arguments; returns the value returned by the message handler. + + If a message handler has NOT been registered for the given message type AND of NO default + message handler has been registered, then throws and exception. I.e. a specific message handler + MUST be defined for each expected message type OR a DEFAULT message handler must be defined + to handle messages with types which does NOT correspond to any specifically registered handlers. + """ + if not isinstance(message, IngestionMessage): + # For convenience, allow passing a message which is NOT of type IngestionMessage, which we + # will ASSUME in this case is a RAW (dict) message from which we create an IngestionMessage. + message = IngestionMessage(message) + # Get the handler for this message type, or the default handler of none specifically found. + handler = _ingestion_message_handlers.get(message.type, _ingestion_message_handlers.get(None)) + if handler: + return handler(message, listener) + else: + # If NO message handler is registered for the given message type AND if there + # is NO default message handler registered then we regard this as a (runtime) error. + raise RuntimeError(f"No ingestion message handler defined for ingestion message type: {message.type}" + f" -> Message: {message.body}") + + +def clear_ingestion_message_handlers_for_testing(): + """ + Clears all globally registered ingestion message handlers. 
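A minimal usage sketch of the decorator and dispatch function defined above; the handler names and message payload are hypothetical, and the sketch assumes no other handlers (including the portal's real default handler) have already been registered in the process:

```python
from snovault.ingestion.ingestion_listener_base import IngestionListenerBase
from snovault.ingestion.ingestion_message import IngestionMessage
from snovault.ingestion.ingestion_message_handler_decorator import (
    call_ingestion_message_handler,
    ingestion_message_handler,
)

@ingestion_message_handler(ingestion_type="vcf")
def handle_vcf(message: IngestionMessage, listener: IngestionListenerBase):
    return True  # called only for messages whose type normalizes to "vcf"

@ingestion_message_handler  # no ingestion_type, so this is the default handler
def handle_other(message: IngestionMessage, listener: IngestionListenerBase):
    return True

raw = {"Body": '{"uuid": "some-uuid", "ingestion_type": "metadata_bundle"}'}
# No handler is registered for "metadata_bundle", so dispatch falls through to
# the default handler; its return value is passed back to the caller.
assert call_ingestion_message_handler(raw, IngestionListenerBase()) is True
```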
+ This is for TESTING purposes ONLY! + """ + global _ingestion_message_handlers + _ingestion_message_handlers = {} diff --git a/snovault/ingestion/ingestion_message_handler_default.py b/snovault/ingestion/ingestion_message_handler_default.py new file mode 100644 index 000000000..52ddfaf1e --- /dev/null +++ b/snovault/ingestion/ingestion_message_handler_default.py @@ -0,0 +1,87 @@ +import json +import structlog +from dcicutils.misc_utils import ignored, PRINT +from .ingestion_listener import (IngestionListener, DEBUG_SUBMISSIONS) +from ..types.ingestion import SubmissionFolio +from ..util import (debuglog, vapp_for_email, make_s3_client) +from .common import metadata_bundles_bucket +from .ingestion_message import IngestionMessage +from .ingestion_message_handler_decorator import ingestion_message_handler +from .ingestion_processor_decorator import get_ingestion_processor + + +log = structlog.getLogger(__name__) + + +def includeme(config): + pass + + +@ingestion_message_handler(ingestion_type="default") +def ingestion_message_handler_default(message: IngestionMessage, listener: IngestionListener) -> bool: + """ + This is the part of listener.IngestionListener.run function which handles a + single message within the (effectively-infinite) incoming message handling loop, + specifically for non-VCF files; refactored out of ingestion_listener.py February 2023. + Returns True if the message was successfully handled, otherwise False. + """ + + PRINT(f"Default ingestion message handler called for message ({message.uuid}) type: {message.type}") + + # Let's minimally disrupt things for now. We can refactor this later + # to make all the parts work the same -kmp + if listener.INGEST_AS_USER: + try: + debuglog("REQUESTING RESTRICTED PROCESSING:", message.uuid) + process_submission(submission_id=message.uuid, + ingestion_type=message.type, + # bundles_bucket=submission.bucket, + app=listener.vapp.app) + debuglog("RESTRICTED PROCESSING DONE:", message.uuid) + except Exception as e: + log.error(e) + else: + submission = SubmissionFolio(vapp=listener.vapp, ingestion_type=message.type, + submission_id=message.uuid) + handler = get_ingestion_processor(message.type) + try: + debuglog("HANDLING:", message.uuid) + handler(submission) + debuglog("HANDLED:", message.uuid) + except Exception as e: + log.error(e) + # If we suceeded, we don't need to do it again, and if we failed we don't need to fail again. + return True + + +def process_submission(*, submission_id, ingestion_type, app, bundles_bucket=None, s3_client=None): + ignored(s3_client) # we might want to restore the ability to pass this, but no one actually does. 
-kmp 6-Dec-2021 + bundles_bucket = bundles_bucket or metadata_bundles_bucket(app.registry) + s3_client = make_s3_client() + manifest_name = "{id}/manifest.json".format(id=submission_id) + log.warning(f'Processing submission {manifest_name}') + obj = s3_client.get_object(Bucket=bundles_bucket, Key=manifest_name) + # data = json.load(obj)['Body'] + data = json.load(obj['Body']) + email = None + try: + email = data['email'] + except KeyError as e: + ignored(e) + debuglog("Manifest data is missing 'email' field.") + if DEBUG_SUBMISSIONS: + pass + debuglog("processing submission %s with email %s" % (submission_id, email)) + with vapp_for_email(email=email, app=app) as vapp: + if DEBUG_SUBMISSIONS: + PRINT("PROCESSING FOR %s" % email) + submission = SubmissionFolio(vapp=vapp, ingestion_type=ingestion_type, submission_id=submission_id, log=None) + handler = get_ingestion_processor(ingestion_type) + result = handler(submission) + if DEBUG_SUBMISSIONS: + PRINT("DONE PROCESSING FOR %s" % email) + return { + "result": result, + "ingestion_type": ingestion_type, + "submission_id": submission_id, + } diff --git a/snovault/ingestion/ingestion_processor_decorator.py b/snovault/ingestion/ingestion_processor_decorator.py new file mode 100644 index 000000000..529e67930 --- /dev/null +++ b/snovault/ingestion/ingestion_processor_decorator.py @@ -0,0 +1,31 @@ +from ..types.ingestion import IngestionSubmission +from .exceptions import UndefinedIngestionProcessorType + + +_INGESTION_UPLOADERS = {} + + +def ingestion_processor(processor_type): + """ + @ingestion_uploader() is a decorator that declares the upload handler for an ingestion type. + """ + + # Make sure the ingestion type specified for the decorated function is supported by + # our IngestionSubmission type; this info comes from schemas/ingestion_submission.json. + if not IngestionSubmission.supports_type(processor_type): + raise UndefinedIngestionProcessorType(processor_type) + + def ingestion_type_decorator(fn): + if processor_type in _INGESTION_UPLOADERS: + raise RuntimeError(f"Ingestion type {processor_type} is already defined.") + _INGESTION_UPLOADERS[processor_type] = fn + return fn + + return ingestion_type_decorator + + +def get_ingestion_processor(processor_type): + handler = _INGESTION_UPLOADERS.get(processor_type, None) + if not handler: + raise UndefinedIngestionProcessorType(processor_type) + return handler diff --git a/snovault/ingestion/ingestion_processors.py b/snovault/ingestion/ingestion_processors.py new file mode 100644 index 000000000..5ef05e0b4 --- /dev/null +++ b/snovault/ingestion/ingestion_processors.py @@ -0,0 +1,17 @@ +from ..types.ingestion import SubmissionFolio +from .ingestion_processor_decorator import ingestion_processor + + +@ingestion_processor('data_bundle') +def handle_data_bundle(submission: SubmissionFolio): + + # We originally called it 'data_bundle' and we retained that as OK in the schema + # to not upset anyone testing with the old name, but this is not the name to use + # any more, so reject new submissions of this kind. -kmp 27-Aug-2020 + + with submission.processing_context(): + + raise RuntimeError("handle_data_bundle was called (for ingestion_type=%s). This is always an error." + " The ingestion_type 'data_bundle' was renamed to 'metadata_bundle'" + " prior to the initial release. Your submission program probably needs to be updated." 
+ % submission.ingestion_type) diff --git a/snovault/ingestion/queue_utils.py b/snovault/ingestion/queue_utils.py new file mode 100644 index 000000000..3f0e939ef --- /dev/null +++ b/snovault/ingestion/queue_utils.py @@ -0,0 +1,208 @@ +import time +import json +import socket +import boto3 +import structlog +import datetime + + +log = structlog.getLogger(__name__) + + +class IngestionQueueManager: + """ + Similar to QueueManager in snovault in that in manages SQS queues, but that code is not generic + enough to use here, so it is "duplicated" so to speak here. At a later time the functionality of this + class and QueueManager should be refactored into a "helper" class, but for now this is sufficient + and is tested independently here. + + We will use a single queue to keep track of File uuids to be indexed. This used to manage only VCFs + but now the Ingestion functionality is generic and can be extended to arbitrary processing on + any type. + """ + QUEUE_NAME_EXTENSION = '-ingestion-queue' # XXX: breaking change, matches 4dn-cloud-infra resources + + def __init__(self, registry, override_name=None): + """ Does initial setup for interacting with SQS """ + self.batch_size = 1 # NOTE: this value is important because we don't want to block other jobs + self.env_name = registry.settings.get('env.name', None) + if not self.env_name: # replace with something usable + backup = socket.gethostname()[:80].replace('.', '-') + self.env_name = backup if backup else 'cgap-backup' + kwargs = { + 'region_name': 'us-east-1' + } + self.client = boto3.client('sqs', **kwargs) + self.queue_name = override_name or (self.env_name + self.QUEUE_NAME_EXTENSION) + self.queue_attrs = { + self.queue_name: { + 'DelaySeconds': '1', # messages initially invisible for 1 sec + 'VisibilityTimeout': '10800', # 3 hours + 'MessageRetentionPeriod': '604800', # 7 days, in seconds + 'ReceiveMessageWaitTimeSeconds': '5', # 5 seconds of long polling + } + } + self.queue_url = self._initialize() + + def _initialize(self): + """ Initializes the actual queue - helper method for init """ + try: + response = self.client.create_queue( + QueueName=self.queue_name, + Attributes=self.queue_attrs[self.queue_name] + ) + queue_url = response['QueueUrl'] + except self.client.exceptions.QueueNameExists: + queue_url = self._get_queue_url(self.queue_name) + except Exception as e: + log.error('Error while attempting to create queue: %s' % e) + queue_url = self._get_queue_url(self.queue_name) + return queue_url + + def _get_queue_url(self, queue_name): + """ + Simple function that returns url of associated queue name + """ + try: + response = self.client.get_queue_url( + QueueName=queue_name + ) + except Exception as e: + log.error('Cannot resolve queue_url: %s' % e) + response = {} + return response.get('QueueUrl', None) + + def _chunk_messages(self, msgs): + """ Chunks messages into self.send_batch_size batches (for efficiency). + + :param msgs: list of messages to be chunked + """ + for i in range(0, len(msgs), self.batch_size): + yield msgs[i:i + self.batch_size] + + def _send_messages(self, msgs, retries=3): + """ Sends msgs to the ingestion queue (with retries for failed messages). 
+ + :param msgs: to be sent + :param retries: number of times to resend failed messages, decremented on recursion + :return: list of any failed messages + """ + failed = [] + for msg_batch in self._chunk_messages(msgs): + log.info('Trying to chunk messages: %s' % msgs) + entries = [] + for msg in msg_batch: + entries.append({ + 'Id': str(int(time.time() * 1000000)), + 'MessageBody': json.dumps(msg) + }) + response = self.client.send_message_batch( + QueueUrl=self.queue_url, + Entries=entries + ) + failed_messages = response.get('Failed', []) + + # attempt resend of failed messages + if failed_messages and retries > 0: + msgs_to_retry = [] + for failed_message in failed_messages: + fail_id = failed_message.get('Id') + msgs_to_retry.extend([json.loads(ent['MessageBody']) for ent in entries if ent['Id'] == fail_id]) + if msgs_to_retry: + failed_messages = self._send_messages(msgs_to_retry, retries=retries - 1) + failed.extend(failed_messages) + return failed + + def delete_messages(self, messages): + """ + Called after a message has been successfully received and processed. + Removes message from the queue. + Input should be the messages directly from receive messages. At the + very least, needs a list of messages with 'Id' and 'ReceiptHandle' as this + metadata is necessary to identify the message in SQS internals. + + NOTE: deletion does NOT have a retry mechanism + + :param messages: messages to be deleted + :returns: a list with any failed messages + """ + failed = [] + for batch in self._chunk_messages(messages): + # need to change message format, since deleting takes slightly + # different fields what's return from receiving + for i in range(len(batch)): + to_delete = { + 'Id': batch[i]['MessageId'], + 'ReceiptHandle': batch[i]['ReceiptHandle'] + } + batch[i] = to_delete + response = self.client.delete_message_batch( + QueueUrl=self.queue_url, + Entries=batch + ) + failed.extend(response.get('Failed', [])) + return failed + + def add_uuids(self, uuids, ingestion_type='vcf'): + """ Takes a list of string uuids and adds them to the ingestion queue. + If ingestion_type is not specified, it defaults to 'vcf'. + + :precondition: uuids are all of type FileProcessed + :param uuids: uuids to be added to the queue. 
+ :param ingestion_type: the ingestion type of the uuids (default 'vcf' for legacy reasons) + :returns: 2-tuple: uuids queued, failed messages (if any) + """ + curr_time = datetime.datetime.utcnow().isoformat() + msgs = [] + for uuid in uuids: + current_msg = { + 'ingestion_type': ingestion_type, + 'uuid': uuid, + 'timestamp': curr_time + } + msgs.append(current_msg) + failed = self._send_messages(msgs) + return uuids, failed + + def get_counts(self): + """ Returns number counts of waiting/inflight messages + * Makes a boto3 API Call to do so * + + :returns: 2 tuple of waiting, inflight messages + """ + response = self.client.get_queue_attributes( + QueueUrl=self.queue_url, + AttributeNames=[ + 'ApproximateNumberOfMessages', + 'ApproximateNumberOfMessagesNotVisible' + ] + ) + formatted = { + 'waiting': response.get('Attributes', {}).get('ApproximateNumberOfMessages'), + 'inflight': response.get('Attributes', {}).get('ApproximateNumberOfMessagesNotVisible') + } + return formatted['waiting'], formatted['inflight'] + + def receive_messages(self, batch_size=None): + """ Returns an array of messages, if any that are waiting + + :param batch_size: an integer number of messages + :returns: messages received or [] if no messages were ready to be received + """ + response = self.client.receive_message( + QueueUrl=self.queue_url, + MaxNumberOfMessages=self.batch_size if batch_size is None else batch_size + ) + return response.get('Messages', []) + + def clear_queue(self): + """ Clears the queue by receiving all messages. BE CAREFUL as this has potential to + infinite loop under certain conditions. This risk is preferred to using 'purge', which + has a long timeout. The guarantees this functions provides are minimal at best - it should + really only be used in testing. + """ + while True: + messages = self.receive_messages() + self.delete_messages(messages) + if len(messages) == 0: + break diff --git a/snovault/loadxl.py b/snovault/loadxl.py new file mode 100644 index 000000000..f5ffc512d --- /dev/null +++ b/snovault/loadxl.py @@ -0,0 +1,771 @@ +# -*- coding: utf-8 -*- +"""Load collections and determine the order.""" + +import gzip +import json +import magic +import mimetypes +import os +import re +import structlog +from typing import Union +import webtest +import traceback +import uuid + +from base64 import b64encode +from dcicutils.misc_utils import ignored, environ_bool, VirtualApp +from dcicutils.secrets_utils import assume_identity +from PIL import Image +from pyramid.paster import get_app +from pyramid.response import Response +from pyramid.view import view_config +from snovault.util import debug_log + +from .project_app import app_project +from .server_defaults import add_last_modified + + +text = type(u'') +logger = structlog.getLogger(__name__) + + +def includeme(config): + # provide an endpoint to do bulk uploading that just uses loadxl + config.add_route('load_data', '/load_data') + config.scan(__name__) + + +# order of items references with linkTo in a field in 'required' in schemas +# This should be set by the downstream application +ORDER = app_project().loadxl_order() + +IS_ATTACHMENT = [ + 'attachment', + 'file_format_specification', +] + + +# This uuid should be constant across all portals +LOADXL_USER_UUID = "3202fd57-44d2-44fb-a131-afb1e43d8ae5" + + +class LoadGenWrapper(object): + """ + Simple class that accepts a generator function and handles errors by + setting self.caught to the error message. 
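Tying together the `IngestionQueueManager` methods above, a minimal producer-side sketch (the `registry` comes from the Pyramid app; the uuid and `log` are illustrative stand-ins):

```python
# Minimal sketch of queueing work for the ingestion listener: add_uuids()
# returns the uuids queued plus any messages that could not be sent, and
# get_counts() reports waiting/in-flight totals from SQS.
queue_manager = IngestionQueueManager(registry)
queued, failed = queue_manager.add_uuids(
    ['15425d13-01ce-4e61-be5d-cd04401dff29'],   # FileProcessed uuids
    ingestion_type='vcf',
)
if failed:
    log.error('Could not queue all uuids: %s' % failed)
waiting, inflight = queue_manager.get_counts()
```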
+ """ + def __init__(self, gen): + self.gen = gen + self.caught = None + + def __iter__(self): + """ + Iterate through self.gen and see if 'ERROR: ' bytes are in any yielded + value. If so, store the error message as self.caught and raise + StopIteration to halt the generator. + """ + # self.caught = yield from self.gen + for iter_val in self.gen: + if b'ERROR:' in iter_val: + self.caught = iter_val.decode() + yield iter_val + + def close(self): + if self.caught: + logger.error('load_data: failed to load with iter_response', error=self.caught) + + +@view_config(route_name='load_data', request_method='POST', permission='add') +@debug_log +def load_data_view(context, request): + """ + expected input data + + {'local_path': path to a directory or file in file system + 'fdn_dir': inserts folder under encoded + 'store': if not local_path or fdn_dir, look for a dictionary of items here + 'overwrite' (Bool): overwrite if existing data + 'itype': (list or str): only pick some types from the source or specify type in in_file + 'iter_response': invoke the Response as an app_iter, directly calling load_all_gen + 'config_uri': user supplied configuration file} + + post can contain 2 different styles of data + 1) reference to a folder or file (local_path or fd_dir). If this is done + itype can be optionally used to specify type of items loaded from files + 2) store in form of {'item_type': [items], 'item_type2': [items]} + item_type should be same as insert file names i.e. file_fastq + """ + ignored(context) + # this is a bit weird but want to reuse load_data functionality so I'm rolling with it + config_uri = request.json.get('config_uri', 'production.ini') + patch_only = request.json.get('patch_only', False) + post_only = request.json.get('post_only', False) + app = get_app(config_uri, 'app') + environ = {'HTTP_ACCEPT': 'application/json', 'REMOTE_USER': 'TEST'} + testapp = webtest.TestApp(app, environ) + # expected response + request.response.status = 200 + result = { + 'status': 'success', + '@type': ['result'], + } + store = request.json.get('store', {}) + local_path = request.json.get('local_path') + fdn_dir = request.json.get('fdn_dir') + overwrite = request.json.get('overwrite', False) + itype = request.json.get('itype') + iter_resp = request.json.get('iter_response', False) + inserts = None + from_json = False + if fdn_dir: + inserts = app_project().project_filename('tests/data/' + fdn_dir + '/') + elif local_path: + inserts = local_path + elif store: + inserts = store + from_json = True + # if we want to iterate over the response to keep the connection alive + # this directly calls load_all_gen, instead of load_all + if iter_resp: + return Response( + content_type='text/plain', + app_iter=LoadGenWrapper( + load_all_gen(testapp, inserts, None, overwrite=overwrite, itype=itype, + from_json=from_json, patch_only=patch_only, post_only=post_only) + ) + ) + # otherwise, it is a regular view and we can call load_all as usual + if inserts: + res = load_all(testapp, inserts, None, overwrite=overwrite, itype=itype, from_json=from_json) + else: + res = 'No uploadable content found!' 
+ + if res: # None if load_all is successful + print(LOAD_ERROR_MESSAGE) + request.response.status = 422 + result['status'] = 'error' + result['@graph'] = str(res) + return result + + +def trim(value): + """Shorten excessively long fields in error log.""" + if isinstance(value, dict): + return {k: trim(v) for k, v in value.items()} + if isinstance(value, list): + return [trim(v) for v in value] + if isinstance(value, str) and len(value) > 160: + return value[:77] + '...' + value[-80:] + return value + + +def find_doc(docsdir, filename): + """tries to find the file, if not returns false.""" + path = None + if not docsdir: + return + for dirpath in docsdir: + candidate = os.path.join(dirpath, filename) + if not os.path.exists(candidate): + continue + if path is not None: + msg = 'Duplicate filenames: %s, %s' % (path, candidate) + raise ValueError(msg) + path = candidate + if path is None: + return + return path + + +def attachment(path): + """Create an attachment upload object from a filename Embeds the attachment as a data url.""" + filename = os.path.basename(path) + mime_type, encoding = mimetypes.guess_type(path) + major, minor = mime_type.split('/') + try: + detected_type = magic.from_file(path, mime=True).decode('ascii') + except AttributeError: + detected_type = magic.from_file(path, mime=True) + # XXX This validation logic should move server-side. + if not (detected_type == mime_type or + detected_type == 'text/plain' and major == 'text'): + raise ValueError('Wrong extension for %s: %s' % (detected_type, filename)) + with open(path, 'rb') as stream: + attach = {'download': filename, + 'type': mime_type, + 'href': 'data:%s;base64,%s' % (mime_type, b64encode(stream.read()).decode('ascii'))} + if mime_type in ('application/pdf', "application/zip", 'text/plain', + 'text/tab-separated-values', 'text/html', 'application/msword', 'application/vnd.ms-excel', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'): + # XXX Should use chardet to detect charset for text files here. 
+ return attach + if major == 'image' and minor in ('png', 'jpeg', 'gif', 'tiff'): + # XXX we should just convert our tiffs to pngs + stream.seek(0, 0) + im = Image.open(stream) + im.verify() + if im.format != minor.upper(): + msg = "Image file format %r does not match extension for %s" + raise ValueError(msg % (im.format, filename)) + attach['width'], attach['height'] = im.size + return attach + raise ValueError("Unknown file type for %s" % filename) + + +def format_for_attachment(json_data, docsdir): + for field in IS_ATTACHMENT: + if field in json_data: + if isinstance(json_data[field], dict): + pass + elif isinstance(json_data[field], str): + path = find_doc(docsdir, json_data[field]) + if not path: + del json_data[field] + logger.error('Removing {} form {}, expecting path'.format(field, json_data['uuid'])) + else: + json_data[field] = attachment(path) + else: + # malformatted attachment + del json_data[field] + logger.error('Removing {} form {}, expecting path'.format(field, json_data['uuid'])) + return json_data + + +LOAD_ERROR_MESSAGE = """# ██▓ ▒█████ ▄▄▄ ▓█████▄ ██▓ ███▄ █ ▄████ +# ▓██▒ ▒██▒ ██▒▒████▄ ▒██▀ ██▌▓██▒ ██ ▀█ █ ██▒ ▀█▒ +# ▒██░ ▒██░ ██▒▒██ ▀█▄ ░██ █▌▒██▒▓██ ▀█ ██▒▒██░▄▄▄░ +# ▒██░ ▒██ ██░░██▄▄▄▄██ ░▓█▄ ▌░██░▓██▒ ▐▌██▒░▓█ ██▓ +# ░██████▒░ ████▓▒░ ▓█ ▓██▒░▒████▓ ░██░▒██░ ▓██░░▒▓███▀▒ +# ░ ▒░▓ ░░ ▒░▒░▒░ ▒▒ ▓▒█░ ▒▒▓ ▒ ░▓ ░ ▒░ ▒ ▒ ░▒ ▒ +# ░ ░ ▒ ░ ░ ▒ ▒░ ▒ ▒▒ ░ ░ ▒ ▒ ▒ ░░ ░░ ░ ▒░ ░ ░ +# ░ ░ ░ ░ ░ ▒ ░ ▒ ░ ░ ░ ▒ ░ ░ ░ ░ ░ ░ ░ +# ░ ░ ░ ░ ░ ░ ░ ░ ░ ░ +# ░ +# ██▓ ███▄ █ ██████ ▓█████ ██▀███ ▄▄▄█████▓ ██████ +# ▓██▒ ██ ▀█ █ ▒██ ▒ ▓█ ▀ ▓██ ▒ ██▒▓ ██▒ ▓▒▒██ ▒ +# ▒██▒▓██ ▀█ ██▒░ ▓██▄ ▒███ ▓██ ░▄█ ▒▒ ▓██░ ▒░░ ▓██▄ +# ░██░▓██▒ ▐▌██▒ ▒ ██▒▒▓█ ▄ ▒██▀▀█▄ ░ ▓██▓ ░ ▒ ██▒ +# ░██░▒██░ ▓██░▒██████▒▒░▒████▒░██▓ ▒██▒ ▒██▒ ░ ▒██████▒▒ +# ░▓ ░ ▒░ ▒ ▒ ▒ ▒▓▒ ▒ ░░░ ▒░ ░░ ▒▓ ░▒▓░ ▒ ░░ ▒ ▒▓▒ ▒ ░ +# ▒ ░░ ░░ ░ ▒░░ ░▒ ░ ░ ░ ░ ░ ░▒ ░ ▒░ ░ ░ ░▒ ░ ░ +# ▒ ░ ░ ░ ░ ░ ░ ░ ░ ░░ ░ ░ ░ ░ ░ +# ░ ░ ░ ░ ░ ░ ░ +# +# █████▒▄▄▄ ██▓ ██▓ ▓█████ ▓█████▄ +# ▓██ ▒▒████▄ ▓██▒▓██▒ ▓█ ▀ ▒██▀ ██▌ +# ▒████ ░▒██ ▀█▄ ▒██▒▒██░ ▒███ ░██ █▌ +# ░▓█▒ ░░██▄▄▄▄██ ░██░▒██░ ▒▓█ ▄ ░▓█▄ ▌ +# ░▒█░ ▓█ ▓██▒░██░░██████▒░▒████▒░▒████▓ +# ▒ ░ ▒▒ ▓▒█░░▓ ░ ▒░▓ ░░░ ▒░ ░ ▒▒▓ ▒ +# ░ ▒ ▒▒ ░ ▒ ░░ ░ ▒ ░ ░ ░ ░ ░ ▒ ▒ +# ░ ░ ░ ▒ ▒ ░ ░ ░ ░ ░ ░ ░ +# ░ ░ ░ ░ ░ ░ ░ ░ +# ░ """ + + +def load_all(testapp, inserts, docsdir, overwrite=True, itype=None, from_json=False, patch_only=False, post_only=False, + skip_types=None): + """ + Wrapper function for load_all_gen, which invokes the generator returned + from that function. Takes all of the same args as load_all_gen, so + please reference that docstring. + + This function uses LoadGenWrapper, which will catch a returned value from + the execution of the generator, which is an Exception in the case of + load_all_gen. Return that Exception if encountered, which is consistent + with the functionality of load_all_gen. + """ + gen = LoadGenWrapper( + load_all_gen(testapp, inserts, docsdir, overwrite, itype, from_json, patch_only, post_only, skip_types) + ) + # run the generator; don't worry about the output + for _ in gen: + pass + # gen.caught is None for success and an error message on failure + if gen.caught is None: + return None + else: + return Exception(gen.caught) + + +LOADXL_ALLOW_NONE = environ_bool("LOADXL_ALLOW_NONE", default=True) + + +def load_all_gen(testapp, inserts, docsdir, overwrite=True, itype=None, from_json=False, + patch_only=False, post_only=False, skip_types=None, validate_only=False): + """ + Generator function that yields bytes information about each item POSTed/PATCHed. 
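A minimal sketch of consuming `load_all_gen` directly (here `testapp` is a webtest.TestApp and `store` is an in-memory inserts dict, both assumed to exist); each yielded value is a byte string in the form produced by the yield statements further down in this function:

```python
# load_all_gen yields byte strings such as b'POST: <uuid>\n', b'PATCH: <uuid>\n',
# b'SKIP: <uuid>\n' or b'ERROR: <message>\n'; an ERROR line ends the run.
for line in load_all_gen(testapp, store, None, overwrite=True, from_json=True):
    text_line = line.decode()
    if text_line.startswith('ERROR:'):
        raise RuntimeError(text_line)
    print(text_line.rstrip())
```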
+ Is the base functionality of load_all function. + + convert data to store format dictionary (same format expected from from_json=True), + assume main function is to load reasonable number of inserts from a folder + + Args: + testapp + inserts : either a folder, file, or a dictionary in the store format + docsdir : attachment folder + overwrite (bool) : if the database contains the item already, skip or patch + itype (list or str): limit selection to certain type/types + from_json (bool) : if set to true, inserts should be dict instead of folder name + patch_only (bool) : if set to true will only do second round patch - no posts + post_only (bool) : if set to true posts full item no second round or lookup - + use with care - will not work if linkTos to items not in db yet + skip_types (list) : if set to a list of item files the process will ignore these files + Yields: + Bytes with information on POSTed/PATCHed items + + Returns: + None if successful, otherwise a bytes error message + """ + if docsdir is None: + docsdir = [] + # Collect Items + store = {} + if from_json: # we are directly loading json + store = inserts + if not from_json: # we are loading a file + use_itype = False + if os.path.isdir(inserts): # we've specified a directory + if not inserts.endswith('/'): + inserts += '/' + files = [i for i in os.listdir(inserts) if (i.endswith('.json') or i.endswith('.json.gz')) + and (i not in skip_types if skip_types else True)] + elif os.path.isfile(inserts): # we've specified a single file + files = [inserts] + # use the item type if provided AND not a list + # otherwise guess from the filename + use_itype = True if (itype and isinstance(itype, str)) else False + else: # cannot get the file + err_msg = 'Failure loading inserts from %s. Could not find matching file or directory.' 
% inserts + print(err_msg) + yield str.encode('ERROR: %s\n' % err_msg) + return + # raise StopIteration + # load from the directory/file + for a_file in files: + if use_itype: + item_type = itype + else: + item_type = a_file.split('/')[-1].split(".")[0] + a_file = inserts + a_file + store[item_type] = get_json_file_content(a_file) + + # if there is a defined set of items, subtract the rest + if itype: + if isinstance(itype, list): + store = {i: store[i] for i in itype if i in store} + else: + store = {itype: store.get(itype, [])} + # clear empty values + store = {k: v for k, v in store.items() if v is not None} + if not store: + if LOADXL_ALLOW_NONE: + return + if from_json: + err_msg = 'No items found in input "store" json' + else: + err_msg = 'No items found in %s' % inserts + if itype: + err_msg += ' for item type(s) %s' % itype + print(err_msg) + yield str.encode('ERROR: %s' % err_msg) + return + # raise StopIteration + # order Items + all_types = list(store.keys()) + for ref_item in reversed(ORDER): + if ref_item in all_types: + all_types.insert(0, all_types.pop(all_types.index(ref_item))) + # collect schemas + profiles = testapp.get('/profiles/?frame=raw').json + + # run step1 - if item does not exist, post with minimal metadata (and skip indexing since we will patch + # in round 2) + second_round_items = {} + if not patch_only: + for a_type in all_types: + first_fields = [] + if not post_only: + # this conversion of schema name to object type works for all existing schemas at the moment + obj_type = "".join([i.title() for i in a_type.split('_')]) + # minimal schema + schema_info = profiles[obj_type] + req_fields = schema_info.get('required', []) + ids = schema_info.get('identifyingProperties', []) + # some schemas did not include aliases + if 'aliases' not in ids: + ids.append('aliases') + # file format is required for files, but its usability depends this field + if a_type in ['file_format', 'experiment_type']: + req_fields.append('valid_item_types') + first_fields = list(set(req_fields + ids)) + skip_existing_items = set() + posted = 0 + skip_exist = 0 + for an_item in store[a_type]: + exists = False + if not post_only: + try: + # 301 because @id is the existing item path, not uuid + testapp.get('/'+an_item['uuid'], status=[200, 301]) + exists = True + except Exception: + pass + # skip the items that exists + # if overwrite=True, still include them in PATCH round + if exists: + skip_exist += 1 + if not overwrite: + skip_existing_items.add(an_item['uuid']) + yield str.encode('SKIP: %s\n' % an_item['uuid']) + else: + if post_only: + to_post = an_item + else: + to_post = {key: value for (key, value) in an_item.items() if key in first_fields} + post_request = f'/{a_type}?skip_indexing=true' + if validate_only: + post_request += '&check_only=true' + to_post = format_for_attachment(to_post, docsdir) + try: + res = testapp.post_json(post_request, to_post) # skip indexing in round 1 + if not validate_only: + assert res.status_code == 201 + posted += 1 + # yield bytes to work with Response.app_iter + yield str.encode('POST: %s\n' % res.json['@graph'][0]['uuid']) + else: + assert res.status_code == 200 + yield str.encode('CHECK: %s\n' % an_item['uuid']) + except Exception as e: + print('Posting {} failed. 
Post body:\n{}\nError Message:{}' + ''.format(a_type, str(first_fields), str(e))) + # remove newlines from error, since they mess with generator output + e_str = str(e).replace('\n', '') + yield str.encode('ERROR: %s\n' % e_str) + return + # raise StopIteration + if not validate_only: + if not post_only: + second_round_items[a_type] = [i for i in store[a_type] if i['uuid'] not in skip_existing_items] + else: + second_round_items[a_type] = [] + logger.info('{} 1st: {} items posted, {} items exists.'.format(a_type, posted, skip_exist)) + logger.info('{} 1st: {} items will be patched in second round' + .format(a_type, str(len(second_round_items.get(a_type, []))))) + elif overwrite and not post_only: + logger.info('Posting round skipped') + for a_type in all_types: + second_round_items[a_type] = [i for i in store[a_type]] + logger.info('{}: {} items will be patched in second round' + .format(a_type, str(len(second_round_items.get(a_type, []))))) + + # Round II - patch the rest of the metadata (ensuring to index by not passing the query param) + rnd = ' 2nd' if not patch_only else '' + for a_type in all_types: + patched = 0 + if not second_round_items[a_type]: + logger.info('{}{}: no items to patch'.format(a_type, rnd)) + continue + for an_item in second_round_items[a_type]: + an_item = format_for_attachment(an_item, docsdir) + try: + add_last_modified(an_item, userid=LOADXL_USER_UUID) + res = testapp.patch_json('/'+an_item['uuid'], an_item) + assert res.status_code == 200 + patched += 1 + # yield bytes to work with Response.app_iter + yield str.encode('PATCH: %s\n' % an_item['uuid']) + except Exception as e: + print('Patching {} failed. Patch body:\n{}\n\nError Message:\n{}'.format( + a_type, str(an_item), str(e))) + print('Full error: %s' % traceback.format_exc()) + e_str = str(e).replace('\n', '') + yield str.encode('ERROR: %s\n' % e_str) + return + # raise StopIteration + logger.info('{}{}: {} items patched .'.format(a_type, rnd, patched)) + + # explicit return upon finish + return None + + +def get_json_file_content(filename): + """ + Helper function to obtain objects from (compressed) json files. + + :param filename: str file path + :returns: object loaded from file + """ + if filename.endswith(".json"): + with open(filename) as f: + result = json.loads(f.read()) + elif filename.endswith(".json.gz"): + with gzip.open(filename) as f: + result = json.loads(f.read()) + else: + raise Exception("Expecting a .json or .json.gz file but found %s." % filename) + return result + + +def load_data(app, indir='inserts', docsdir=None, overwrite=False, + use_master_inserts=True, skip_types=None): + """ + This function will take the inserts folder as input, and place them to the given environment. 
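A minimal sketch of calling `load_data`, assuming `app` is a configured Pyramid application object for the portal:

```python
# Load tests/data/inserts/ plus master-inserts, resolving attachment documents
# from tests/data/documents/; a non-None return value is the error encountered.
err = load_data(app, indir='inserts', docsdir='documents', overwrite=False)
if err is not None:
    raise RuntimeError('insert loading failed: %s' % err)
```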
+ args: + app: + indir (inserts): inserts folder, should be relative to tests/data/ + docsdir (None): folder with attachment documents, relative to tests/data + """ + environ = { + 'HTTP_ACCEPT': 'application/json', + 'REMOTE_USER': 'TEST', + } + testapp = webtest.TestApp(app, environ) + # load master-inserts by default + if indir != 'master-inserts' and use_master_inserts: + master_inserts = app_project().project_filename('tests/data/master-inserts/') + master_res = load_all(testapp, master_inserts, [], skip_types=skip_types) + if master_res: # None if successful + print(LOAD_ERROR_MESSAGE) + logger.error('load_data: failed to load from %s' % master_inserts, error=master_res) + return master_res + + if not indir.endswith('/'): + indir += '/' + inserts = app_project().project_filename('tests/data/' + indir) + if docsdir is None: + docsdir = [] + else: + if not docsdir.endswith('/'): + docsdir += '/' + docsdir = [app_project().project_filename('tests/data/' + docsdir)] + res = load_all(testapp, inserts, docsdir, overwrite=overwrite) + if res: # None if successful + print(LOAD_ERROR_MESSAGE) + logger.error('load_data: failed to load from %s' % docsdir, error=res) + return res + return None # unnecessary, but makes it more clear that no error was encountered + + +def load_test_data(app, overwrite=False): + """ + Load inserts and master-inserts + + Returns: + None if successful, otherwise Exception encountered + """ + return load_data(app, docsdir='documents', indir='inserts', + overwrite=overwrite) + + +def load_local_data(app, overwrite=False): + """ + Load inserts from temporary insert folders, if present and populated + with .json insert files. + If not present, load inserts and master-inserts. + + Returns: + None if successful, otherwise Exception encountered + """ + + test_insert_dirs = [ + 'temp-local-inserts', + 'demo_inserts' + ] + + for test_insert_dir in test_insert_dirs: + chk_dir = app_project().project_filename("tests/data/" + test_insert_dir) + for (dirpath, dirnames, filenames) in os.walk(chk_dir): + if any([fn for fn in filenames if fn.endswith('.json') or fn.endswith('.json.gz')]): + logger.info('Loading inserts from "{}" directory.'.format(test_insert_dir)) + return load_data(app, docsdir='documents', indir=test_insert_dir, use_master_inserts=True, + overwrite=overwrite) + + # Default to 'inserts' if no temp inserts found. + return load_data(app, docsdir='documents', indir='inserts', use_master_inserts=True, overwrite=overwrite) + + +def load_prod_data(app, overwrite=False): + """ + Load master-inserts + + Returns: + None if successful, otherwise Exception encountered + """ + return load_data(app, indir='master-inserts', overwrite=overwrite) + + +def load_deploy_data(app, overwrite=True, **kwargs): + """ + Load deploy-inserts and master-inserts. Overwrites duplicate items + in both directories to match deploy-inserts version. 
+ + Returns: + None if successful, otherwise Exception encountered + """ + return load_data(app, docsdir='documents', indir="deploy-inserts", overwrite=True) + + +# Set of emails required by the application to function +REQUIRED_USER_CONFIG = [ + { + 'email': 'loadxl@hms.harvard.edu', + 'first_name': 'loadxl', + 'last_name': 'loadxl', + 'uuid': '3202fd57-44d2-44fb-a131-afb1e43d8ae5' + }, + { + 'email': 'cgap.platform@gmail.com', + 'first_name': 'Platform', + 'last_name': 'Admin', + 'uuid': 'b5f738b6-455a-42e5-bc1c-77fbfd9b15d2' + }, + { + 'email': 'foursight.app@gmail.com', + 'first_name': 'Foursight', + 'last_name': 'App', + 'uuid': '7677f8a8-79d2-4cff-ab0a-a967a2a68e39' + }, + { + 'email': 'tibanna.app@gmail.com', + 'first_name': 'Tibanna', + 'last_name': 'App', + 'uuid': 'b041dba8-e2b2-4e54-a621-97edb508a0c4' + }, +] + + +def load_custom_data(app, overwrite=False): + """ + Load deploy-inserts and master-inserts, EXCEPT instead of loading the default user.json, + generate users (if they do not already exist) from the ENCODED_ADMIN_USERS setting in + the GAC. We assume it has structure consistent with what the template will build in 4dn-cloud-infra + ie: + [{"first_name": "John", "last_name": "Doe", "email": "john_doe@example.com"}] + """ + # start with the users + environ = { + 'HTTP_ACCEPT': 'application/json', + 'REMOTE_USER': 'TEST', + } + testapp = webtest.TestApp(app, environ) + identity = assume_identity() + admin_users = json.loads(identity.get('ENCODED_ADMIN_USERS', '{}')) + if not admin_users: # we assume you must have set one of these + print(LOAD_ERROR_MESSAGE) + logger.error('load_custom_data: failed to load users as none were set - ensure GAC value' + ' ENCODED_ADMIN_USERS is set and formatted correctly!') + return admin_users + + # post all users + for user in (admin_users + REQUIRED_USER_CONFIG): + try: + first_name, last_name, email, _uuid = (user['first_name'], user['last_name'], user['email'], + user.get('uuid', str(uuid.uuid4()))) + except KeyError: + print(LOAD_ERROR_MESSAGE) + logger.error('load_custom_data: failed to load users as they were malformed - ensure GAC value' + ' ENCODED_ADMIN_USERS is set, has type array and consists of objects all containing keys' + ' and values for first_name, last_name and email!') + return user + item = { + 'first_name': first_name, + 'last_name': last_name, + 'email': email, + 'groups': ['admin'], + 'uuid': _uuid + } + testapp.post_json('/User', item, status=201) + + res = load_data(app, docsdir='documents', indir='deploy-inserts', overwrite=overwrite, skip_types=['user.json']) + if res: # None if successful + print(LOAD_ERROR_MESSAGE) + logger.error('load_custom_data: failed to load from deploy-inserts', error=res) + return res + + return None + + +def load_cypress_data(app, overwrite=False): + """ + Load master-inserts and cypress-test-inserts. + By default, does not overwrite duplicate items in both directories + + Returns: + None if successful, otherwise Exception encountered + """ + return load_data(app, indir='cypress-test-inserts', overwrite=overwrite) + + +def load_data_by_type(app, indir='master-inserts', overwrite=True, itype=None): + """ + This function will load inserts of type itype from the indir directory. + args: + indir (inserts): inserts folder, should be relative to tests/data/ + itype: item type to load (e.g. "higlass_view_config") + """ + + if itype is None: + print('load_data_by_type: No item type specified. 
Not loading anything.') + return + + environ = { + 'HTTP_ACCEPT': 'application/json', + 'REMOTE_USER': 'TEST', + } + testapp = webtest.TestApp(app, environ) + + if not indir.endswith('/'): + indir += '/' + inserts = app_project().project_filename('tests/data/' + indir) + + res = load_all(testapp, inserts, docsdir=[], overwrite=overwrite, itype=itype) + if res: # None if successful + print(LOAD_ERROR_MESSAGE) + logger.error('load_data_by_type: failed to load from %s' % indir, error=res) + return res + return None # unnecessary, but makes it more clear that no error was encountered + + +def load_data_via_ingester(vapp: VirtualApp, + ontology: dict, + itype: Union[str, list] = ["ontology", "ontology_term"], + validate_only: bool = False) -> dict: + """ + Entry point for call from encoded.ingester.processors.handle_ontology_update (2023-03-08). + Returns dictionary itemizing the created (post), updated (patch), skipped (skip), checked (check), + and errored (error) ontology term uuids; as well as a count of the number of unique uuids processed; + the checked category is for validate_only; + """ + response = load_all_gen(vapp, ontology, None, overwrite=True, itype=itype, + from_json=True, patch_only=False, validate_only=validate_only) + results = {"post": [], "patch": [], "skip": [], "check": [], "error": []} + unique_uuids = set() + INGESTION_RESPONSE_PATTERN = re.compile(r"^([A-Z]+): ([0-9a-f-]+)$") + for item in response: + # Assume each item in the response looks something like one of (string or bytes): + # POST: 15425d13-01ce-4e61-be5d-cd04401dff29 + # PATCH: 5b45e66f-7b4f-4923-824b-d0864a689bb + # SKIP: 4efe24b5-eb17-4406-adb8-060ea2ae2180 + # CHECK: deadbeef-eb17-4406-adb8-0eacafebabe + # ERROR: 906c4667-483e-4a08-96b9-3ce85ce8bf8c + # Note that SKIP means skip post/insert; still may to patch/update (if overwrite). + if isinstance(item, bytes): + item = item.decode("ascii") + elif not isinstance(item, str): + logger.warning(f"load_data_via_ingester: skipping response item of unexpected type ({type(item)}): {item!r}") + continue + match = INGESTION_RESPONSE_PATTERN.match(item) + if not match: + logger.warning(f"load_data_via_ingester: skipping response item in unexpected form: {item!r}") + continue + action = match.group(1).lower() + uuid = match.group(2) + if not results.get(action): + results[action] = [] + results[action].append(uuid) + unique_uuids.add(uuid) + results["unique"] = len(unique_uuids) + return results diff --git a/snovault/local_roles.py b/snovault/local_roles.py new file mode 100644 index 000000000..d117a6859 --- /dev/null +++ b/snovault/local_roles.py @@ -0,0 +1,131 @@ +from dcicutils.misc_utils import environ_bool, PRINT +from pyramid.authorization import ACLAuthorizationPolicy +from pyramid.compat import is_nonstr_iter +from pyramid.interfaces import IAuthorizationPolicy +from pyramid.location import lineage +from zope.interface import implementer + + +DEBUG_PERMISSIONS = environ_bool("DEBUG_PERMISSIONS", default=False) + + +# This code (right now) is identical to that of https://github.com/lrowe/pyramid_localroles +# We are going to need to rip this out for CGAP. So first let's bring it in so we can debug and +# modify it while we implement a new permissions structure. + +def local_principals(context, principals): + """ The idea behind this is to process __ac_local_roles__ (and a boolean __ac_local_roles_block__ + to disable) and add local principals. This only works if you're in correct context, though, + which does not seem to be the case. 
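A minimal sketch of what `local_principals` does with an `__ac_local_roles__` mapping (the context class and principal ids are hypothetical, and DEBUG_PERMISSIONS is assumed to be off):

```python
from snovault.local_roles import local_principals

class ExampleItem:
    __parent__ = None   # lineage() walks __parent__ links upward
    __ac_local_roles__ = {'userid.1234-abcd': ['role.project_editor']}

expanded = local_principals(ExampleItem(), ['userid.1234-abcd', 'group.submitter'])
# expanded is a set containing both original principals plus 'role.project_editor'
```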
+ """ + local_principals = set() + + block = False + for location in lineage(context): + if block: + break + block = getattr(location, '__ac_local_roles_block__', False) + local_roles = getattr(location, '__ac_local_roles__', None) + + if local_roles and callable(local_roles): + local_roles = local_roles() + + if not local_roles: + continue + + for principal in principals: + try: + roles = local_roles[principal] + except KeyError: + pass + else: + if not is_nonstr_iter(roles): + roles = [roles] + local_principals.update(roles) + + if not local_principals: + return principals + + local_principals.update(principals) + + if DEBUG_PERMISSIONS: + PRINT("local_principals") + PRINT(" context.collection=", context.collection) + PRINT(" context.__acl__()=", context.__acl__()) + PRINT(" context.collection.__ac_local_roles_()=", context.__ac_local_roles__()) + PRINT("local_principals returning", local_principals) + + return local_principals + + +def merged_local_principals(context, principals): + # XXX Possibly limit to prefix like 'role.' + set_principals = frozenset(principals) + local_principals = set() + block = False + for location in lineage(context): + if block: + break + + block = getattr(location, '__ac_local_roles_block__', False) + local_roles = getattr(location, '__ac_local_roles__', None) + + if local_roles and callable(local_roles): + local_roles = local_roles() + + if not local_roles: + continue + + for principal, roles in local_roles.items(): + if not is_nonstr_iter(roles): + roles = [roles] + if not set_principals.isdisjoint(roles): + local_principals.add(principal) + + if not local_principals: + return principals + + local_principals.update(principals) + + local_principals = list(local_principals) + + if DEBUG_PERMISSIONS: + PRINT("merged_local_principals") + PRINT(" context.collection=", context.collection) + PRINT(" context.__acl__()=", context.__acl__()) + PRINT(" context.collection.__ac_local_roles_()=", context.__ac_local_roles__()) + PRINT("merged_local_principals returning", local_principals) + + return local_principals + + +@implementer(IAuthorizationPolicy) +class LocalRolesAuthorizationPolicy(object): + """Local roles authorization policy + Objects may be given an ``__ac_local_roles__`` property which may be either a + mapping or a callable that returns a mapping from principal id to a list of principals. 
+ """ + def __init__(self, wrapped_policy=None): + if wrapped_policy is None: + wrapped_policy = ACLAuthorizationPolicy() + self.wrapped_policy = wrapped_policy + + def permits(self, context, principals, permission): + principals = local_principals(context, principals) + result = self.wrapped_policy.permits(context, principals, permission) + if DEBUG_PERMISSIONS: + PRINT("LocalRolesAuthorizationPolicy.permits") + PRINT(" permission=", permission) + PRINT(" principals=", principals) + PRINT("LocalRolesAuthorizationPolicy.permits returning", result) + return result + + def principals_allowed_by_permission(self, context, permission): + principals = self.wrapped_policy.principals_allowed_by_permission(context, permission) + result = merged_local_principals(context, principals) + if DEBUG_PERMISSIONS: + PRINT("LocalRolesAuthorizationPolicy.principals_allowed_by_permission") + PRINT(" permission=", permission) + PRINT(" principals=", principals) + PRINT("LocalRolesAuthorizationPolicy.principals_allowed_by_permission returning", result) + return result diff --git a/snovault/memlimit.py b/snovault/memlimit.py new file mode 100644 index 000000000..10c25f599 --- /dev/null +++ b/snovault/memlimit.py @@ -0,0 +1,74 @@ +import humanfriendly +import logging +import psutil + + +# https://code.google.com/p/modwsgi/wiki/RegisteringCleanupCode + + +class Generator2: + def __init__(self, iterable, callback, environ): + self.__iterable = iterable + self.__callback = callback + self.__environ = environ + + def __iter__(self): + for item in self.__iterable: + yield item + + def close(self): + try: + if hasattr(self.__iterable, 'close'): + self.__iterable.close() + finally: + self.__callback(self.__environ) + + +class ExecuteOnCompletion2: + def __init__(self, application, callback): + self.__application = application + self.__callback = callback + + def __call__(self, environ, start_response): + try: + result = self.__application(environ, start_response) + except BaseException: + self.__callback(environ) + raise + return Generator2(result, self.__callback, environ) + + +def rss_checker(rss_limit=None, rss_percent_limit=None): + """ + Uses a configured rss_limit (absolute amount in bytes) and percentage + rss_limit to determine whether to kill the running process. + If the current rss usage is above rss_limit AND the percentage rss usage + of physical memory is above rss_percent_limit, kill the process + """ + log = logging.getLogger(__name__) + process = psutil.Process() + + def callback(environ): + rss = process.memory_info().rss + over_rss = rss_limit and rss > rss_limit + rss_perc = process.memory_percent(memtype="rss") # XXX: this does not work on Fargate (reports host stats) + if rss_percent_limit: + over_perc = rss_perc > rss_percent_limit + else: + over_perc = True # only consider rss if we have no percent set + if over_rss and over_perc: + log.error(f"Killing process. 
Memory usage: {rss}Mb (limit {rss_limit}); Percentage " + f"{rss_perc} (limit {rss_percent_limit})") + process.kill() + + return callback + + +def filter_app(app, global_conf, rss_limit=None, rss_percent_limit=None): + if rss_limit is not None: + rss_limit = humanfriendly.parse_size(rss_limit) + if rss_percent_limit is not None: + rss_percent_limit = float(rss_percent_limit) + + callback = rss_checker(rss_limit, rss_percent_limit) + return ExecuteOnCompletion2(app, callback) diff --git a/snovault/nginx-dev.conf b/snovault/nginx-dev.conf new file mode 100644 index 000000000..52df908ee --- /dev/null +++ b/snovault/nginx-dev.conf @@ -0,0 +1,34 @@ +# Minimal nginx proxy for development +# brew install nginx +# nginx -p . nginx-dev.conf + +events { + worker_connections 2048; +} +http { + resolver 8.8.8.8; + upstream app { + server 127.0.0.1:6543; + keepalive 10; + } + server { + listen 8000; + location / { + # Normalize duplicate slashes + if ($request ~ ^(GET|HEAD)\s([^?]*)//(.*)\sHTTP/[0-9.]+$) { + return 301 $2/$3; + } + proxy_set_header Host $http_host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_pass http://app; + proxy_set_header Connection ""; + } + location ~ ^/_proxy/(.*)$ { + internal; + proxy_buffering off; + proxy_pass $1$is_args$args; + } + } + client_max_body_size 0; +} diff --git a/snovault/parallel.py b/snovault/parallel.py new file mode 100644 index 000000000..47462eb98 --- /dev/null +++ b/snovault/parallel.py @@ -0,0 +1,33 @@ +import logging +from multiprocessing import cpu_count # pylint: disable=no-name-in-module +from multiprocessing import Pool # pylint: disable=no-name-in-module + + +log = logging.getLogger(__name__) + + +class ParallelTask(object): + def __init__(self, task_func, num_cpu=None, no_parallel=False): + """ + Args: + - task_func (callable): Task to run on each work item. Must be a + global function, or instance of a global class, due to + multiprocessing's limitations. + - no_parallel (bool): If true, run everything in the main thread. + """ + self.task_func = task_func + self.no_parallel = no_parallel + self.num_cpu = num_cpu or cpu_count() - 1 + + def run(self, items, chunk_size=1): + """Run task in parallel on a list of work items. + + Uses multiprocessing in order to avoid Python's GIL. + """ + if not self.no_parallel: + with Pool(self.num_cpu) as pool: + for res in pool.imap(self.task_func, items, chunk_size): + yield res + else: + for res in map(self.task_func, items): + yield res diff --git a/snovault/project/access_key.py b/snovault/project/access_key.py new file mode 100644 index 000000000..957d68853 --- /dev/null +++ b/snovault/project/access_key.py @@ -0,0 +1,3 @@ +class SnovaultProjectAccessKey: + def access_key_has_expiration_date(self): + return True diff --git a/snovault/project/authentication.py b/snovault/project/authentication.py new file mode 100644 index 000000000..636b1101e --- /dev/null +++ b/snovault/project/authentication.py @@ -0,0 +1,33 @@ +# Authentication related functions which may be overriden by an implementing app, +# e.g. Foursight or CGAP portal, using the dcicutils project_utils mechanism. 
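+#
+# Sketch (hypothetical names) of how a downstream portal might override one of
+# these hooks via the dcicutils project_utils registry, mirroring the
+# SnovaultProject registration in project_defs.py:
+#
+#   @C4ProjectRegistry.register("encoded")
+#   class FourfrontProject(SnovaultProjectAuthentication, ..., C4Project):
+#       def login(self, context, request, *, samesite):
+#           ...portal-specific login handling...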
+ +from ..authentication import login, logout + +class SnovaultProjectAuthentication: + + def login(self, context, request, *, samesite): + return login(context, request, samesite=samesite) + + def logout(self, context, request): + return logout(context, request) + + def namespaced_authentication_policy_authenticated_userid(self, namespaced_authentication_policy, request, set_user_info_property): + return namespaced_authentication_policy._authenticated_userid_implementation(request, set_user_info_property) + + def namespaced_authentication_policy_unauthenticated_userid(self, namespaced_authentication_policy, request): + return namespaced_authentication_policy._unauthenticated_userid_implementation(request) + + def note_auth0_authentication_policy_unauthenticated_userid(self, auth0_authentication_policy, request, email, id_token): + pass + + # TODO: Maybe something like ... + # def __init__(self): + # self.login_policy = + + +# TODO: Maybe something like ... +# def SnovaultNamespacedAuthenticationPolicy: +# def __init__(self, app_project): +# self.app_project = app_project +# def authenticated_userid(self, namespaced_authentication_policy, request, set_user_info_property): +# return namespaced_authentication_policy._authenticated_userid_implementation(request, set_user_info_property) diff --git a/snovault/project/authorization.py b/snovault/project/authorization.py new file mode 100644 index 000000000..1bba8b42f --- /dev/null +++ b/snovault/project/authorization.py @@ -0,0 +1,9 @@ +# Authorization related functions which may be overriden by an implementing app, +# e.g. Foursight or CGAP portal, using the dcicutils project_utils mechanism. + +from ..authorization import _create_principals + +class SnovaultProjectAuthorization: + + def authorization_create_principals(self, login, user, collections): + return _create_principals(login, user, collections) diff --git a/snovault/project/ingestion.py b/snovault/project/ingestion.py new file mode 100644 index 000000000..c1f932634 --- /dev/null +++ b/snovault/project/ingestion.py @@ -0,0 +1,6 @@ +# Ingestion related functions which may be overriden by an implementing app, +# e.g. Foursight or CGAP portal, using the dcicutils project_utils mechanism. + +class SnovaultProjectIngestion: + def note_ingestion_enqueue_uuids_for_request(self, ingestion_type, request, uuids): + pass diff --git a/snovault/project/loadxl.py b/snovault/project/loadxl.py new file mode 100644 index 000000000..87d173c35 --- /dev/null +++ b/snovault/project/loadxl.py @@ -0,0 +1,6 @@ +# Loadxl related functions which may be overriden by an implementing app, +# e.g. Foursight or CGAP portal, using the dcicutils project_utils mechanism. 
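+#
+# Sketch (hypothetical item types) of a downstream override:
+#
+#   class PortalProjectLoadxl:
+#       def loadxl_order(self):
+#           return ['user', 'lab', 'award', 'file_format']
+#
+# The default below returns an empty list, i.e. no app-specific load order.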
+ +class SnovaultProjectLoadxl: + def loadxl_order(self): + return [] diff --git a/snovault/project_app.py b/snovault/project_app.py new file mode 100644 index 000000000..4c7006aa9 --- /dev/null +++ b/snovault/project_app.py @@ -0,0 +1,3 @@ +from dcicutils.project_utils import C4ProjectRegistry + +app_project = C4ProjectRegistry.app_project_maker() diff --git a/snovault/project_defs.py b/snovault/project_defs.py new file mode 100644 index 000000000..35109d132 --- /dev/null +++ b/snovault/project_defs.py @@ -0,0 +1,17 @@ +from dcicutils.project_utils import C4ProjectRegistry, C4Project +from .project.access_key import SnovaultProjectAccessKey +from .project.authentication import SnovaultProjectAuthentication +from .project.authorization import SnovaultProjectAuthorization +from .project.ingestion import SnovaultProjectIngestion +from .project.loadxl import SnovaultProjectLoadxl + + +@C4ProjectRegistry.register("dcicsnovault") +class SnovaultProject(SnovaultProjectAccessKey, + SnovaultProjectAuthentication, + SnovaultProjectAuthorization, + SnovaultProjectIngestion, + SnovaultProjectLoadxl, + C4Project): + NAMES = {"NAME": "snovault", "PYPI_NAME": "dcicsnovault"} + ACCESSION_PREFIX = "SNO" diff --git a/snovault/renderers.py b/snovault/renderers.py index 103d0675a..15ae6bd53 100644 --- a/snovault/renderers.py +++ b/snovault/renderers.py @@ -1,124 +1,239 @@ +import json import logging +import os -from dcicutils.misc_utils import ignorable, ignored +import psutil +import time + +from dcicutils.misc_utils import environ_bool, PRINT, ignored +from functools import lru_cache +from pkg_resources import resource_filename from pyramid.events import BeforeRender, subscriber from pyramid.httpexceptions import ( - HTTPMovedPermanently, HTTPPreconditionFailed, HTTPUnauthorized, HTTPUnsupportedMediaType, + HTTPMovedPermanently, + HTTPPreconditionFailed, + HTTPUnauthorized, + HTTPUnsupportedMediaType, + HTTPNotAcceptable, + HTTPServerError ) -from pyramid.security import forget +from pyramid.response import Response from pyramid.settings import asbool from pyramid.threadlocal import manager from pyramid.traversal import split_path_info, _join_path_tuple +from subprocess_middleware.worker import TransformWorker +from urllib.parse import urlencode from webob.cookies import Cookie - -from .validation import CSRFTokenError +from .project_app import app_project +from .util import content_type_allowed log = logging.getLogger(__name__) def includeme(config): - config.add_tween('.renderers.fix_request_method_tween_factory', - under='snovault.stats.stats_tween_factory') - config.add_tween('.renderers.normalize_cookie_tween_factory', - under='.renderers.fix_request_method_tween_factory') - config.add_tween('.renderers.set_x_request_url_tween_factory', - under='.renderers.normalize_cookie_tween_factory') - config.add_tween('.renderers.security_tween_factory', - under='pyramid_tm.tm_tween_factory') - config.scan(__name__) + """ + Can get tween ordering by executing the following on command-line from root dir: + `bin/ptween development.ini` + + We could alternatively put these in the base.ini file explicitly. + + See: https://docs.pylonsproject.org/projects/pyramid/en/latest/narr/hooks.html#registering-tweens + + --- IMPORTANT --- + The `handler` arg of 'tween factory' functions refers to the subsequent tween to be called. + This means that if handler(request) is called, then the downstream tweens are acted upon it, + until response is returned. 
It's an ONION depending on where handler(request) called within a tween + and NOT necessarily an explicit ordering (unless `return handler(request)` is last line of each tween). + + A request goes down the tween chain from INGRESS to MAIN and then back up to INGRESS. + `handler(request)` calls the subsequent tween and returns complete tweened-from-there response. + + Tween Chain as of 05/23/2019: + + Position Name + -------- ---- + - INGRESS + 0 snovault.stats.stats_tween_factory + 1 .renderers.validate_request_tween_factory + 2 .renderers.render_page_html_tween_factory + 3 .renderers.set_response_headers_tween_factory + 4 pyramid_tm.tm_tween_factory + 5 .renderers.security_tween_factory + 6 pyramid.tweens.excview_tween_factory + - MAIN + + The `handler` kwarg of tween factories refers to the subsequent tween to be called. + This means that if handler(request) is called, then the downstream tweens are acted upon it, + until response is returned. It's an ONION! + + """ + + config.add_tween('.renderers.validate_request_tween_factory', under='snovault.stats.stats_tween_factory') + # DISABLED - .add_tween('.renderers.remove_expired_session_cookies_tween_factory', + # under='.renderers.validate_request_tween_factory') + config.add_tween('.renderers.render_page_html_tween_factory', under='.renderers.validate_request_tween_factory') + # The above tweens, when using response (= `handler(request)`) act on the _transformed_ response + # (containing HTML body). + # The below tweens run _before_ the JS rendering. Responses in these tweens have not been transformed to HTML yet. + config.add_tween('.renderers.set_response_headers_tween_factory', under='.renderers.render_page_html_tween_factory') -def fix_request_method_tween_factory(handler, registry): - """ Fix Request method changed by mod_wsgi. + # If this isn't placed under 'pyramid_tm.tm_tween_factory' (e.g. under resp headers or something) + # then the headers aren't preserved or available in server-side render or response. + config.add_tween('.renderers.security_tween_factory', under='pyramid_tm.tm_tween_factory') - See: https://github.com/GrahamDumpleton/mod_wsgi/issues/2 + config.scan(__name__) + + +def validate_request_tween_factory(handler, registry): + """ + Updates request.environ's REQUEST_METHOD to be X_REQUEST_METHOD if present. + Asserts that if a POST (or similar) request is in application/json format, + with exception for /metadata/* endpoints. Apache config: SetEnvIf Request_Method HEAD X_REQUEST_METHOD=HEAD """ ignored(registry) - def fix_request_method_tween(request): + def validate_request_tween(request): + + # Fix Request method changed by mod_wsgi. + # See: https://github.com/GrahamDumpleton/mod_wsgi/issues/2 environ = request.environ if 'X_REQUEST_METHOD' in environ: environ['REQUEST_METHOD'] = environ['X_REQUEST_METHOD'] - return handler(request) - return fix_request_method_tween + if request.method in ('GET', 'HEAD'): + # If GET request, don't need to check `request.content_type` + # Includes page text/html requests. + return handler(request) + + elif content_type_allowed(request): + return handler(request) + + else: + detail = "Request content type %s is not 'application/json'" % request.content_type + raise HTTPUnsupportedMediaType(detail) + + return validate_request_tween def security_tween_factory(handler, registry): ignored(registry) def security_tween(request): - login = None - ignorable(login) # The value of this assignment will long be used if there's an error in the 'if'-logic below. 
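+        # Overview of the checks below: (1) X-If-Match-User precondition,
+        # (2) 401 challenge when credentials are supplied but do not resolve to
+        # an authenticated userid, (3) expired-JWT handling (clear the jwtToken
+        # cookie, return 401) and, for valid tokens on HTML renders, setting the
+        # X-Request-JWT / X-User-Info response headers.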
+ """ + Executed inside/prior-to any page transforms and inside/prior-to + `pyramid_tm.tm_tween_factory` (transaction management tween). + This is because request.authenticated_userid as well as `request.user_info` + property getters _may_ access Postgres DB to get user properties (if not yet + indexed in ES) and all DB transactions must complete before transaction + management tween is completed. + """ + expected_user = request.headers.get('X-If-Match-User') - if expected_user is not None: - login = request.authenticated_userid - if login != 'mailto.' + expected_user: + if expected_user is not None: # Not sure when this is the case + if request.authenticated_userid != 'mailto.' + expected_user: detail = 'X-If-Match-User does not match' raise HTTPPreconditionFailed(detail) + if request.authorization is not None or asbool(request.headers.get('X-Auth-Challenge', False)): + # wget may only send credentials following a challenge response. + if not request.authenticated_userid: + if not hasattr(request, 'auth0_expired'): + # Not a "Bearer" JWT token in Auth header. Or other error. + # We send a 401 "Unauthorized" exception if authentication issue or expiration. + # We send a 403 "Forbidden" (TODO: assert) if authorized correctly but no view permission + raise HTTPUnauthorized( + title="No Access", + comment="Invalid Authorization header or Auth Challenge response.", + headers={ + 'WWW-Authenticate': ( + f'Bearer realm="{request.domain}";' + f' Basic realm="{request.domain}"' + ) + } + ) + + if hasattr(request, 'auth0_expired'): + # Add some security-related headers on the up-swing + response = handler(request) + if request.auth0_expired: + # return response + # + # If have the attribute and it is true, then our session has expired. + # This is true for both AJAX requests (which have request.authorization) & browser page + # requests (which have cookie); both cases handled in authentication.py + # Informs client or libs/react-middleware.js serverside render of expired token + # to set logged-out state in front-end in either doc request or xhr request & set appropriate alerts + response.headers['X-Request-JWT'] = "expired" + + # Especially for initial document requests by browser, but also desired for AJAX and other requests, + # unset jwtToken cookie so initial client-side React render has App(instance).state.session = false + # to be synced w/ server-side + response.set_cookie( + name='jwtToken', + value=None, # = i.e., same as response.delete_cookie(..) + domain=request.domain, + max_age=0, + path='/', + overwrite=True + ) + response.status_code = 401 + response.headers['WWW-Authenticate'] = ( + f'Bearer realm="{request.domain}", title="Session Expired";' + f' Basic realm="{request.domain}"' + ) + else: + # We have JWT and it's not expired. Add 'X-Request-JWT' & 'X-User-Info' header. + # For performance, only do it if should transform to HTML as is not needed on every request. + if should_transform(request, response): + login = request.authenticated_userid + if login: + authtype, email = login.split('.', 1) + if authtype == 'auth0': + # This header is parsed in renderer.js, or, more accurately, + # by libs/react-middleware.js which is imported by server.js and compiled into + # renderer.js. Is used to get access to User Info on initial web page render. 
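+                        # The raw JWT from the jwtToken cookie is echoed back in
+                        # X-Request-JWT, and the user properties go out as JSON
+                        # in X-User-Info.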
+ response.headers['X-Request-JWT'] = request.cookies.get('jwtToken', '') + user_info = request.user_info.copy() # Re-ified property set in authentication.py + response.headers['X-User-Info'] = json.dumps(user_info) + else: + response.headers['X-Request-JWT'] = "null" + return response - # wget may only send credentials following a challenge response. - auth_challenge = asbool(request.headers.get('X-Auth-Challenge', False)) - if auth_challenge or request.authorization is not None: - login = request.authenticated_userid - if login is None: - raise HTTPUnauthorized(headerlist=forget(request)) - - if request.method in ('GET', 'HEAD'): - return handler(request) - - if request.content_type != 'application/json': - detail = "%s is not 'application/json'" % request.content_type - raise HTTPUnsupportedMediaType(detail) - - token = request.headers.get('X-CSRF-Token') - if token is not None: - # Avoid dirtying the session and adding a Set-Cookie header - # XXX Should consider if this is a good idea or not and timeouts - if token == dict.get(request.session, '_csrft_', None): - return handler(request) - raise CSRFTokenError('Incorrect CSRF token') - - # NOTE: cutting out CSRF protection here ... why protect against CSRF if you provide an - # unathenticated endpoint that will delivery the CSRF token? I'm looking at you /session. - # this should be revisted, either embed the csrf token in the index.html as part of the - # rendering subprocess somehow, or return it from the login view and let the client store it - # but of course that sounds a lot like JWT... return handler(request) - # TODO: Code beyond here is unreachable? Is that right? Should we keep the code? -kmp 7-Aug-2022 - - if login is None: - login = request.authenticated_userid - if login is not None: - namespace, userid = login.split('.', 1) - if namespace not in ('mailto', 'persona'): - return handler(request) - raise CSRFTokenError('Missing CSRF token') - return security_tween -def normalize_cookie_tween_factory(handler, registry): +def remove_expired_session_cookies_tween_factory(handler, registry): + """ + CURRENTLY DISABLED + Original purpose of this was to remove expired (session?) cookies. + See: https://github.com/ENCODE-DCC/encoded/commit/75854803c99e5044a6a33aedb3a79d750481b6cd#diff-bc19a9793a1b3b4870cff50e7c7c9bd1R135 + + We disable it for now via removing from tween chain as are using JWT tokens and handling + their removal in security_tween_factory & authentication.py as well as client-side + (upon "Logout" action). If needed for some reason, can re-enable. 
+ """ # noQA - not going to break the long URL line above ignored(registry) ignore = { '/favicon.ico', } - def normalize_cookie_tween(request): + def remove_expired_session_cookies_tween(request): if request.path in ignore or request.path.startswith('/static/'): return handler(request) session = request.session - if session or session._cookie_name not in request.cookies: - return handler(request) + # if session or session._cookie_name not in request.cookies: + # return handler(request) response = handler(request) + # Below seems to be empty always; though we do have some in request.cookies existing = response.headers.getall('Set-Cookie') if existing: cookies = Cookie() @@ -135,18 +250,22 @@ def normalize_cookie_tween(request): return response - return normalize_cookie_tween + return remove_expired_session_cookies_tween -def set_x_request_url_tween_factory(handler, registry): +def set_response_headers_tween_factory(handler, registry): + """Add additional response headers here""" ignored(registry) - def set_x_request_url_tween(request): + def set_response_headers_tween(request): response = handler(request) response.headers['X-Request-URL'] = request.url + # Setter automatically converts set back to tuple. + # See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Vary + response.vary = set((response.vary or ()) + ('Accept', 'Authorization')) return response - return set_x_request_url_tween + return set_response_headers_tween @subscriber(BeforeRender) @@ -161,7 +280,7 @@ def canonical_redirect(event): return if request.response.status_int != 200: return - if not request.environ.get('snowflakes.canonical_redirect', True): + if not request.environ.get('encoded.canonical_redirect', True): return if request.path_info == '/': return @@ -184,5 +303,196 @@ def canonical_redirect(event): return qs = canonical_qs or request.query_string - location = canonical_path + ('?' if qs else '') + qs - raise HTTPMovedPermanently(location=location) + # add redirect information to the query string, but not for the routes specified below + if not any(route in canonical_path for route in ['/search/', '/browse/', '/metadata/']): + redir_qs = (qs + '&' if qs else '') + urlencode([('redirected_from', request.path_info)]) + else: + redir_qs = qs + location = canonical_path + ('?' if redir_qs else '') + redir_qs + raise HTTPMovedPermanently(location=location, detail="Redirected from " + str(request.path_info)) + + +# Web browsers send an Accept request header for initial (e.g. non-AJAX) page requests +# which should contain 'text/html' +MIME_TYPE_HTML = 'text/html' +MIME_TYPE_JSON = 'application/json' +MIME_TYPE_LD_JSON = 'application/ld+json' + +# Note: In cgap-portal, MIME_TYPE_JSON is at the head of this list. In fourfront, MIME_TYPE_HTML is. +# The cgap-portal behavior might be a bug we should look at bringing into alignment. -kmp 29-Jan-2022 +MIME_TYPES_SUPPORTED = [MIME_TYPE_JSON, MIME_TYPE_HTML, MIME_TYPE_LD_JSON] +MIME_TYPE_DEFAULT = MIME_TYPES_SUPPORTED[0] +MIME_TYPE_TRIAGE_MODE = 'modern' # if this doesn't work, fall back to 'legacy' + +DEBUG_MIME_TYPES = environ_bool("DEBUG_MIME_TYPES", default=False) + + +def best_mime_type(request, mode=MIME_TYPE_TRIAGE_MODE): + # TODO: I think this function does nothing but return MIME_TYPES_SUPPORTED[0] -kmp 3-Feb-2021 + """ + Given a request, tries to figure out the best kind of MIME type to use in response + based on what kinds of responses we support and what was requested. 
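+
+    For example (illustrative): a typical browser page request sending
+    ``Accept: text/html,application/xhtml+xml,*/*;q=0.8`` negotiates to
+    'text/html', while an XHR sending ``Accept: application/json`` gets
+    'application/json'.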
+ + In the case we can't comply, we just use application/json whether or not that's what was asked for. + """ + if mode == 'legacy': + # See: + # https://tedboy.github.io/flask/generated/generated/werkzeug.Accept.best_match.html#werkzeug-accept-best-match + # Note that this is now deprecated, or will be. The message is oddly worded ("will be deprecated") + # that presumably means "will be removed". Deprecation IS the warning of actual action, not the action itself. + # "This is currently maintained for backward compatibility, and will be deprecated in the future. + # AcceptValidHeader.best_match() uses its own algorithm (one not specified in RFC 7231) to determine + # what is a best match. The algorithm has many issues, and does not conform to RFC 7231." + # Anyway, we were getting this warning during testing: + # DeprecationWarning: The behavior of AcceptValidHeader.best_match is currently + # being maintained for backward compatibility, but it will be deprecated in the future, + # as it does not conform to the RFC. + # TODO: Once the modern replacement is shown to work, we should remove this conditional branch. + result = request.accept.best_match(MIME_TYPES_SUPPORTED, MIME_TYPE_DEFAULT) + else: + options = request.accept.acceptable_offers(MIME_TYPES_SUPPORTED) + if not options: + # TODO: Probably we should return a 406 response by raising HTTPNotAcceptable if + # no acceptable types are available. (Certainly returning JSON in this case is + # not some kind of friendly help toa naive user with an old browser.) + # Ref: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status + result = MIME_TYPE_DEFAULT + else: + mime_type, score = options[0] + result = mime_type + if DEBUG_MIME_TYPES: + PRINT("Using mime type", result, "for", request.method, request.url) + for k, v in request.headers.items(): + PRINT("%s: %s" % (k, v)) + PRINT("----------") + return result + + +@lru_cache(maxsize=16) +def should_transform(request, response): + """ + Determines whether to transform the response from JSON->HTML/JS depending on type of response + and what the request is looking for to be returned via these criteria, which are tried in order + until one succeeds: + + * If the request method is other than GET or HEAD, returns False. + * If the response.content_type is other than 'application/json', returns False. + * If a 'frame=' query param is given and not 'page' (the default), returns False. + * If a 'format=json' query param is given explicitly, + * For 'format=html', returns True. + * For 'format=json', returns False. + This rule does not match if 'format=' is not given explicitly. + If 'format=' is given an explicit value of ther than 'html' or 'json', an HTTPNotAcceptable error will be raised. + * If the first element of MIME_TYPES_SUPPORTED[0] is 'text/html', returns True. + * Otherwise, in all remaining cases, returns False. + + NOTE: Memoized via `lru_cache`. Cache size is set to be 16 (> 1) in case sub-requests fired off during handling. + """ + # We always return JSON in response to POST, PATCH, etc. + if request.method not in ('GET', 'HEAD'): + return False + + # Only JSON response/content can be plugged into HTML/JS template responses. + if response.content_type != 'application/json': + return False + + # If we have a 'frame' that is not None or page, force JSON, since our UI doesn't handle all various + # forms of the data, just embedded/page. 
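+    # e.g. an explicit ?frame=object request is always answered with JSON;
+    # only the default 'page' frame can be transformed to HTML.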
+ request_frame = request.params.get("frame", "page") + if request_frame != "page": + return False + + # The `format` URI param allows us to override request's 'Accept' header. + format_param = request.params.get('format') + if format_param is not None: + format_param = format_param.lower() + if format_param == 'json': + return False + if format_param == 'html': + return True + else: + raise HTTPNotAcceptable("Improper format URI parameter", + comment="The format URI parameter should be set to either html or json.") + + # Web browsers send an Accept request header for initial (e.g. non-AJAX) page requests + # which should contain 'text/html' + # See: https://tedboy.github.io/flask/generated/generated/werkzeug.Accept.best_match.html#werkzeug-accept-best-match + mime_type = best_mime_type(request) # Result will be one of MIME_TYPES_SUPPORTED + + # N.B. ld+json (JSON-LD) is likely more unique case and might be sent by search engines (?) + # which can parse JSON-LDs. At some point we could maybe have it to be same as + # making an `@@object` or `?frame=object` request (?) esp if fill + # out @context response w/ schema(s) (or link to schema) + + return mime_type == MIME_TYPE_HTML + + +def render_page_html_tween_factory(handler, registry): + + class TransformErrorResponse(HTTPServerError): + """Extends 500 server error""" + explanation = 'Transformation of JSON response to HTML webpage failed.' + + node_env = os.environ.copy() + node_env['NODE_PATH'] = '' + + # Rendering huge pages can make the node process memory usage explode. + # Ideally we would let the OS handle this with `ulimit` or by calling + # `resource.setrlimit()` from a `subprocess.Popen(preexec_fn=...)`. + # Unfortunately Linux does not enforce RLIMIT_RSS. + # An alternative would be to use cgroups, but that makes per-process limits + # tricky to enforce (we would need to create one cgroup per process.) + # So we just manually check the resource usage after each transform. + + # This has been adjusted upward to account for larger memory availability with nginx/Fargate - Will Jan 13 2023 + kilo_bytes = 1024 + mega_bytes = 1024 * kilo_bytes + rss_limit = 512 * mega_bytes + + reload_process = (True + if registry.settings.get('reload_templates', False) + else lambda proc: psutil.Process(proc.pid).memory_info().rss > rss_limit) + + # TransformWorker inits and manages a subprocess + # it re-uses the subprocess so interestingly data in JS global variables + # might persist in between renders (from different users, even). + transform = TransformWorker( + Response=Response, + reload_process=reload_process, + # Other kwargs, including env below, get passed down to subprocess.Popen + # First argument to resource_filename to be 'snovault' or 'encoded' (for fourfront, cgap-port, smaht-portal). + args=['node', resource_filename(app_project().PACKAGE_NAME, 'static/build/renderer.js')], + env=node_env + ) + + def render_page_html_tween(request): + # Result of downstream tweens. Body not yet transformed into HTML. + response = handler(request) + + if not should_transform(request, response): + # Continue back up the tween chain with JSON response body. + return response + + # The stats below are converted into "X-Stats" header in snovault. + # Maybe we could conditionally disable this at some point in .ini config + # for minute performance enhancement(s). 
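+        # The try/except below times the JSON->HTML transform (in microseconds)
+        # and accumulates it into render_time / render_count for that header.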
+ + transform_start_time = time.time() + + try: + response = transform(response) + except ValueError as e: + response = TransformErrorResponse(e.args[0]) + else: + transform_end_time = time.time() + transform_duration = int((transform_end_time - transform_start_time) * 1e6) + stats = request._stats + stats['render_count'] = stats.get('render_count', 0) + 1 + stats['render_time'] = stats.get('render_time', 0) + transform_duration + # We don't care about getting this back in form of a cookie. Will be available + # as header. + request._add_stats_cookie = False + + return response + + return render_page_html_tween diff --git a/snovault/resources.py b/snovault/resources.py index 252517dd3..ec10da13c 100644 --- a/snovault/resources.py +++ b/snovault/resources.py @@ -6,7 +6,9 @@ from pyramid.decorator import reify from pyramid.httpexceptions import HTTPInternalServerError from pyramid.security import ( - # Allow, + Allow, + Deny, + ALL_PERMISSIONS, Everyone, Authenticated, principals_allowed_by_permission @@ -15,6 +17,8 @@ resource_path, traverse ) +from .server_defaults_user import get_userid +from .types.acl import ONLY_ADMIN_VIEW_ACL from .calculated import ( calculate_properties, calculated_property, @@ -39,9 +43,35 @@ def includeme(config): config.include(auth0_config) + config.include(recaptcha_config) config.scan(__name__) +def acl_from_settings(settings): + # XXX Unsure if any of the demo instance still need this + acl = [] + for k, v in settings.items(): + if k.startswith('allow.'): + action = Allow + permission = k[len('allow.'):] + principals = v.split() + elif k.startswith('deny.'): + action = Deny + permission = k[len('deny.'):] + principals = v.split() + else: + continue + if permission == 'ALL_PERMISSIONS': + permission = ALL_PERMISSIONS + for principal in principals: + if principal == 'Authenticated': + principal = Authenticated + elif principal == 'Everyone': + principal = Everyone + acl.append((action, principal, permission)) + return acl + + class Resource(object): """ Just used to add global calculated properties @@ -72,6 +102,19 @@ class Root(Resource): def __init__(self, registry): self.registry = registry + @reify + def __acl__(self): + acl = acl_from_settings(self.registry.settings) + [ + (Allow, Everyone, ['list', 'search']), + (Allow, 'group.admin', ALL_PERMISSIONS), + (Allow, 'remoteuser.EMBED', 'restricted_fields'), + ] + [ + (Allow, 'remoteuser.INDEXER', ['view', 'view_raw', 'list', 'index']), + (Allow, 'remoteuser.EMBED', ['view', 'view_raw', 'expand']), + (Allow, Everyone, ['visible_for_edit']) + ] + return acl + @reify def connection(self): return self.registry[CONNECTION] @@ -155,6 +198,31 @@ def auth0_config_view(request): config.add_view(auth0_config_view, route_name='auth0-config') +def recaptcha_config(config): + """ Route that exposes the recaptcha site key """ + config.add_route( + 'recaptcha-config', + '/recaptcha_config' + ) + recaptcha_config_values = { # determines which values are echoed + 'g.recaptcha.key': 'RecaptchaKey', + } + + def recaptcha_config_view(request): + response = request.response + response.content_type = 'application/json; charset=utf-8' + response_dict = { + 'title': 'Recaptcha Config', + } + settings = config.registry.settings + for config_key, result_key in recaptcha_config_values.items(): + if config_key in settings: + response_dict[result_key] = settings[config_key] + return response_dict + + config.add_view(recaptcha_config_view, route_name='recaptcha-config') + + class AbstractCollection(Resource, Mapping): """ Collection for a 
certain type of resource that stores the following info: @@ -292,7 +360,6 @@ def index_settings(): "type": "string", } - class Item(Resource): """ Base Item resource that corresponds to a Collection or AbstractCollection @@ -308,11 +375,22 @@ class Item(Resource): schema = None AbstractCollection = AbstractCollection Collection = Collection + STATUS_ACL = {} # note that this should ALWAYS be overridden by downstream application def __init__(self, registry, model): self.registry = registry self.model = model + def __acl__(self): + """This sets the ACL for the item based on mapping of status to ACL. + If there is no status or the status is not included in the STATUS_ACL + lookup then the access is set to admin only + """ + # Don't finalize to avoid validation here. + properties = self.upgrade_properties().copy() + status = properties.get('status') + return self.STATUS_ACL.get(status, ONLY_ADMIN_VIEW_ACL) + def __repr__(self): return '<%s at %s>' % (type(self).__name__, resource_path(self)) @@ -432,12 +510,40 @@ def get_filtered_rev_links(self, request, name): request._rev_linked_uuids_by_item[str(self.uuid)] = to_update return filtered_uuids + def rev_link_atids(self, request, rev_name): + """ + Returns the list of reverse linked items given a defined reverse link, + which should be formatted like: + rev = { + '': ('', ''), + } + + """ + conn = request.registry[CONNECTION] + return [request.resource_path(conn[uuid]) for uuid in + self.get_filtered_rev_links(request, rev_name)] + def unique_keys(self, properties): return { name: [v for prop in props for v in ensurelist(properties.get(prop, ()))] for name, props in self.type_info.schema_keys.items() } + def add_accession_to_title(self, title): + if self.properties.get('accession') is not None: + return title + ' - ' + self.properties.get('accession') + return title + + def is_update_by_admin_user(self): + # determine if the submitter in the properties is an admin user + userid = get_userid() + users = self.registry['collections']['User'] + user = users.get(userid) + if 'groups' in user.properties: + if 'admin' in user.properties['groups']: + return True + return False + def upgrade_properties(self): """ Calls the upgrader on the Item if properties.schema_version is not current @@ -631,5 +737,29 @@ def principals_allowed(self): return allowed @calculated_property(schema=display_title_schema) - def display_title(self): - return str(self.uuid) + def display_title(self, request=None): + """create a display_title field.""" + display_title = "" + look_for = [ + "title", + "name", + "location_description", + "accession", + ] + properties = self.upgrade_properties() + for field in look_for: + # special case for user: concatenate first and last names + display_title = properties.get(field, None) + if display_title: + if field != 'accession': + display_title = self.add_accession_to_title(display_title) + return display_title + # if none of the existing terms are available, use @type + date_created + try: + type_date = self.__class__.__name__ + " from " + properties.get("date_created", None)[:10] + return type_date + # last resort, use uuid + except Exception: + return properties.get('uuid', None) +# def display_title(self): +# return str(self.uuid) diff --git a/snovault/root.py b/snovault/root.py new file mode 100644 index 000000000..bcb77c00c --- /dev/null +++ b/snovault/root.py @@ -0,0 +1,287 @@ +import sys +import uptime + +from collections import OrderedDict +from dcicutils import lang_utils +from dcicutils.s3_utils import HealthPageKey +from 
dcicutils.env_utils import infer_foursight_url_from_env +from pyramid.decorator import reify +from pyramid.security import ALL_PERMISSIONS, Allow, Authenticated, Deny, Everyone +from . import Root, calculated_property, root, COLLECTIONS, STORAGE +from .appdefs import APP_VERSION_REGISTRY_KEY, ITEM_INDEX_ORDER +from .schema_formats import is_accession +from .util import SettingsKey + + +def includeme(config): + config.include(health_check) + config.include(item_counts) + config.include(type_metadata) + config.include(submissions_page) + config.scan(__name__) + + +def item_counts(config): + config.add_route( + 'item-counts', + '/counts' + ) + + def counts_view(request): + response = request.response + response.content_type = 'application/json; charset=utf-8' + + db_total = 0 + es_total = 0 + # find db and es counts for each item type + db_es_compare = OrderedDict() + for item_type in request.registry[COLLECTIONS].by_item_type: + # use the write (DB) storage with only the specific item_type + # need to count items with props in ES differently + db_count = request.registry[STORAGE].write.__len__(item_type) + es_count = request.registry[STORAGE].read.__len__(item_type) + db_total += db_count + es_total += es_count + warn_str = build_warn_string(db_count, es_count) + item_name = request.registry[COLLECTIONS][item_type].type_info.name + db_es_compare[item_name] = ("DB: %s ES: %s %s" % (str(db_count), str(es_count), warn_str)) + warn_str = build_warn_string(db_total, es_total) + db_es_total = ("DB: %s ES: %s %s" % + (str(db_total), str(es_total), warn_str)) + response_dict = { + 'title': 'Item Counts', + 'db_es_total': db_es_total, + 'db_es_compare': db_es_compare + } + return response_dict + + config.add_view(counts_view, route_name='item-counts') + + +def type_metadata(config): + """ This needs to be re-written in the upstream repo """ + + config.add_route( + 'type-metadata', + '/type-metadata' + ) + + def type_metadata_view(request): + + return { + 'index_order': ITEM_INDEX_ORDER + } + + config.add_view(type_metadata_view, route_name='type-metadata') + + + +def uptime_info(): + try: + return lang_utils.relative_time_string(uptime.uptime()) + except Exception: + return "unavailable" + + +def health_check(config): + """ + Emulate a lite form of Alex's static page routing + """ + config.add_route( + 'health-check', + '/health' + ) + + def health_page_view(request): + + class ExtendedHealthPageKey(HealthPageKey): + # This class can contain new entries in HealthPageKey that are waiting to move to dcicutils + PYTHON_VERSION = "python_version" + pass + + h = ExtendedHealthPageKey + + s = SettingsKey + + response = request.response + response.content_type = 'application/json; charset=utf-8' + settings = request.registry.settings + + env_name = settings.get('env.name') + foursight_url = infer_foursight_url_from_env(request=request, envname=env_name) + + response_dict = { + + "@type": ["Health", "Portal"], + "@context": "/health", + "@id": "/health", + "content": None, + + h.APPLICATION_BUCKET_PREFIX: settings.get(s.APPLICATION_BUCKET_PREFIX), + h.BEANSTALK_APP_VERSION: settings.get(s.EB_APP_VERSION), + h.BEANSTALK_ENV: env_name, + h.BLOB_BUCKET: settings.get(s.BLOB_BUCKET), + h.DATABASE: settings.get(s.SQLALCHEMY_URL).split('@')[1], # don't show user /password + h.DISPLAY_TITLE: "ENCODED Portal Status and Foursight Monitoring", + h.ELASTICSEARCH: settings.get(s.ELASTICSEARCH_SERVER), + h.FILE_UPLOAD_BUCKET: settings.get(s.FILE_UPLOAD_BUCKET), + h.FOURSIGHT: foursight_url, + h.FOURSIGHT_BUCKET_PREFIX: 
settings.get(s.FOURSIGHT_BUCKET_PREFIX), + h.IDENTITY: settings.get(s.IDENTITY), + h.INDEXER: settings.get(s.INDEXER), + h.INDEX_SERVER: settings.get(s.INDEX_SERVER), + h.LOAD_DATA: settings.get(s.LOAD_TEST_DATA), + h.METADATA_BUNDLES_BUCKET: settings.get(s.METADATA_BUNDLES_BUCKET), + h.NAMESPACE: settings.get(s.INDEXER_NAMESPACE), + h.PROCESSED_FILE_BUCKET: settings.get(s.FILE_WFOUT_BUCKET), + h.PROJECT_VERSION: settings.get(s.ENCODED_VERSION), + h.PYTHON_VERSION: f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}", + h.S3_ENCRYPT_KEY_ID: settings.get(s.S3_ENCRYPT_KEY_ID), + h.SNOVAULT_VERSION: settings.get(s.SNOVAULT_VERSION), + h.SYSTEM_BUCKET: settings.get(s.SYSTEM_BUCKET), + h.TIBANNA_CWLS_BUCKET: settings.get(s.TIBANNA_CWLS_BUCKET), + h.TIBANNA_OUTPUT_BUCKET: settings.get(s.TIBANNA_OUTPUT_BUCKET), + h.UPTIME: uptime_info(), + h.UTILS_VERSION: settings.get(s.UTILS_VERSION), + + } + + return response_dict + + config.add_view(health_page_view, route_name='health-check') + + +def build_warn_string(db_count, es_count): + if db_count > es_count: + warn_str = ' < DB has %s more items >' % (str(db_count - es_count)) + elif db_count < es_count: + warn_str = ' < ES has %s more items >' % (str(es_count - db_count)) + else: + warn_str = '' + return warn_str + + +def submissions_page(config): + """ + Emulate a lite form of Alex's static page routing + """ + config.add_route( + 'submissions-page', + '/submissions' + ) + + def submissions_page_view(request): + response = request.response + response.content_type = 'application/json; charset=utf-8' + + response_dict = { + "title": "Submissions", + "notification": "success", + "@type": ["Submissions", "Portal"], + "@context": "/submissions", + "@id": "/submissions", + "content": None + } + + return response_dict + + config.add_view(submissions_page_view, route_name='submissions-page') + + +def acl_from_settings(settings): + # XXX Unsure if any of the demo instance still need this + acl = [] + for k, v in settings.items(): + if k.startswith('allow.'): + action = Allow + permission = k[len('allow.'):] + principals = v.split() + elif k.startswith('deny.'): + action = Deny + permission = k[len('deny.'):] + principals = v.split() + else: + continue + if permission == 'ALL_PERMISSIONS': + permission = ALL_PERMISSIONS + for principal in principals: + if principal == 'Authenticated': + principal = Authenticated + elif principal == 'Everyone': + principal = Everyone + acl.append((action, principal, permission)) + return acl + + +@root +class SnovaultRoot(Root): + properties = { + 'title': 'Home', + 'portal_title': 'Snovault', + } + + @reify + def __acl__(self): + acl = acl_from_settings(self.registry.settings) + [ + (Allow, Everyone, ['list', 'search']), + (Allow, 'group.admin', ALL_PERMISSIONS), + (Allow, 'remoteuser.EMBED', 'restricted_fields'), + ] + [ + (Allow, 'remoteuser.INDEXER', ['view', 'view_raw', 'list', 'index']), + (Allow, 'remoteuser.EMBED', ['view', 'view_raw', 'expand']), + (Allow, Everyone, ['visible_for_edit']) + ] + return acl + + def get(self, name, default=None): + resource = super().get(name, None) + if resource is not None: + return resource + resource = self.connection.get_by_unique_key('page:location', name) + if resource is not None: + return resource + if is_accession(name): + resource = self.connection.get_by_unique_key('accession', name) + if resource is not None: + return resource + if ':' in name: + resource = self.connection.get_by_unique_key('alias', name) + if resource is not None: + return resource + 
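+        # Fall through: name matched neither a page location, an accession,
+        # nor an alias.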
return default + + def get_by_uuid(self, uuid, default=None): + return self.connection.get_by_uuid(uuid, default) + + def jsonld_context(self): + """Inherits from '@context' calculated property of Resource in snovault/resources.py""" + return '/home' + + def jsonld_type(self): + """Inherits from '@type' calculated property of Root in snovault/resources.py""" + return ['HomePage', 'StaticPage'] + super().jsonld_type() + + @calculated_property(schema={ + "title": "Static Page Content", + "type": "array" + }) + def content(self, request): + """Returns -object- with pre-named sections""" + return [] + # sections_to_get = ['home.introduction'] + # user = request._auth0_authenticated if hasattr(request, '_auth0_authenticated') else True + # return_list = [] + # for section_name in sections_to_get: + # try: # Can be caused by 404 / Not Found during indexing + # res = request.embed('/static-sections', section_name, '@@embedded', as_user=user) + # return_list.append(res) + # except KeyError: + # pass + # return return_list + + @calculated_property(schema={ + "title": "Application version", + "type": "string", + }) + def app_version(self, registry): + return registry.settings[APP_VERSION_REGISTRY_KEY] diff --git a/snovault/schema_formats.py b/snovault/schema_formats.py new file mode 100644 index 000000000..9b4f25742 --- /dev/null +++ b/snovault/schema_formats.py @@ -0,0 +1,34 @@ +import re + +from jsonschema_serialize_fork import FormatChecker +from .server_defaults import ( + ACCESSION_PREFIX, + ACCESSION_TEST_PREFIX, +) + + +# Codes we allow for testing go here. +ACCESSION_TEST_CODES = "BS|ES|EX|FI|FS|IN|SR|WF" + +accession_re = re.compile(r'^%s[1-9A-Z]{9}$' % ACCESSION_PREFIX) + +test_accession_re = re.compile(r'^%s(%s)[0-9]{4}([0-9][0-9][0-9]|[A-Z][A-Z][A-Z])$' % ( + ACCESSION_TEST_PREFIX, ACCESSION_TEST_CODES)) + +uuid_re = re.compile(r'(?i)[{]?(?:[0-9a-f]{4}-?){8}[}]?') + + +@FormatChecker.cls_checks("uuid") +def is_uuid(instance): + # Python's UUID ignores all dashes, whereas Postgres is more strict + # http://www.postgresql.org/docs/9.2/static/datatype-uuid.html + return bool(uuid_re.match(instance)) + + +def is_accession(instance): + """Just a pattern checker.""" + # Unfortunately we cannot access the accessionType here + return ( + accession_re.match(instance) is not None or + test_accession_re.match(instance) is not None + ) diff --git a/snovault/schema_utils.py b/snovault/schema_utils.py index f5fbdf251..3555cf297 100644 --- a/snovault/schema_utils.py +++ b/snovault/schema_utils.py @@ -1,8 +1,8 @@ import codecs import collections -import uuid +import io import json -import requests +import uuid from datetime import datetime from dcicutils.misc_utils import ignored @@ -11,13 +11,13 @@ FormatChecker, RefResolver, ) -# TODO (C4-177): remove these imports (urlsplit, urlopen) when RefResolverOrdered is removed -from jsonschema_serialize_fork.compat import urlsplit, urlopen from jsonschema_serialize_fork.exceptions import ValidationError +import os from pyramid.path import AssetResolver, caller_package from pyramid.threadlocal import get_current_request from pyramid.traversal import find_resource from uuid import UUID +from .project_app import app_project from .util import ensurelist # This was originally an internal import from "." (__init__.py), but I have replaced that reference @@ -28,68 +28,87 @@ SERVER_DEFAULTS = {} +# TODO: Shouldn't this return func? Otherwise this: +# @server_default +# def foo(instance, subschema): +# return ...something... 
+# does (approximately): +# SERVER_DEFAULTS['foo'] = lambda(instance, subschema): ...something... +# server_default = None +# It feels like the function should still get defined. -kmp 17-Feb-2023 def server_default(func): SERVER_DEFAULTS[func.__name__] = func -class RefResolverOrdered(RefResolver): +class NoRemoteResolver(RefResolver): + def resolve_remote(self, uri): + raise ValueError('Resolution disallowed for: %s' % uri) + + +def favor_app_specific_schema(schema: str) -> str: """ - Overwrite the resolve_remote method in the RefResolver class, written - by lrowe. See: - https://github.com/lrowe/jsonschema_serialize_fork/blob/master/jsonschema_serialize_fork/validators.py - With python <3.6, json.loads was losing schema order for properties, facets, - and columns. Pass in the object_pairs_hook=collections.OrderedDict - argument to fix this. - WHEN ELASTICBEANSTALK IS RUNNING PY 3.6 WE CAN REMOVE THIS + If the given schema refers to a schema (file) which exists in the app-specific schemas + package/directory then favor that version of the file over the local version by returning + a reference to that schema; otherwise just returns the given schema. + + For example, IF the given schema is snovault:access_key.json AND the current app is fourfront AND + if the file encoded/schemas/access_key.json exists THEN returns: encoded:schemas/access_key.json + + This uses the dcicutils.project_utils mechanism to get the app-specific file/path name. """ - # TODO (C4-177): The stated condition is now met. We are reliably in Python 3.6, so this code should be removed. + if isinstance(schema, str): + schema_parts = schema.split(":") + schema_project = schema_parts[0] if len(schema_parts) > 1 else None + if schema_project != app_project().PACKAGE_NAME: + schema_filename = schema_parts[1] if len(schema_parts) > 1 else schema_parts[0] + app_specific_schema_filename = app_project().project_filename(f"/{schema_filename}") + if os.path.exists(app_specific_schema_filename): + schema = f"{app_project().PACKAGE_NAME}:{schema_filename}" + return schema - def resolve_remote(self, uri): - """ - Resolve a remote ``uri``. - Does not check the store first, but stores the retrieved document in - the store if :attr:`RefResolver.cache_remote` is True. - .. note:: - If the requests_ library is present, ``jsonschema`` will use it to - request the remote ``uri``, so that the correct encoding is - detected and used. - If it isn't, or if the scheme of the ``uri`` is not ``http`` or - ``https``, UTF-8 is assumed. - :argument str uri: the URI to resolve - :returns: the retrieved document - .. _requests: http://pypi.python.org/pypi/requests/ - """ - scheme = urlsplit(uri).scheme - - if scheme in self.handlers: - result = self.handlers[scheme](uri) - elif ( - scheme in ["http", "https"] and - # TODO (C4-177): PyCharm flagged free references to 'requests' in this file as undefined, - # so I've added an import at top of file. However, that suggests this 'elif' - # branch is never entered. When this tangled code is removed, I'll be happier. - # Meanwhile having a definition seems safer than not. 
-kmp 9-Jun-2020 - requests and - getattr(requests.Response, "json", None) is not None - ): - # Requests has support for detecting the correct encoding of - # json over http - if callable(requests.Response.json): - result = collections.OrderedDict(requests.get(uri).json()) - else: - result = collections.OrderedDict(requests.get(uri).json) - else: - # Otherwise, pass off to urllib and assume utf-8 - result = json.loads(urlopen(uri).read().decode("utf-8"), object_pairs_hook=collections.OrderedDict) - if self.cache_remote: - self.store[uri] = result - return result +def favor_app_specific_schema_ref(schema_ref: str) -> str: + """ + If the given schema_ref refers to a schema (file) which exists in the app-specific schemas + directory, AND it contains the specified element, then favor that version of the file over the + local version by returning a reference to that schema; otherwise just returns the given schema_ref. + For example, IF the given schema is mixins.json#/modified AND the current app is fourfront + AND if the file encoded/schemas/mixins.json exists AND if that file contains the modified + element THEN returns: file:///full-path-to/encoded/schemas/mixins.json#/modified -class NoRemoteResolver(RefResolverOrdered): - def resolve_remote(self, uri): - raise ValueError('Resolution disallowed for: %s' % uri) + This uses the dcicutils.project_utils mechanism to get the app-specific file/path name. + """ + def json_file_contains_element(json_filename: str, json_element: str) -> bool: + """ + If the given JSON file exists and contains the given JSON element name then + returns True, otherwise returnes False. The given JSON element may or may + not begin with a slash. Currently only looks at one single top-level element. + """ + if json_filename and json_element: + try: + with io.open(json_filename, "r") as json_f: + json_content = json.load(json_f) + json_element = json_element.strip("/") + if json_element: + if json_content.get(json_element): + return True + except Exception: + pass + return False + + if isinstance(schema_ref, str): + schema_parts = schema_ref.split("#") + schema_filename = schema_parts[0] + app_specific_schema_filename = app_project().project_filename(f"/schemas/{schema_filename}") + if os.path.exists(app_specific_schema_filename): + schema_element = schema_parts[1] if len(schema_parts) > 1 else None + if schema_element: + if json_file_contains_element(app_specific_schema_filename, schema_element): + schema_ref = f"file://{app_specific_schema_filename}#{schema_element}" + else: + schema_ref = f"file://{app_specific_schema_filename}" + return schema_ref def mixinSchemas(schema, resolver, key_name='properties'): @@ -102,6 +121,10 @@ def mixinSchemas(schema, resolver, key_name='properties'): for mixin in reversed(mixins): ref = mixin.get('$ref') if ref is not None: + # For mixins check if there is an associated app-specific + # schema file and favor that over the local one if any. + # TODO: This may be controversial and up for discussion. 
2023-05-27 + ref = favor_app_specific_schema_ref(ref) with resolver.resolving(ref) as resolved: mixin = resolved bases.append(mixin) @@ -278,6 +301,7 @@ class SchemaValidator(Draft4Validator): def load_schema(filename): + filename = favor_app_specific_schema(filename) if isinstance(filename, dict): schema = filename resolver = NoRemoteResolver.from_schema(schema) @@ -286,7 +310,7 @@ def load_schema(filename): asset = AssetResolver(caller_package()).resolve(filename) schema = json.load(utf8(asset.stream()), object_pairs_hook=collections.OrderedDict) - resolver = RefResolverOrdered('file://' + asset.abspath(), schema) + resolver = RefResolver('file://' + asset.abspath(), schema) # use mixinProperties, mixinFacets, mixinAggregations, and mixinColumns (if provided) schema = mixinSchemas( mixinSchemas( diff --git a/snovault/schema_views.py b/snovault/schema_views.py index 7e1ed9c8f..5e0ddb9a5 100644 --- a/snovault/schema_views.py +++ b/snovault/schema_views.py @@ -41,7 +41,7 @@ def _annotated_schema(type_info, request): schema['rdfs:seeAlso'] = urlparse(jsonld_base).path + type_info.name # add links to profiles of children schemas schema['children'] = [ - '/profiles/' + t_name + '.json' for t_name in type_info.child_types + '/profiles/' + t_name + '.json' for t_name in type_info.child_types ] if type_info.factory is None: diff --git a/snovault/schemas/access_key.json b/snovault/schemas/access_key.json new file mode 100644 index 000000000..03015ce29 --- /dev/null +++ b/snovault/schemas/access_key.json @@ -0,0 +1,69 @@ +{ + "title": "Admin access key", + "id": "/profiles/access_key.json", + "$schema": "http://json-schema.org/draft-04/schema#", + "required": [], + "additionalProperties": false, + "mixinProperties": [ + { + "$ref": "mixins.json#/schema_version" + }, + { + "$ref": "mixins.json#/uuid" + }, + { + "$ref": "mixins.json#/submitted" + }, + { + "$ref": "mixins.json#/modified" + } + ], + "type": "object", + "properties": { + "schema_version": { + "default": "1" + }, + "status": { + "title": "Status", + "type": "string", + "default": "current", + "enum": [ + "current", + "deleted" + ] + }, + "user": { + "title": "User", + "comment": "Only admins are allowed to set this value.", + "type": "string", + "linkTo": "User" + }, + "description": { + "title": "Description", + "type": "string", + "formInput": "textarea" + }, + "access_key_id": { + "title": "Access key ID", + "comment": "Only admins are allowed to set this value.", + "type": "string", + "uniqueKey": true + }, + "secret_access_key_hash": { + "title": "Secret access key Hash", + "comment": "Only admins are allowed to set this value.", + "type": "string" + }, + "expiration_date": { + "title": "Expiration Date", + "comment": "Only admins are allowed to set this value.", + "type": "string", + "permission": "restricted_fields" + } + }, + "facets": { + "user.display_title": { + "title": "User Name" + } + } +} diff --git a/snovault/schemas/filter_set.json b/snovault/schemas/filter_set.json new file mode 100644 index 000000000..f3ad6ee9d --- /dev/null +++ b/snovault/schemas/filter_set.json @@ -0,0 +1,156 @@ +{ + "title": "Filter Set", + "description": "Filter Set for encapsulating multiple queries", + "id": "/profiles/filter_set.json", + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "required": [ + "title" + ], + "identifyingProperties": [ + "uuid", + "aliases" + ], + "additionalProperties": false, + "mixinProperties": [ + { + "$ref": "mixins.json#/schema_version" + }, + { + "$ref": "mixins.json#/uuid" + }, + { + 
"$ref": "mixins.json#/aliases" + }, + { + "$ref": "mixins.json#/attribution" + }, + { + "$ref": "mixins.json#/submitted" + }, + { + "$ref": "mixins.json#/modified" + }, + { + "$ref": "mixins.json#/tags" + }, + { + "$ref": "mixins.json#/notes" + } + ], + "properties": { + "schema_version": { + "default": "2" + }, + "status": { + "title": "Status", + "type": "string", + "default": "draft", + "notes": "Unlike the status definition in mixins, this lacks permission:restricted_fields so people may edit FilterSet statuses they've saved.", + "enum": [ + "shared", + "obsolete", + "current", + "inactive", + "in review", + "draft", + "deleted" + ] + }, + "title": { + "title": "Title", + "description": "Title for this filter set", + "type": "string" + }, + "search_type": { + "title": "Item Type", + "description": "Item type that the filter set will work on.", + "type": "string", + "enum": [ + "Variant", + "VariantSample", + "StructuralVariant", + "StructuralVariantSample", + "Cohort" + ] + }, + "filter_blocks": { + "title": "Filter Blocks", + "description": "Filter queries that will be joined.", + "type": "array", + "uniqueItems": true, + "items": { + "title": "Filter Block", + "type": "object", + "properties": { + "name": { + "title": "Name", + "type": "string", + "description": "Name of the filter block" + }, + "query": { + "title": "Single query", + "description": "URL Query string", + "type": "string" + }, + "flags_applied": { + "title": "Flags applied", + "description": "Flag names that will be applied to this filter block", + "type": "array", + "uniqueItems": true, + "items": { + "title": "Flag", + "type": "string" + } + } + } + } + }, + "flags": { + "title": "Flags", + "description": "Flags that will be applied to filter blocks with name mapping.", + "type": "array", + "uniqueItems": true, + "items": { + "title": "Flag", + "type": "object", + "properties": { + "name": { + "title": "Name", + "type": "string", + "description": "Name of the flag" + }, + "query": { + "title": "Single query", + "description": "URL Query string", + "type": "string" + } + } + } + }, + "created_in_case_accession": { + "title": "Created in Case Accession", + "description": "Case in context of which this FilterSet was originally created.", + "notes": "Maybe eventually this could be 'case_accession_contexts' (list of accessions) or something of all Cases this is used for if multiple.", + "type": "string", + "format": "accession" + }, + "derived_from_preset_filterset": { + "title": "Derived From Preset UUID", + "description": "If this FilterSet was derived from a FilterSet Preset, then this info is saved here for reference and later de-duplication.", + "type": "string", + "format": "uuid" + }, + "preset_for_users": { + "title": "Preset for user", + "description": "Link filter set to user as a preset", + "type": "array", + "items": { + "title": "User UUID", + "type": "string", + "format": "uuid", + "uniqueItems": true + } + } + } +} diff --git a/snovault/schemas/ingestion_submission.json b/snovault/schemas/ingestion_submission.json new file mode 100644 index 000000000..58801c7f1 --- /dev/null +++ b/snovault/schemas/ingestion_submission.json @@ -0,0 +1,155 @@ +{ + "title": "Ingestion Submission", + "description": "Schema for metadata related to ingestion requests submitted to CGAP.", + "id": "/profiles/ingestion_submission.json", + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "required": [ "ingestion_type" ], + "additionalProperties": false, + "identifyingProperties": [ + "uuid", + "aliases" + 
], + "mixinProperties": [ + { + "$ref": "mixins.json#/schema_version" + }, + { + "$ref": "mixins.json#/aliases" + }, + { + "$ref": "mixins.json#/uuid" + }, + { + "$ref": "mixins.json#/documents" + }, + { + "$ref": "mixins.json#/attribution" + }, + { + "$ref": "mixins.json#/status" + }, + { + "$ref": "mixins.json#/submitted" + }, + { + "$ref": "mixins.json#/modified" + }, + { + "$ref": "mixins.json#/static_embeds" + } + ], + "mixinFacets": [ + { + "$ref": "mixins.json#/facets_common" + } + ], + "properties": { + "schema_version": { + "default": "1" + }, + "object_bucket": { + "title": "Object Bucket", + "description": "The name of the S3 bucket in which the 'object_name' resides.", + "type": "string" + }, + "object_name": { + "title": "Object Name", + "description": "The name of the S3 object corresponding to the submitted document.", + "type": "string" + }, + "ingestion_type": { + "title": "Ingestion Type", + "description": "The type of processing requested for this submission.", + "type": "string", + "enum": [ + "data_bundle", + "accessioning", + "metadata_bundle", + "ontology", + "simulated_bundle", + "vcf", + "genelist", + "variant_update", + "family_history" + ] + }, + "submission_id": { + "title": "Submission ID", + "description": "The name of a folder in the S3 bucket that contains all artifacts related to this submission.", + "type": "string" + }, + "parameters": { + "title": "Parameters", + "description": "A record of explicitly offered form parameters in the submission request.", + "type": "object", + "additionalProperties": true, + "properties": {} + }, + "processing_status": { + "title": "Processing Status", + "description": "A structured description of what has happened so far as the submission is processed.", + "type": "object", + "additionalProperties": false, + "properties": { + "state": { + "title": "State", + "description": "A state machine description of how processing is progressing (created, submitted, processed, or done).", + "type": "string", + "enum": [ + "created", + "submitted", + "processing", + "done" + ], + "default": "created" + }, + "outcome": { + "title": "Outcome", + "description": "A token describing the nature of the final outcome, if any. 
Options are unknown, success, failure, or error.", + "type": "string", + "enum": [ + "unknown", + "success", + "failure", + "error" + ], + "default": "unknown" + }, + "progress": { + "title": "Progress", + "description": "An adjectival word or phrase assessing progress, such as 'started', 'awaiting prerequisites', '88% done', or 'unavailable'.", + "type": "string", + "default": "unavailable" + } + } + }, + "result": { + "title": "Result", + "description": "An object representing a result if processing ran to completion, whether the outcome was success or failure.", + "type": "object", + "additionalProperties": true, + "properties": {}, + "default": {} + }, + "errors": { + "title": "Errors", + "description": "A list of error messages if processing was aborted before results were obtained.", + "type": "array", + "items": { + "title": "Error Message", + "description": "One of possibly several reasons that processing was not completed.", + "type": "string" + }, + "default": [] + }, + "additional_data": { + "title": "Additional Data", + "description": "Additional structured information resulting from processing, the nature of which may vary by ingestion_type and other factors.", + "type": "object", + "additionalItems": true, + "properties": {}, + "default": {} + } + } +} diff --git a/snovault/schemas/mixins.json b/snovault/schemas/mixins.json new file mode 100644 index 000000000..01cc15e6d --- /dev/null +++ b/snovault/schemas/mixins.json @@ -0,0 +1,519 @@ +{ + "title": "Mixin properties", + "schema_version": { + "schema_version": { + "title": "Schema Version", + "internal_comment": "Do not submit, value is assigned by the server. The version of the JSON schema that the server uses to validate the object. Schema version indicates generation of schema used to save version to to enable upgrade steps to work. Individual schemas should set the default.", + "type": "string", + "exclude_from": [ + "FFedit-create" + ], + "pattern": "^\\d+(\\.\\d+)*$", + "requestMethod": [] + } + }, + "uuid": { + "uuid": { + "title": "UUID", + "type": "string", + "format": "uuid", + "exclude_from": [ + "FFedit-create" + ], + "serverDefault": "uuid4", + "permission": "restricted_fields", + "requestMethod": "POST" + } + }, + "accession": { + "accession": { + "title": "Accession", + "description": "A unique identifier to be used to reference the object.", + "internal_comment": "Only admins are allowed to set or update this value.", + "exclude_from": [ + "FFedit-create" + ], + "type": "string", + "format": "accession", + "permission": "restricted_fields", + "serverDefault": "accession" + }, + "alternate_accessions": { + "title": "Alternate Accessions", + "description": "Accessions previously assigned to objects that have been merged with this object.", + "type": "array", + "lookup": 1000, + "internal_comment": "Only admins are allowed to set or update this value.", + "items": { + "title": "Alternate Accession", + "description": "An accession previously assigned to an object that has been merged with this object.", + "type": "string", + "permission": "restricted_fields", + "format": "accession" + } + } + }, + "aliases": { + "aliases": { + "title": "Aliases", + "description": "Institution-specific ID (e.g. bgm:cohort-1234-a).", + "type": "array", + "comment": "Colon separated lab name and lab identifier, no slash. (e.g. dcic-lab:42).", + "lookup": 1, + "uniqueItems": true, + "ff_flag": "clear clone", + "items": { + "uniqueKey": "alias", + "title": "ID Alias", + "description": "Institution-specific ID (e.g. 
bgm:cohort-1234-a).", + "type": "string", + "pattern": "^[^\\s\\\\\\/]+:[^\\s\\\\\\/]+$" + } + } + }, + "status": { + "status": { + "title": "Status", + "type": "string", + "default": "in review", + "permission": "restricted_fields", + "enum": [ + "shared", + "obsolete", + "current", + "inactive", + "in review", + "deleted" + ] + } + }, + "submitted": { + "date_created": { + "rdfs:subPropertyOf": "dc:created", + "title": "Date Created", + "lookup": 1000, + "exclude_from": [ + "FFedit-create" + ], + "type": "string", + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + } + ], + "serverDefault": "now", + "permission": "restricted_fields" + }, + "submitted_by": { + "rdfs:subPropertyOf": "dc:creator", + "title": "Submitted By", + "exclude_from": [ + "FFedit-create" + ], + "type": "string", + "linkTo": "User", + "lookup": 1000, + "serverDefault": "userid", + "permission": "restricted_fields" + } + }, + "modified": { + "last_modified": { + "title": "Last Modified", + "exclude_from": [ + "FFedit-create" + ], + "type": "object", + "additionalProperties": false, + "lookup": 1000, + "properties": { + "date_modified": { + "title": "Date Modified", + "description": "Do not submit, value is assigned by the server. The date the object is modified.", + "type": "string", + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + } + ], + "permission": "restricted_fields" + }, + "modified_by": { + "title": "Modified By", + "description": "Do not submit, value is assigned by the server. The user that modfied the object.", + "type": "string", + "linkTo": "User", + "permission": "restricted_fields" + } + } + } + }, + "attribution": { + "institution": { + "title": "Institution", + "description": "Institution associated with the submission.", + "type": "string", + "exclude_from": [ + "FFedit-create" + ], + "linkTo": "Institution", + "serverDefault": "userinstitution" + }, + "project": { + "title": "Project", + "description": "Project associated with the submission.", + "type": "string", + "exclude_from": [ + "FFedit-create" + ], + "linkTo": "Project", + "serverDefault": "userproject" + } + }, + "notes": { + "notes": { + "title": "Notes", + "description": "Internal notes.", + "type": "string", + "exclude_from": [ + "FFedit-create" + ], + "elasticsearch_mapping_index_type": { + "title": "Field mapping index type", + "description": "Defines one of three types of indexing available", + "type": "string", + "default": "analyzed", + "enum": [ + "analyzed", + "not_analyzed", + "no" + ] + } + } + }, + "documents": { + "documents": { + "title": "Documents", + "description": "Documents that provide additional information (not data file).", + "comment": "See Documents sheet or collection for existing items.", + "type": "array", + "uniqueItems": true, + "items": { + "title": "Document", + "description": "A document that provides additional information (not data file).", + "type": "string", + "linkTo": "Document" + } + } + }, + "attachment": { + "attachment": { + "title": "Attached File", + "description": "File attached to this Item.", + "type": "object", + "lookup": 1, + "additionalProperties": false, + "formInput": "file", + "attachment": true, + "ff_flag": "clear clone", + "properties": { + "download": { + "title": "File Name", + "description": "File Name of the attachment.", + "type": "string" + }, + "href": { + "internal_comment": "Internal webapp URL for document file", + "title": "href", + "description": "Path to download the file attached to this Item.", + "type": "string" + }, + "type": 
{ + "title": "Media Type", + "type": "string", + "enum": [ + "application/msword", + "application/vnd.ms-excel", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/pdf", + "application/zip", + "application/proband+xml", + "text/plain", + "text/tab-separated-values", + "image/jpeg", + "image/tiff", + "image/gif", + "text/html", + "image/png", + "image/svs", + "text/autosql" + ] + }, + "md5sum": { + "title": "MD5 Checksum", + "description": "Use this to ensure that your file was downloaded without errors or corruption.", + "type": "string", + "format": "md5sum" + }, + "size": { + "title": "Attachment size", + "description": "Size of the attachment on disk", + "type": "integer" + }, + "width": { + "title": "Image width", + "description": "Width of the image attached, in pixels.", + "type": "integer" + }, + "height": { + "title": "Image height", + "description": "Height of the image attached, in pixels.", + "type": "integer" + }, + "blob_id": { + "title": "Blob ID", + "type": "string", + "internal_comment": "blob storage ID. Use to like with s3/rds" + } + } + } + }, + "dbxrefs": { + "dbxrefs": { + "@type": "@id", + "rdfs:subPropertyOf": "rdfs:seeAlso", + "title": "External identifiers", + "comment": "Enter as a database name:identifier eg. HGNC:PARK2", + "description": "Unique identifiers from external resources.", + "type": "array", + "ff_flag": "clear clone", + "uniqueItems": true, + "items": { + "title": "External identifier", + "description": "A unique identifier from external resource.", + "type": "string" + } + } + }, + "interpretation": { + "interpretations": { + "title": "Clinical Interpretations", + "description": "Clinical Interpretation Notes connected to this item", + "type": "array", + "items": { + "title": "Clinical Interpretation", + "description": "Interpretation connected to this item", + "type": "string", + "linkTo": "NoteInterpretation" + } + }, + "discovery_interpretations": { + "title": "Discovery Interpretations", + "type": "array", + "items": { + "title": "Discovery Interpretation", + "type": "string", + "linkTo": "NoteDiscovery" + } + } + }, + "alternative_ids": { + "alternative_ids": { + "title": "Alternative identifiers", + "comment": "If an item generated from an owl file is deprecated/obsoleted in the ontology then often the id for the item is added to the new rdf that should replace it as an alternative id", + "description": "Alternative id - often referring to a deprecated object which this item replaces.", + "type": "array", + "uniqueItems": true, + "lookup": 1000, + "items": { + "title": "Alternative identifier", + "description": "An alternative identifier from this resource - referring to an object that has been deprecated/obsoleted.", + "type": "string" + } + } + }, + "ingestion_ids": { + "ingestion_ids": { + "title": "Submission IDs", + "description": "uuids of the IngestionSubmission items that created/edited this case", + "type": "array", + "items": { + "title": "Submission ID", + "description": "an IngestionSubmission item that created or edited this case", + "type": "string" + } + } + }, + "tags": { + "tags": { + "title": "Tags", + "description": "Key words that can tag an item - useful for filtering.", + "type": "array", + "lookup": 1000, + "uniqueItems": true, + "ff_flag": "clear clone", + "items": { + "title": "Tag", + "description": "A tag for the item.", + "type": "string", + "minLength": 1, + "maxLength": 50, + "pattern": "^[a-zA-Z0-9_\\-][a-zA-Z0-9_\\-\\s]+[a-zA-Z0-9_\\-]$" + } + } + }, + "static_embeds": { + 
"static_headers": { + "title": "Static Headers", + "description": "Array of linkTos for static sections to be displayed at the top of an item page", + "type": "array", + "uniqueItems": true, + "permission": "restricted_fields", + "items": { + "title": "Static Header", + "description": "Static section displayed at the top of an item page", + "type": "string", + "linkTo": "UserContent" + } + }, + "static_content": { + "title": "Static Content", + "description": "Array of objects containing linkTo UserContent and 'position' to be placed on Item view(s).", + "type": "array", + "uniqueItems": true, + "permission": "restricted_fields", + "items": { + "title": "Static Content Definition", + "description": "Link to UserContent Item plus location.", + "type": "object", + "required": [ + "location", + "content" + ], + "properties": { + "content": { + "type": "string", + "linkTo": "UserContent", + "title": "Link to Content", + "description": "A UserContent Item." + }, + "location": { + "type": "string", + "title": "Location of Content", + "description": "Where this content should be displayed. Item schemas could potentially define an enum to contrain values.", + "default": "header" + }, + "description": { + "type": "string", + "title": "Description", + "description": "Description or note about this content. Might be displayed as a footnote or caption, if applicable for view." + } + } + } + } + }, + "facets_common": { + "project.display_title": { + "title": "Project" + }, + "institution.display_title": { + "title": "Institution" + } + }, + "supplementary_files": { + "other_processed_files": { + "title": "Supplementary Processed Files", + "description": "Additional, archived or preliminary processed filesets that are derived from files in this experiment set.", + "type": "array", + "lookup": 410, + "exclude_from": [ + "FFedit-create" + ], + "items": { + "title": "Supplementary Processed Filesets", + "description": "Fileset metadata", + "type": "object", + "required": [ + "title", + "files" + ], + "properties": { + "title": { + "title": "Fileset Title", + "type": "string", + "lookup": 411 + }, + "type": { + "title": "Fileset Type", + "type": "string", + "lookup": 412, + "enum": [ + "supplementary", + "archived", + "preliminary", + "visualization" + ] + }, + "description": { + "title": "Description", + "type": "string", + "lookup": 413 + }, + "files": { + "title": "Files", + "type": "array", + "lookup": 414, + "items": { + "title": "File", + "type": "string", + "linkTo": "FileProcessed" + } + } + } + } + } + }, + "submitted_files": { + "files": { + "title": "Submitted Files", + "description": "Submitted files associated with the item", + "type": "array", + "items": { + "title": "Submitted File", + "description": "File metadata.", + "type": "string", + "linkTo": "File" + } + } + }, + "meta_workflow_runs": { + "meta_workflow_runs": { + "title": "MetaWorkflowRuns", + "description": "Bioinformatics analysis pipelines associated with this item", + "type": "array", + "items": { + "title": "MetaWorkflowRun", + "type": "string", + "linkTo": "MetaWorkflowRun" + } + } + }, + "processed_files": { + "processed_files": { + "title": "Processed Files", + "description": "Processed files from bioinformatics pipelines", + "type": "array", + "items": { + "title": "Processed File", + "type": "string", + "linkTo": "FileProcessed" + } + } + } +} diff --git a/snovault/schemas/user.json b/snovault/schemas/user.json new file mode 100644 index 000000000..d51792395 --- /dev/null +++ b/snovault/schemas/user.json @@ -0,0 +1,580 @@ 
+{ + "title": "User", + "id": "/profiles/user.json", + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "required": [ + "email", + "first_name", + "last_name" + ], + "identifyingProperties": [ + "uuid", + "email", + "aliases" + ], + "additionalProperties": false, + "mixinProperties": [ + { + "$ref": "mixins.json#/schema_version" + }, + { + "$ref": "mixins.json#/uuid" + }, + { + "$ref": "mixins.json#/aliases" + }, + { + "$ref": "mixins.json#/submitted" + }, + { + "$ref": "mixins.json#/modified" + } + ], + "properties": { + "schema_version": { + "default": "1" + }, + "status": { + "title": "Status", + "type": "string", + "default": "current", + "permission": "restricted_fields", + "enum": [ + "current", + "deleted", + "inactive", + "revoked" + ] + }, + "email": { + "title": "Account Email", + "description": "Email used to log in to the 4DN Portal.", + "type": "string", + "format": "email", + "lookup": 20, + "uniqueKey": true + }, + "first_name": { + "title": "First name", + "description": "The user's first (given) name.", + "type": "string", + "lookup": 30 + }, + "last_name": { + "title": "Last name", + "description": "The user's last (family) name.", + "type": "string", + "lookup": 40 + }, + "job_title": { + "title": "Job Title", + "type": "string", + "comment": "Can be user supplied - purely informational", + "lookup": 41 + }, + "groups": { + "title": "Groups", + "description": "Additional access control groups", + "note": "USE WITH CAUTION - currently how we add admin access to a user", + "type": "array", + "lookup": 80, + "uniqueItems": true, + "permission": "restricted_fields", + "items": { + "type": "string" + } + }, + "preferred_email": { + "title": "Preferred Contact Email", + "description": "Email to contact by, if different from account/sign-in e-mail address", + "type": "string", + "format": "email", + "lookup": 45 + }, + "phone1": { + "title": "Primary phone number", + "description": "The user's primary phone number (with country code).", + "type": "string", + "format": "phone", + "lookup": 100, + "pattern": "[+]?[\\d]{10,36}((\\sx|\\sext|\\sextension)(\\s)?[\\d]{1,7})?$" + }, + "phone2": { + "title": "Alternate phone number", + "description": "The user's secondary phone number (with country code).", + "type": "string", + "format": "phone" + }, + "fax": { + "title": "Fax number", + "description": "A FAX number for the user (with country code).", + "type": "string", + "format": "phone" + }, + "skype": { + "title": "Skype ID", + "type": "string", + "lookup": 110 + }, + "google": { + "title": "Google ID", + "type": "string", + "lookup": 120 + }, + "timezone": { + "title": "Timezone", + "description": "The timezone the user is associated with.", + "type": "string", + "lookup": 130, + "default": "US/Eastern", + "enum": [ + "Africa/Abidjan", + "Africa/Accra", + "Africa/Addis_Ababa", + "Africa/Algiers", + "Africa/Asmara", + "Africa/Bamako", + "Africa/Bangui", + "Africa/Banjul", + "Africa/Bissau", + "Africa/Blantyre", + "Africa/Brazzaville", + "Africa/Bujumbura", + "Africa/Cairo", + "Africa/Casablanca", + "Africa/Ceuta", + "Africa/Conakry", + "Africa/Dakar", + "Africa/Dar_es_Salaam", + "Africa/Djibouti", + "Africa/Douala", + "Africa/El_Aaiun", + "Africa/Freetown", + "Africa/Gaborone", + "Africa/Harare", + "Africa/Johannesburg", + "Africa/Juba", + "Africa/Kampala", + "Africa/Khartoum", + "Africa/Kigali", + "Africa/Kinshasa", + "Africa/Lagos", + "Africa/Libreville", + "Africa/Lome", + "Africa/Luanda", + "Africa/Lubumbashi", + "Africa/Lusaka", + "Africa/Malabo", + 
"Africa/Maputo", + "Africa/Maseru", + "Africa/Mbabane", + "Africa/Mogadishu", + "Africa/Monrovia", + "Africa/Nairobi", + "Africa/Ndjamena", + "Africa/Niamey", + "Africa/Nouakchott", + "Africa/Ouagadougou", + "Africa/Porto-Novo", + "Africa/Sao_Tome", + "Africa/Tripoli", + "Africa/Tunis", + "Africa/Windhoek", + "America/Adak", + "America/Anchorage", + "America/Anguilla", + "America/Antigua", + "America/Araguaina", + "America/Argentina/Buenos_Aires", + "America/Argentina/Catamarca", + "America/Argentina/Cordoba", + "America/Argentina/Jujuy", + "America/Argentina/La_Rioja", + "America/Argentina/Mendoza", + "America/Argentina/Rio_Gallegos", + "America/Argentina/Salta", + "America/Argentina/San_Juan", + "America/Argentina/San_Luis", + "America/Argentina/Tucuman", + "America/Argentina/Ushuaia", + "America/Aruba", + "America/Asuncion", + "America/Atikokan", + "America/Bahia", + "America/Bahia_Banderas", + "America/Barbados", + "America/Belem", + "America/Belize", + "America/Blanc-Sablon", + "America/Boa_Vista", + "America/Bogota", + "America/Boise", + "America/Cambridge_Bay", + "America/Campo_Grande", + "America/Cancun", + "America/Caracas", + "America/Cayenne", + "America/Cayman", + "America/Chicago", + "America/Chihuahua", + "America/Costa_Rica", + "America/Creston", + "America/Cuiaba", + "America/Curacao", + "America/Danmarkshavn", + "America/Dawson", + "America/Dawson_Creek", + "America/Denver", + "America/Detroit", + "America/Dominica", + "America/Edmonton", + "America/Eirunepe", + "America/El_Salvador", + "America/Fortaleza", + "America/Glace_Bay", + "America/Godthab", + "America/Goose_Bay", + "America/Grand_Turk", + "America/Grenada", + "America/Guadeloupe", + "America/Guatemala", + "America/Guayaquil", + "America/Guyana", + "America/Halifax", + "America/Havana", + "America/Hermosillo", + "America/Indiana/Indianapolis", + "America/Indiana/Knox", + "America/Indiana/Marengo", + "America/Indiana/Petersburg", + "America/Indiana/Tell_City", + "America/Indiana/Vevay", + "America/Indiana/Vincennes", + "America/Indiana/Winamac", + "America/Inuvik", + "America/Iqaluit", + "America/Jamaica", + "America/Juneau", + "America/Kentucky/Louisville", + "America/Kentucky/Monticello", + "America/Kralendijk", + "America/La_Paz", + "America/Lima", + "America/Los_Angeles", + "America/Lower_Princes", + "America/Maceio", + "America/Managua", + "America/Manaus", + "America/Marigot", + "America/Martinique", + "America/Matamoros", + "America/Mazatlan", + "America/Menominee", + "America/Merida", + "America/Metlakatla", + "America/Mexico_City", + "America/Miquelon", + "America/Moncton", + "America/Monterrey", + "America/Montevideo", + "America/Montreal", + "America/Montserrat", + "America/Nassau", + "America/New_York", + "America/Nipigon", + "America/Nome", + "America/Noronha", + "America/North_Dakota/Beulah", + "America/North_Dakota/Center", + "America/North_Dakota/New_Salem", + "America/Ojinaga", + "America/Panama", + "America/Pangnirtung", + "America/Paramaribo", + "America/Phoenix", + "America/Port-au-Prince", + "America/Port_of_Spain", + "America/Porto_Velho", + "America/Puerto_Rico", + "America/Rainy_River", + "America/Rankin_Inlet", + "America/Recife", + "America/Regina", + "America/Resolute", + "America/Rio_Branco", + "America/Santa_Isabel", + "America/Santarem", + "America/Santiago", + "America/Santo_Domingo", + "America/Sao_Paulo", + "America/Scoresbysund", + "America/Shiprock", + "America/Sitka", + "America/St_Barthelemy", + "America/St_Johns", + "America/St_Kitts", + "America/St_Lucia", + 
"America/St_Thomas", + "America/St_Vincent", + "America/Swift_Current", + "America/Tegucigalpa", + "America/Thule", + "America/Thunder_Bay", + "America/Tijuana", + "America/Toronto", + "America/Tortola", + "America/Vancouver", + "America/Whitehorse", + "America/Winnipeg", + "America/Yakutat", + "America/Yellowknife", + "Antarctica/Casey", + "Antarctica/Davis", + "Antarctica/DumontDUrville", + "Antarctica/Macquarie", + "Antarctica/Mawson", + "Antarctica/McMurdo", + "Antarctica/Palmer", + "Antarctica/Rothera", + "Antarctica/South_Pole", + "Antarctica/Syowa", + "Antarctica/Vostok", + "Arctic/Longyearbyen", + "Asia/Aden", + "Asia/Almaty", + "Asia/Amman", + "Asia/Anadyr", + "Asia/Aqtau", + "Asia/Aqtobe", + "Asia/Ashgabat", + "Asia/Baghdad", + "Asia/Bahrain", + "Asia/Baku", + "Asia/Bangkok", + "Asia/Beirut", + "Asia/Bishkek", + "Asia/Brunei", + "Asia/Choibalsan", + "Asia/Chongqing", + "Asia/Colombo", + "Asia/Damascus", + "Asia/Dhaka", + "Asia/Dili", + "Asia/Dubai", + "Asia/Dushanbe", + "Asia/Gaza", + "Asia/Harbin", + "Asia/Hebron", + "Asia/Ho_Chi_Minh", + "Asia/Hong_Kong", + "Asia/Hovd", + "Asia/Irkutsk", + "Asia/Jakarta", + "Asia/Jayapura", + "Asia/Jerusalem", + "Asia/Kabul", + "Asia/Kamchatka", + "Asia/Karachi", + "Asia/Kashgar", + "Asia/Kathmandu", + "Asia/Khandyga", + "Asia/Kolkata", + "Asia/Krasnoyarsk", + "Asia/Kuala_Lumpur", + "Asia/Kuching", + "Asia/Kuwait", + "Asia/Macau", + "Asia/Magadan", + "Asia/Makassar", + "Asia/Manila", + "Asia/Muscat", + "Asia/Nicosia", + "Asia/Novokuznetsk", + "Asia/Novosibirsk", + "Asia/Omsk", + "Asia/Oral", + "Asia/Phnom_Penh", + "Asia/Pontianak", + "Asia/Pyongyang", + "Asia/Qatar", + "Asia/Qyzylorda", + "Asia/Rangoon", + "Asia/Riyadh", + "Asia/Sakhalin", + "Asia/Samarkand", + "Asia/Seoul", + "Asia/Shanghai", + "Asia/Singapore", + "Asia/Taipei", + "Asia/Tashkent", + "Asia/Tbilisi", + "Asia/Tehran", + "Asia/Thimphu", + "Asia/Tokyo", + "Asia/Ulaanbaatar", + "Asia/Urumqi", + "Asia/Ust-Nera", + "Asia/Vientiane", + "Asia/Vladivostok", + "Asia/Yakutsk", + "Asia/Yekaterinburg", + "Asia/Yerevan", + "Atlantic/Azores", + "Atlantic/Bermuda", + "Atlantic/Canary", + "Atlantic/Cape_Verde", + "Atlantic/Faroe", + "Atlantic/Madeira", + "Atlantic/Reykjavik", + "Atlantic/South_Georgia", + "Atlantic/St_Helena", + "Atlantic/Stanley", + "Australia/Adelaide", + "Australia/Brisbane", + "Australia/Broken_Hill", + "Australia/Currie", + "Australia/Darwin", + "Australia/Eucla", + "Australia/Hobart", + "Australia/Lindeman", + "Australia/Lord_Howe", + "Australia/Melbourne", + "Australia/Perth", + "Australia/Sydney", + "Canada/Atlantic", + "Canada/Central", + "Canada/Eastern", + "Canada/Mountain", + "Canada/Newfoundland", + "Canada/Pacific", + "Europe/Amsterdam", + "Europe/Andorra", + "Europe/Athens", + "Europe/Belgrade", + "Europe/Berlin", + "Europe/Bratislava", + "Europe/Brussels", + "Europe/Bucharest", + "Europe/Budapest", + "Europe/Busingen", + "Europe/Chisinau", + "Europe/Copenhagen", + "Europe/Dublin", + "Europe/Gibraltar", + "Europe/Guernsey", + "Europe/Helsinki", + "Europe/Isle_of_Man", + "Europe/Istanbul", + "Europe/Jersey", + "Europe/Kaliningrad", + "Europe/Kiev", + "Europe/Lisbon", + "Europe/Ljubljana", + "Europe/London", + "Europe/Luxembourg", + "Europe/Madrid", + "Europe/Malta", + "Europe/Mariehamn", + "Europe/Minsk", + "Europe/Monaco", + "Europe/Moscow", + "Europe/Oslo", + "Europe/Paris", + "Europe/Podgorica", + "Europe/Prague", + "Europe/Riga", + "Europe/Rome", + "Europe/Samara", + "Europe/San_Marino", + "Europe/Sarajevo", + "Europe/Simferopol", + "Europe/Skopje", + 
"Europe/Sofia", + "Europe/Stockholm", + "Europe/Tallinn", + "Europe/Tirane", + "Europe/Uzhgorod", + "Europe/Vaduz", + "Europe/Vatican", + "Europe/Vienna", + "Europe/Vilnius", + "Europe/Volgograd", + "Europe/Warsaw", + "Europe/Zagreb", + "Europe/Zaporozhye", + "Europe/Zurich", + "GMT", + "Indian/Antananarivo", + "Indian/Chagos", + "Indian/Christmas", + "Indian/Cocos", + "Indian/Comoro", + "Indian/Kerguelen", + "Indian/Mahe", + "Indian/Maldives", + "Indian/Mauritius", + "Indian/Mayotte", + "Indian/Reunion", + "Pacific/Apia", + "Pacific/Auckland", + "Pacific/Chatham", + "Pacific/Chuuk", + "Pacific/Easter", + "Pacific/Efate", + "Pacific/Enderbury", + "Pacific/Fakaofo", + "Pacific/Fiji", + "Pacific/Funafuti", + "Pacific/Galapagos", + "Pacific/Gambier", + "Pacific/Guadalcanal", + "Pacific/Guam", + "Pacific/Honolulu", + "Pacific/Johnston", + "Pacific/Kiritimati", + "Pacific/Kosrae", + "Pacific/Kwajalein", + "Pacific/Majuro", + "Pacific/Marquesas", + "Pacific/Midway", + "Pacific/Nauru", + "Pacific/Niue", + "Pacific/Norfolk", + "Pacific/Noumea", + "Pacific/Pago_Pago", + "Pacific/Palau", + "Pacific/Pitcairn", + "Pacific/Pohnpei", + "Pacific/Port_Moresby", + "Pacific/Rarotonga", + "Pacific/Saipan", + "Pacific/Tahiti", + "Pacific/Tarawa", + "Pacific/Tongatapu", + "Pacific/Wake", + "Pacific/Wallis", + "US/Alaska", + "US/Arizona", + "US/Central", + "US/Eastern", + "US/Hawaii", + "US/Mountain", + "US/Pacific", + "UTC" + ] + }, + "was_unauthorized": { + "title": "Was Unauthorized", + "permission": "restricted_fields", + "description": "Flag that is True if user was created with create-unauthorized-user endpoint", + "type": "boolean" + } + }, + "columns": { + "job_title": { + "title": "Job Title", + "default_hidden": true + } + } +} diff --git a/snovault/search/compound_search.py b/snovault/search/compound_search.py new file mode 100644 index 000000000..51116fb6d --- /dev/null +++ b/snovault/search/compound_search.py @@ -0,0 +1,433 @@ +import json +# import os +import urllib.parse + +from pyramid.httpexceptions import HTTPBadRequest +# from pyramid.request import Request +from pyramid.view import view_config +from ..interfaces import TYPES + +from ..util import debug_log, get_item_or_none +from ..types.filter_set import FLAGS, FILTER_BLOCKS + +from .lucene_builder import LuceneBuilder +from .search import SearchBuilder, search as single_query_search +from .search_utils import execute_search, build_sort_dicts, make_search_subreq + + +def includeme(config): + config.add_route('compound_search', '/compound_search') + config.add_route('build_query', '/build_query{slash:/?}') + config.scan(__name__) + + +class CompoundSearchBuilder: + """ Encapsulates methods needed to run a compound search, in other words an + AND or an OR query combining a set of queries. + + Entry point is "execute_filter_set". + """ + TYPE = 'search_type' + ID = '@id' + QUERY = 'query' + NAME = 'name' + FLAGS_APPLIED = 'flags_applied' + BUILD_QUERY_URL = '/build_query/' + + @staticmethod + def build_subreq_from_single_query(request, query, route='/search/', from_=0, to=10): + """ Builds a Request object that is a proper sub-request of the given request. + Passes flags directly as query string params. Intended for use with search. 
+ + :param request: request to build off of + :param query: search query + :param route: route of sub-request to build + :param from_: starting ES hit index + :param to: how many results to returning, index starting at from_ + :return: new Request + """ + + # do some sanitization + # This will actually urlencode the entire previous string, so: + # `&inheritance_modes=Neurodev+2500%2B+%282598%29` will become: + # `&inheritance_modes=Neurodev%2B2500%252B%2B%25282598%2529` (the "&" & "=" chars can be encoded too, but left intact) + # so the "+" become urlencoded into 'plus-sign' encodings (%2B) (rather than space - %20), and previous 'plus-sign' + # encodings go from `%2B` to `%252B` (the percent sign gets encoded). So essentially is an encoding of an encoding. + # But it works once goes into snovault's `make_subreq` downstream. + if len(query) > 0 and query[0] != "?": + query = '?' + query + + # If any '?', '&', or '=' in search term, should have been pre-encoded. + # Meant to handle "+" especially. + query = urllib.parse.quote(query, safe="?&=") + + subreq = make_search_subreq(request, route + '%s&from=%s&limit=%s' % (query, from_, to)) + subreq.headers['Accept'] = 'application/json' + + return subreq + + @staticmethod + def combine_query_strings(qstring1, qstring2): + """ Builds a single URL query from the given flags and blocks. + + :param qstring1: flags, usually ? prefixed + :param qstring2: blocks to add to it + :return: combined query + """ + + dict_to_merge_into = dict(urllib.parse.parse_qs(qstring1.lstrip('?'), keep_blank_values=True)) + dict_with_more_vals = dict(urllib.parse.parse_qs(qstring2.lstrip('?'), keep_blank_values=True)) + + for k, v in dict_with_more_vals.items(): + if k in dict_to_merge_into: + dict_to_merge_into[k] += v + else: + dict_to_merge_into[k] = v + + return urllib.parse.urlencode(dict_to_merge_into, doseq=True) + + @staticmethod + def format_result_for_endpoint_response(request, es_results, filter_set, result_sort, search_builder_instance): + """ Formats es_results from filter_set into a dictionary containing total and @graph, + setting status on the request if needed. Also sets "__matching_filter_block_names" computed field to identify + which filter block indices the result matched. + + :param request: current request + :param es_results: response from ES + :return: dictionary response + """ + + if es_results['hits']['total']['value'] == 0: + request.response.status_code = 404 # see google webmaster doc on why + + if search_builder_instance.search_session_id: # Is 'None' if e.g. limit=all + request.response.set_cookie('searchSessionID', search_builder_instance.search_session_id) + + result_list = [] + for hit in es_results['hits'].get("hits", []): + result = hit['_source']['embedded'] + # Matched query names are returned as string here, regardless of their specified original type. + result["__matching_filter_block_names"] = hit.get("matched_queries", []) + result_list.append(result) + + columns = SearchBuilder.build_initial_columns([ request.registry[TYPES][filter_set[CompoundSearchBuilder.TYPE]].schema ]) + # We used multiple filter blocks, so we add in column for "__matching_filter_block_names" + columns["__matching_filter_block_names"] = { + "title": "Filter Blocks Matched", + "order": 1000 + } + + return { + # "@id": "/compound_search", # Removed - presense of @id on UI is inferred to mean that there is 1 filter block in request. 
+ # "@type": ["SearchResults"], # Not necessary from UI atm but can consider adding for semantics + "total": es_results['hits'].get("total", {}).get("value", 0), + "@graph": result_list, + "columns": columns, + "sort": result_sort + } + + @staticmethod + def invoke_search(context, request, subreq, search_type, return_generator=False): + """ + Wrapper method that invokes the core search API (/search/) with the given subreq and + copies over status code to parent response. + + :param context: context of parent request + :param request: parent request + :param subreq: subrequest + :return: response from /search/ + """ + # Calls SearchBuilder.format_results internally, incl. adding searchSessionID cookie to response. + response = single_query_search(context, subreq, search_type, return_generator) + if subreq.response.status_code == 404: + request.response.status_code = 404 + return response + + @staticmethod + def _add_type_to_flag_if_needed(flags, type_flag): + """ Modifies 'flags' in place by adding type query if it is not present + + :param flags: query substring + :param type_flag: query substring containing type requirement + :return: query string that combines the two, if type requirement isn't already there + """ + if type_flag not in flags or type_flag.lower() not in flags: + if len(flags) > 0: + flags += '&' + type_flag + else: + flags = type_flag + + return flags + + @staticmethod + def es_results_generator(es_results): + for hit in es_results['hits'].get('hits', []): + yield hit['_source']['embedded'] + + @staticmethod + def execute_filter_set(context, request, filter_set, from_=0, to=10, + global_flags=None, return_generator=False, intersect=False): + """ Executes the given filter_set. This function contains the core functionality of the class. + A filter_set with respect to this function is just a dictionary containing the following things: + 1. 'search_type' is the item type we are executing on. Required. + 2. 'filter_blocks' contains the filter blocks we would like to apply with named flags we + would like to apply on this block as well + 3. 'flags' is a dictionary containing named flags to be applied to individual filter_blocks + by name. + + NOTE: if neither 'flags' nor 'filter_blocks' is specified then a generic type=Item + search will be executed. If just 'flags' is specified with no filter_blocks, the + flags will be ignored (since there are no filter_blocks to apply it to). 
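+
+        A minimal sketch of such a dictionary (hypothetical names and query
+        strings, following the structure described above):
+
+            {
+                'search_type': 'Item',
+                'flags': [{'name': 'status_flag', 'query': 'status=current'}],
+                'filter_blocks': [
+                    {'name': 'block0',
+                     'query': 'field1=value1',
+                     'flags_applied': ['status_flag']}
+                ]
+            }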
+ """ + cls = CompoundSearchBuilder + filter_blocks = filter_set.get(FILTER_BLOCKS, []) + flags = filter_set.get(FLAGS, None) + doc_type = filter_set.get(CompoundSearchBuilder.TYPE) + search_type = filter_set.get(cls.TYPE, 'Item') # if type not set, attempt to search on item + type_flag = 'type=%s' % search_type + + # if we have no filter blocks, there is no context to enable flags, so + # pass type_flag + global_flags + if not filter_blocks and flags: + if global_flags: + query = cls.combine_query_strings(global_flags, type_flag) + else: + query = type_flag + subreq = cls.build_subreq_from_single_query(request, query, from_=from_, to=to) + return CompoundSearchBuilder.invoke_search(context, request, subreq, search_type, return_generator) + + # if we specified global_flags, combine that query with the single filter_block, + # otherwise pass the filter_block query directly + elif not flags and len(filter_blocks) == 1: + block = filter_blocks[0] + block_query = block[cls.QUERY] + if global_flags: + query = cls.combine_query_strings(global_flags, block_query) + else: + query = block_query + query = cls._add_type_to_flag_if_needed(query, type_flag) + subreq = cls.build_subreq_from_single_query(request, query, from_=from_, to=to) + return CompoundSearchBuilder.invoke_search(context, request, subreq, search_type, return_generator) + + # Extract query string and list of applied flags, add global_flags to block_query first + # then add flags as applied and type_flag if needed. + elif flags and len(filter_blocks) == 1: + block_query = filter_blocks[0][cls.QUERY] + flags_applied = filter_blocks[0][cls.FLAGS_APPLIED] + if global_flags: + query = cls.combine_query_strings(global_flags, block_query) + else: + query = block_query + for applied_flag in flags_applied: + for flag in flags: + if flag[cls.NAME] == applied_flag: + query = cls.combine_query_strings(query, flag[cls.QUERY]) + break + query = cls._add_type_to_flag_if_needed(query, type_flag) + subreq = cls.build_subreq_from_single_query(request, query, from_=from_, to=to) + return CompoundSearchBuilder.invoke_search(context, request, subreq, search_type, return_generator) + + # Build the compound_query + # Iterate through filter_blocks, adding global_flags if specified and adding flags if specified + else: + sub_queries = [] + for block_index, block in enumerate(filter_blocks): + block_query = block[cls.QUERY] + flags_applied = block[cls.FLAGS_APPLIED] + query = block_query + if global_flags: + query = cls.combine_query_strings(global_flags, block_query) + for applied_flag in flags_applied: + for flag in flags: + if flag[cls.NAME] == applied_flag: + query = cls.combine_query_strings(query, flag[cls.QUERY]) + break + query = cls._add_type_to_flag_if_needed(query, type_flag) + subreq = cls.build_subreq_from_single_query(request, query, route=cls.BUILD_QUERY_URL, + from_=from_, to=to) + sub_query = request.invoke_subrequest(subreq).json[cls.QUERY] + # See https://www.elastic.co/guide/en/elasticsearch/reference/7.17/query-dsl-bool-query.html#named-queries + sub_query["bool"]["_name"] = str(block.get("name", block_index)) # note in ES7 numbers here must be cast to string + sub_queries.append(sub_query) + + + compound_query = LuceneBuilder.compound_search(sub_queries, intersect=intersect) + compound_subreq = cls.build_subreq_from_single_query(request, ('?type=' + search_type)) + + requested_sorts = filter_set.get("sort", []) + if not requested_sorts and global_flags: + requested_sorts = urllib.parse.parse_qs(global_flags).get("sort", []) + + sort, 
result_sort = build_sort_dicts(requested_sorts, request, [ doc_type ]) + + search_builder_instance = SearchBuilder.from_search(context, compound_subreq, compound_query, return_generator=return_generator) + search_builder_instance.assure_session_id() + search_builder_instance.query['sort'] = sort + es_results = None + if return_generator and to == "all": + es_results = search_builder_instance.execute_search_for_all_results() + # es_results['hits']['hits'] WILL ALREADY BE A GENERATOR/ITERATOR -- SEE search.py:execute_search_for_all_results + # (Unless result count is <100, in which case will be a list) + return cls.es_results_generator(es_results) + elif return_generator and to != "all": + # TODO: Consider enabling somehow + raise HTTPBadRequest("`return_generator` without `limit=all` not yet supported") + elif to == "all": + raise HTTPBadRequest("`limit=all` is not permitted without `return_generator`") + else: + es_results = execute_search( + es=search_builder_instance.es, + query=search_builder_instance.query, + index=search_builder_instance.es_index, + from_=from_, + size=to, + session_id=search_builder_instance.search_session_id + ) + return cls.format_result_for_endpoint_response(request, es_results, filter_set, result_sort, search_builder_instance) + + @staticmethod + def validate_flag(flag): + """ Validates a given flag has the correct structure/types """ + if CompoundSearchBuilder.NAME not in flag or CompoundSearchBuilder.QUERY not in flag: # existence + raise HTTPBadRequest('Passed a bad flag with missing structure: %s' % flag) + elif not isinstance(flag[CompoundSearchBuilder.NAME], str): # type + raise HTTPBadRequest('Passed a bad flag with incorrect parameter for field %s: %s' % (CompoundSearchBuilder.NAME, flag)) + elif not isinstance(flag[CompoundSearchBuilder.QUERY], str): # type + raise HTTPBadRequest('Passed a bad flag with incorrect parameter for field %s: %s' % (CompoundSearchBuilder.QUERY, flag)) + + @staticmethod + def validate_filter_block(filter_block): + """ Validates a given filter_block has correct structure/types """ + if CompoundSearchBuilder.QUERY not in filter_block or CompoundSearchBuilder.FLAGS_APPLIED not in filter_block: + raise HTTPBadRequest('Passed a bad filter_block with missing structure: %s' % filter_block) + elif not isinstance(filter_block[CompoundSearchBuilder.QUERY], str): + raise HTTPBadRequest('Passed a bad filter_block with wrong type for field %s: %s' % (CompoundSearchBuilder.QUERY, filter_block)) + elif not isinstance(filter_block[CompoundSearchBuilder.FLAGS_APPLIED], list): + raise HTTPBadRequest('Passed a bad filter_block with wrong type for field %s: %s' % + (CompoundSearchBuilder.FLAGS_APPLIED, filter_block)) + + @staticmethod + def extract_filter_set_from_search_body(request, body): + """ Validates the compound_search POST request body, returning a dictionary filter_set item. + + :param request: current request + :param body: body of POST request (in JSON) + :return: a filter_set, to be executed + """ + if CompoundSearchBuilder.ID in body: # prioritize @id + return get_item_or_none(request, body[CompoundSearchBuilder.ID]) + else: + filter_set = {} + if CompoundSearchBuilder.TYPE in body: + filter_set[CompoundSearchBuilder.TYPE] = body[CompoundSearchBuilder.TYPE] + else: + raise HTTPBadRequest('Tried to execute a filter_set without specifying a type!') + if FLAGS in body: + if not isinstance(body[FLAGS], list): + raise HTTPBadRequest('Passed a bad value for flags: %s -- Expected a list.' 
% body[FLAGS]) + for flag in body[FLAGS]: + CompoundSearchBuilder.validate_flag(flag) + filter_set[FLAGS] = body[FLAGS] + if FILTER_BLOCKS in body: + if not isinstance(body[FILTER_BLOCKS], list): + raise HTTPBadRequest('Passed a bad value for flags: %s -- Expected a list.' % body[FILTER_BLOCKS]) + for filter_block in body[FILTER_BLOCKS]: + CompoundSearchBuilder.validate_filter_block(filter_block) + filter_set[FILTER_BLOCKS] = body[FILTER_BLOCKS] + return filter_set + + + + + +@view_config(route_name='build_query', request_method='GET', permission='search') +@debug_log +def build_query(context, request): + """ Runs the query construction step of the search, returning the lucene query as the response. + Used as a helper for compound_search, making 1 sub-request per filter_block. + """ + builder = SearchBuilder(context, request) + builder._build_query() + return builder.query + + +@view_config(route_name='compound_search', request_method='POST', permission='search') +@debug_log +def compound_search(context, request): + """ Executes a compound_search given a uuid of a filter_set (or filter_set props, tbd). + + You have two options when executing a compound search - you can pass a uuid of an existing + filter_set item or you can pass the relevant filter_set fields directly. This allows the + client to acquire/cache filter_sets then pass modified query params directly to ES without + triggering a write to the base filter_set. + + POST Body Syntax: + { + # flags to be applied globally to the search + "global_flags": , + + # uuid of a filter_set item to execute + "uuid": , # NOTE: if you provide this, the following filter_set related fields are IGNORED + + "search_type": , # item type this filter_set is searching on + "flags": [ + { + "name": "flag_name_one", + "query": + }, + { + "name": "flag_name_two", + "query": + } + ... + ] + + # list of queries to be compounded with below structure + "filter_blocks": [ + { + "query": , (to be combined with global_flags, if specified) + "flags_applied": [ + "flag_name_one", + "flag_name_two" + ] + } + ... + ] + + # other options + "from": , # starting index in ES search results to return, default 0 + "limit": , # number of results to return, default 25 + "return_generator": true/false, default false + "intersect": true/false, if true will compute intersection of filter_blocks, default false + } + + """ + body = json.loads(request.body) + + filter_set = CompoundSearchBuilder.extract_filter_set_from_search_body(request, body) + global_flags = body.get('global_flags', None) + intersect = True if body.get('intersect', False) else False + + # Disabled for time being to allow test(s) to pass. Not sure whether to add Project to FilterSet schema 'search_type' enum. 
+ # if filter_set.get(CompoundSearchBuilder.TYPE) not in request.registry[TYPES]["FilterSet"].schema["properties"][CompoundSearchBuilder.TYPE]["enum"]: + # raise HTTPBadRequest("Passed bad {} body param: {}".format(CompoundSearchBuilder.TYPE, filter_set.get(CompoundSearchBuilder.TYPE))) + + from_ = body.get('from', 0) + limit = body.get('limit', 10) # pagination size 10 works better with ECS + if limit == "all": + raise HTTPBadRequest("compound_search does not support limit=all at this time.") + if limit > 1000: + limit = 1000 + if from_ < 0 or limit < 0: + raise HTTPBadRequest('Passed bad from, to request body params: %s, %s' % (from_, limit)) + + return CompoundSearchBuilder.execute_filter_set( + context, + request, + filter_set, + from_=from_, + to=limit, + global_flags=global_flags, + intersect=intersect, + return_generator=False + ) diff --git a/snovault/search/lucene_builder.py b/snovault/search/lucene_builder.py new file mode 100644 index 000000000..a63c62401 --- /dev/null +++ b/snovault/search/lucene_builder.py @@ -0,0 +1,1158 @@ +import re +import structlog +from copy import deepcopy +from collections import OrderedDict +from pyramid.httpexceptions import HTTPBadRequest +from urllib.parse import urlencode +from snovault import TYPES +from snovault.elasticsearch.create_mapping import determine_if_is_date_field +from .search_utils import ( + find_nested_path, # convert_search_to_dictionary, + QueryConstructionException, + COMMON_EXCLUDED_URI_PARAMS, QUERY, FILTER, MUST, MUST_NOT, BOOL, MATCH, SHOULD, + EXISTS, FIELD, NESTED, PATH, TERMS, RANGE, AGGS, # REVERSE_NESTED, + STATS, + schema_for_field, get_query_field, search_log, MAX_FACET_COUNTS, +) + + +log = structlog.getLogger(__name__) + + +class LuceneBuilder: + """ Collection of methods for working with Lucene queries. These operations can be used + independently of the SearchBuilder state. See SearchBuilder for how these are used. + + Main points of entry: + 1. build_filters (construct the search query itself) + 2. build_facets (construct aggregations on search) + 3. verify_search_has_permissions (to be sure we did not strip permissions while building search) + + All other methods in this class are helper methods. Static methods are "leaf" operations that do + not require additional function calls. Class methods call other methods within the class but could + be "entry-point" methods as well. 
+ """ + to_from_pattern = re.compile("^(.*)[.](to|from)$") + RANGE_DIRECTIONS = ['gt', 'gte', 'lt', 'lte'] + SMALLEST_NONZERO_IEEE_32 = 1.1754e-38 # smallest epsilon > 0 (estimate) + SMALLEST_NEGATIVE_IEEE_32 = -3.4028e38 + # ref: http://www.cs.uwm.edu/classes/cs315/Bacon/Lecture/HTML/ch04s17.html + # 1.00000000000000000000001 x 2^-127 = 1.1754e-38 + + @staticmethod + def apply_range_filters(range_filters, must_filters, es_mapping): + """ + Applies the range filters to the 'must' subquery + Tuple format is required to handle nested fields that are non-range (it is discarded in this case) + Nested range fields must also be separated from other nested sub queries - see comment in handle_nested_filters + Modifies must_filters in place + + :param range_filters: intermediary range_filter format to be converted to valid lucene + :param must_filters: must_filters from build_sub_queries, this is where range filters are applied + :param es_mapping: mapping of the item we searching on, as the range filter could be on a nested field + """ + + # tuple format is required to handle nested fields that are non-range (it is discarded in this case) + # nested range fields must also be separated from other nested sub queries - see comment in 'handle_nested_filters' + for range_field, range_def in range_filters.items(): + nested_path = find_nested_path(range_field, es_mapping) + range_query = {RANGE: {range_field: range_def}} + if 'add_no_value' in range_def: + del range_def['add_no_value'] + range_query = { + BOOL: { + SHOULD: [ + range_query, + {BOOL: {MUST_NOT: {EXISTS: {FIELD: range_field}}}} + ] + } + } + if nested_path: + # look for existing nested sub query - must add to it if it exists + found = False + for query_part in must_filters: + nested_queries = query_part.get(BOOL, {}).get(MUST, {}) + for query in nested_queries: + if NESTED in query and query[NESTED][PATH] == nested_path: + query[NESTED][QUERY][BOOL][MUST].append(range_query) + found = True + break # if we found a valid path, add it here and continue + # if we never found a path, this is the only nested query on that path, so just add it as is + if not found: + new_nested_query = { + NESTED: { + PATH: nested_path, + QUERY: range_query + } + } + must_filters.append(new_nested_query) + else: + must_filters.append(range_query) + + @staticmethod + def handle_should_query(field_name, options): + """ + Builds a lucene 'should' subquery for every option for the given field + + :param field_name: full path to field + :param options: list of options for that field + ex: field_name='embedded.files.file_size.raw', options=[20, 30, 40] + + :return: dsl-subquery that is effectively an OR of all options on the field. See SHOULD. + """ + should_query = {BOOL: {SHOULD: {TERMS: {field_name: options}}}} + return should_query + + @classmethod + def build_sub_queries(cls, field_filters, es_mapping): + """ + Builds queries based on several things: + - What the query field is + - If that field is nested + - If we would like to see items who do not have a value for this field. These items will have 'No value' + for the relevant field. + - If it is a positive (must) or negative (must_not) query. This is the level by which these are separated. 
+ + :param field_filters: Intermediary format to be converted to valid lucene based on the es_mapping + :param es_mapping: mapping of the item we are searching on + :return: 4 tuple consisting of (must_filters, must_not_filters, must_filters_nested, must_not_filters_nested) + """ + must_filters = [] + must_not_filters = [] + must_filters_nested = [] + must_not_filters_nested = [] + + for query_field, filters in field_filters.items(): + # if we are nested, we must construct the query differently than normal + nested_path = find_nested_path(query_field, es_mapping) + if nested_path is not None: + query_field = query_field.replace('.properties', '') + strip_raw = query_field.replace('.raw', '') + + # if we are searching on the nested field itself, we must do something "special" + if nested_path == strip_raw: + query_field = strip_raw + + # if searching on 'No Value' on a nested field, the query has to be written + # slightly differently - note that you cannot combine a 'No value' search with + # anything else on this field path + if filters['add_no_value'] is True: + should_arr = [{EXISTS: {FIELD: query_field}}] + must_not_filters_nested.append((query_field, should_arr)) + continue + + # Build must/must_not sub-queries + # Example: + # {'bool': {'must': {'bool': + # {'should': [{'match': {'embedded.hg19.hg19_hgvsg.raw': 'NC_000001.11:g.12185956del'}}, + # {'match': {'embedded.hg19.hg19_hgvsg.raw': 'NC_000001.11:g.11901816A>T'}} + # ]}}}} + # This is a "normal" query that we must convert to a "nested" sub-query on nested_path + must_terms = cls.construct_nested_sub_queries(query_field, filters, key='must_terms') + must_not_terms = cls.construct_nested_sub_queries(query_field, filters, key='must_not_terms') + + # XXX: In ES6, MUST -> MUST_NOT EXISTS does not work - have to use EXISTS under MUST_NOT + # This means you cannot search on field=value or field DNE + if filters['add_no_value'] is True: # when searching on 'No Value' + should_arr = [must_not_terms] if must_not_terms else [] + should_arr.append({BOOL: {MUST: {EXISTS: {FIELD: query_field}}}}) # field=value OR field DNE + must_not_filters_nested.append((query_field, should_arr)) + if must_terms: + must_filters_nested.append((query_field, must_terms)) + else: # when not searching on 'No Value' + should_arr = [must_terms] if must_terms else [] + should_arr.append({EXISTS: {FIELD: query_field}}) # field=value OR field EXISTS + must_filters_nested.append((query_field, should_arr)) + if must_not_terms: + must_not_filters_nested.append((query_field, must_not_terms)) + + # if we are not nested, handle this with 'terms' query like usual + else: + must_terms = {TERMS: {query_field: filters['must_terms']}} if filters['must_terms'] else {} + must_not_terms = {TERMS: {query_field: filters['must_not_terms']}} if filters['must_not_terms'] else {} + if filters['add_no_value'] is True: + # add to must_not in an OR case, which is equivalent to filtering on 'No value' + should_arr = [must_terms] if must_terms else [] + should_arr.append({BOOL: {MUST_NOT: {EXISTS: {FIELD: query_field}}}}) # field=value OR field DNE + must_filters.append((query_field, {BOOL: {SHOULD: should_arr}})) + elif filters['add_no_value'] is False: + # add to must_not in an OR case, which is equivalent to filtering on '! 
No value' + should_arr = [must_terms] if must_terms else [] + should_arr.append({EXISTS: {FIELD: query_field}}) # field=value OR field EXISTS + must_filters.append((query_field, {BOOL: {SHOULD: should_arr}})) + elif must_terms: # no filtering on 'No value' + must_filters.append((query_field, must_terms)) + if must_not_terms: + must_not_filters.append((query_field, must_not_terms)) + + return must_filters, must_not_filters, must_filters_nested, must_not_filters_nested + + @staticmethod + def construct_nested_sub_queries(query_field, filters, key='must_terms'): + """ + Helper for build_sub_queries that constructs the base layer of sub-queries + Note that due to the query structure, 'must' is always needed in the base level query, + since at this point we have already split into 'must' or 'must_not'. + + :param query_field: field that we are querying + :param filters: all filters + :param key: one of 'must' or 'must_not' + :return: a lucene sub-query filtering the query field based on the given filters + :raises: QueryConstructionException if bad params make it here + """ + if key not in ['must_terms', 'must_not_terms']: + raise QueryConstructionException( + query_type='nested', + func='construct_nested_sub_queries', + msg='Tried to handle nested filter with key other than must/must_not: %s' % key + ) + my_filters = filters.get(key, []) + if len(my_filters) == 0: + return {} + elif len(my_filters) == 1: # see standard bool/match query + return {MATCH: {query_field: my_filters[0]}} + else: + sub_queries = {BOOL: {SHOULD: []}} # combine all options under SHOULD + for option in my_filters: + sub_queries[BOOL][SHOULD].append({MATCH: {query_field: option}}) + return sub_queries + + @classmethod + def extract_field_from_to(cls, query_part): + """ Neat helper method provided by Kent to clean up a step in 'handle_range_filters'. + Extracts the field_name and whether it is a 'from' or 'to' query + + :param query_part: query part to parse, such as "field.a.from" or "field.to". See the regexp. + :return: 3-tuple consisting of whether or not there was a match, the first grouping and the second grouping + ie: (True, field.name, 'from') + """ + match = cls.to_from_pattern.match(query_part) + if match is not None: + return bool(match), match.group(1), match.group(2) + return False, None, None + + @classmethod + def canonicalize_bounds(cls, range_filter): + """ Canonicalizes the bounds of the range filter such that they are + inclusive on the lower bound and exclusive on the upper bound. + """ + lower, upper = -1e38, 1e38 # very large numbers that should never be in range + for direction, pivot in range_filter.items(): + pivot = float(pivot) + if direction == 'lte': + upper = pivot + cls.SMALLEST_NONZERO_IEEE_32 + elif direction == 'lt': + upper = pivot + elif direction == 'gte': + lower = pivot + elif direction == 'gt': + lower = pivot - cls.SMALLEST_NONZERO_IEEE_32 + return lower, upper + + @classmethod + def range_includes_zero(cls, range_filter): + """ Returns True if the given range_filter includes the value 0. 
""" + lower, upper = cls.canonicalize_bounds(range_filter) + return lower <= 0 <= upper + + @classmethod + def handle_range_filters(cls, request, result, field_filters, doc_types): + """ + Constructs range_filters based on the given filters as part of the MUST sub-query + + :param request: current request + :param result: result to modify in place + :param field_filters: filters to look at + :param doc_types: types we are searching on + :return: constructed range_filters + """ + range_filters = {} + + for field, term in request.normalized_params.items(): + not_field = False # keep track if query is NOT (!) + exists_field = False # keep track of null values + range_type = False # If we determine is a range request (field.to, field.from), will be populated with string 'date' or 'numerical' + range_direction = None + field_schema = {} + if field == 'q' or field in COMMON_EXCLUDED_URI_PARAMS: + continue + elif field == 'type' and term != 'Item': + continue + elif term == 'No value': + exists_field = True + + # Check for date or numerical range filters + is_range, f_field, which = cls.extract_field_from_to(field) + if is_range: + if which == 'to': + range_direction = 'lte' + else: + range_direction = 'gte' + + # If schema for field is not found (and range_type thus not set), + # then treated as ordinary term filter (likely will get 0 results) + field_schema = schema_for_field(f_field, request, doc_types) + if field_schema: + range_type = 'date' if determine_if_is_date_field(f_field, field_schema) else 'numerical' + + # Add filter to result + qs = urlencode([ + (k.encode('utf-8'), v.encode('utf-8')) + for k, v in request.normalized_params.items() + if k != "limit" and k != "from" and not (k == field and v == term) + ]) + remove_path = '{}?{}'.format(request.path, qs) + + # default to searching type=Item rather than empty filter path + if remove_path[-1] == '?': + remove_path += 'type=Item' + + result['filters'].append({ + 'field': field, + 'term': term, + 'remove': remove_path + }) + + # handle NOT + if field.endswith('!'): + field = field[:-1] + not_field = True + + # Add filter to query + if range_type and f_field and range_type in ('date', 'numerical'): + query_field = 'embedded.' + f_field + elif field.startswith('validation_errors') or field.startswith('aggregated_items'): + query_field = field + '.raw' + elif field == 'type': + query_field = 'embedded.@type.raw' + else: + query_field = 'embedded.' + field + '.raw' + + if range_type: + if query_field not in range_filters: + range_filters[query_field] = {} + if range_type == 'date': + range_filters[query_field]['format'] = 'yyyy-MM-dd HH:mm' + + if range_direction in cls.RANGE_DIRECTIONS: + if range_type == "date" and len(term) == 10: # TODO: refactor to use regex -Will 06/24/2020 + # Correct term to have hours, e.g. 00:00 or 23:59, if not otherwise supplied. + if range_direction == 'gt' or range_direction == 'lte': + term += ' 23:59' + elif range_direction == 'gte' or range_direction == 'lt': + term += ' 00:00' + + if range_filters[query_field].get(range_direction) is None: + range_filters[query_field][range_direction] = term + else: + # If have a value already (e.g. multiple ranges selected), choose the widening option. 
+ if range_direction == 'gt' or range_direction == 'gte': + if term < range_filters[query_field][range_direction]: + range_filters[query_field][range_direction] = term + elif range_direction == 'lt' or range_direction == 'lte': + if term > range_filters[query_field][range_direction]: + range_filters[query_field][range_direction] = term + + # Check if schema requests no value + if 'items' in field_schema: # we are searching on an array of numerics + field_schema = field_schema['items'] + if field_schema.get('add_no_value', False) and cls.range_includes_zero(range_filters[query_field]): + range_filters[query_field]['add_no_value'] = True + + # add these to field_filters directly, handle later with build_sub_queries + else: + if query_field not in field_filters: + field_filters[query_field] = { + 'must_terms': [], + 'must_not_terms': [], + 'add_no_value': None + } + + # handle case of filtering for null values + if exists_field: + # the value below is True when we want to include 'No value' as a filter + field_filters[query_field]['add_no_value'] = False if not_field else True + continue + + if not_field: + field_filters[query_field]['must_not_terms'].append(term) + else: + field_filters[query_field]['must_terms'].append(term) + + return range_filters + + @staticmethod + def initialize_field_filters(request, principals, doc_types): + """ Helper function for build_filters + Initializes field filters with filters that exist on all searches, does some basic updates + """ + field_filters = { + 'principals_allowed.view': { + 'must_terms': principals, + 'must_not_terms': [], + 'add_no_value': None + }, + 'embedded.@type.raw': { + 'must_terms': doc_types, + 'must_not_terms': [], + 'add_no_value': None + }, + 'embedded.status.raw': { + 'must_terms': [], + 'must_not_terms': [], + 'add_no_value': None + } + } + + # Exclude status=deleted Items unless explicitly requested/filtered-in. + if 'deleted' not in request.normalized_params.getall('status'): + field_filters['embedded.status.raw']['must_not_terms'].append('deleted') + if 'replaced' not in request.normalized_params.getall('status'): + field_filters['embedded.status.raw']['must_not_terms'].append('replaced') + + # Exclude type=TrackingItem and type=OntologyTerm from results unless are explictly specified + if 'TrackingItem' not in doc_types: + field_filters['embedded.@type.raw']['must_not_terms'].append('TrackingItem') + if 'OntologyTerm' not in doc_types: + field_filters['embedded.@type.raw']['must_not_terms'].append('OntologyTerm') + + return field_filters + + @staticmethod + def build_nested_query(nested_path, query): + """ Takes the given query and converts it into a nested query on the + given path. + """ + return { + NESTED: { + PATH: nested_path, + QUERY: query + } + } + + @classmethod + def handle_nested_filters_v2(cls, must_nested_filters, must_not_nested_filters, es_mapping): + """ This function implements nested query construction. + + When building a nested query, unlike with traditional queries, selections on the same + field must occur in the same nested sub-query in order to be applied as an intersect + condition on the object field. Previously we would create separate nested sub-queries + per field selection, which causes them to be OR'd. 
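+
+ The combined sub-query built for each nested path looks roughly like this (illustrative sketch only):
+ {'nested': {'path': <nested_path>, 'query': {'bool': {'must': [...], 'must_not': [...]}}}}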
+ + :param must_nested_filters: conditions filtered on in the affirmative + :param must_not_nested_filters: conditions filtered on in the negative + :param es_mapping: the ES mapping of the type we are searching on + :returns: a nested sub-query that can be added directly to the parent query + """ + # Build base query structure + # always use MUST + sub queries for MUST_NOT + nested_query = { + BOOL: { + MUST: [], + MUST_NOT: [], + } + } + + # Maps a nested path to a 2-tuple of it's key (must/must_not) and index + nested_path_to_index_map = {} + + # Build array of key, (field, query) so we can process the filters in a single pass + # note that MUST queries are always processed first + filters_to_work_on = [] + if must_nested_filters: + filters_to_work_on += zip([MUST] * len(must_nested_filters), must_nested_filters) + if must_not_nested_filters: + filters_to_work_on += zip([MUST_NOT] * len(must_not_nested_filters), must_not_nested_filters) + + # Process key (must/must_not), field (target of search), query (condition) + # iteratively building nested_query + for key, (field, query) in filters_to_work_on: + nested_path = find_nested_path(field, es_mapping) + + # if we've never seen this path before, bootstrap a sub-query for it + if nested_path not in nested_path_to_index_map: + + # set in tracking, note it is order dependent + new_index = len(nested_query[BOOL][key]) + nested_path_to_index_map[nested_path] = (key, new_index) + + # this nested path could have more filters (under differing keys) + # bootstrap an entire sub-query for this path + combined_query = { + BOOL: { + MUST: [], + MUST_NOT: [] + } + } + for sub_query in query: + # Special case for EXISTS, since we cannot construct these like normal + # queries - add DOES NOT EXIST queries to MUST branches, as these are + # automatically added to MUST_NOT branch + if EXISTS in sub_query and key == MUST_NOT: + combined_query[BOOL][MUST].append(sub_query) + elif sub_query.get(BOOL, {}).get(MUST, {}).get(EXISTS, None): + combined_query[BOOL][MUST].append(sub_query) + else: + combined_query[BOOL][key].append(sub_query) + + # add the combined_query for this nested path to the global nested query + nested_query[BOOL][key].append(cls.build_nested_query(nested_path, combined_query)) + + # We have seen this nested_path before, so in order to achieve proper intersect + # behavior all conditions must be present on the same nested sub-query + else: + + # extract the location of the nested query we would like to add to + # note that the key under which the previous query was added could differ + # from the key we are seeing now ie: EXIST (must) combined with != (must_not) + prev_key, path_index = nested_path_to_index_map[nested_path] + leaf_query = nested_query[BOOL][prev_key][path_index][NESTED][QUERY][BOOL][key] + + # leaf_query is the sub-query we want to build off of + # its possible our current query contains multiple conditions + if isinstance(query, list): + leaf_query += query + elif isinstance(query, dict): + leaf_query.append(query) + else: + raise QueryConstructionException( + query_type='nested', + func='handle_nested_filters_v2', + msg='passed a query with a bad type: %s' % query + ) + + return nested_query + + @classmethod + def build_filters(cls, request, query, result, principals, doc_types, es_mapping): + """ + This function builds the Elasticsearch query based on the request. The structure of the query + is approximately represented below. 
'Approximate' because you could not copy-paste directly into + Lucene, but should illustrate enough so it is comprehensible. Note the 'nested' nature of the query. + + QUERY HIERARCHY ('approximate' lucene syntax): + { + 'query': { + 'bool': { + 'filter': { + 'bool': { + 'must': { + + + + 'bool': { + 'should': { option1, option2 ... } + } + }, + 'must_not': { + + + + 'bool': { + 'should': { option1, option2 ... + }}}}} + } + There are several different sub-queries, but most of them are built using 'bool' -> 'must'. + A brief overview follows. + * nested data type sub-queries have a special format. See 'handle_nested_filters'. + * range filter sub-queries also have their own special format. See 'apply_range_filters'. Note that + the format is extra special when you're applying a range filter to a nested data type. + * 'terms' filters are what we 'normally' use. + + :param request: Current request + :param query: Current search query body + :param result: Response to be returned from the view ('/search') + :param principals: Active user roles + :param doc_types: Document type we are searching on + :param es_mapping: Elasticsearch mapping of the document type we're searching on + :returns: 2-tuple containing the updated search based on the request parameters and + information on the filters used in the query. + """ + + # these next two dictionaries should each have keys equal to query_field + # and values: must_terms: [], must_not_terms: [], add_no_value: True/False/None + field_filters = cls.initialize_field_filters(request, principals, doc_types) + range_filters = cls.handle_range_filters(request, result, field_filters, doc_types) + + # construct queries + must_filters, must_not_filters, \ + must_filters_nested, must_not_filters_nested = cls.build_sub_queries(field_filters, es_mapping) + + # initialize filter hierarchy + final_filters = {BOOL: {MUST: [f for _, f in must_filters], MUST_NOT: [f for _, f in must_not_filters]}} + + # Build nested queries + final_nested_query = cls.handle_nested_filters_v2(must_filters_nested, must_not_filters_nested, es_mapping) + final_filters[BOOL][MUST].append(final_nested_query) + + # add range filters after (so nested ranges can be applied with existing nested queries) + cls.apply_range_filters(range_filters, final_filters[BOOL][MUST], es_mapping) + + # at this point, final_filters is valid lucene and can be dropped into the query directly + query[QUERY][BOOL][FILTER] = final_filters + return query, final_filters + + @staticmethod + def _check_and_remove(compare_field, facet_filters, active_filter, query_field, filter_type): + """ Does the actual 'check and removal' since this code is duplicated throughout. """ + if compare_field == query_field: + facet_filters[filter_type].remove(active_filter) + return True + return False + + @classmethod + def _check_and_remove_terms(cls, facet_filters, active_filter, query_field, filter_type): + """ Helper function for _remove_from_active_filters that handles filter removal for terms query """ + # there should only be one key here + for compare_field in active_filter[TERMS].keys(): + # remove filter for a given field for that facet + # skip this for type facet (field = 'type') + # since we always want to include that filter. + if (query_field != 'embedded.@type.raw' and # this evaluation order MUST be preserved! 
+ cls._check_and_remove(compare_field, facet_filters, active_filter, query_field, filter_type)): + break + + @classmethod + def _check_and_remove_range(cls, facet_filters, active_filter, query_field, filter_type): + """ Helper function for _remove_from_active_filters that handles filter removal for terms query """ + for compare_field in active_filter[RANGE].keys(): + if cls._check_and_remove(compare_field, facet_filters, active_filter, query_field, filter_type): + break + + @classmethod + def _check_and_remove_bool_should(cls, facet_filters, active_filter, query_field, filter_type): + """ Helper function for _remove_from_active_filters that handles filter removal for boolean queries that + have multiple options (inner SHOULD query) + """ + # handle No value case + inner_bool = None + inner_should = active_filter.get(BOOL).get(SHOULD, []) + for or_term in inner_should: + # this may be naive, but assume first non-terms + # filter is the No value query + if TERMS in or_term or RANGE in or_term: + continue + else: + inner_bool = or_term + break + if EXISTS in inner_bool: + compare_field = inner_bool[EXISTS].get(FIELD) + else: + # attempt to get the field from the alternative No value syntax + compare_field = inner_bool.get(BOOL, {}).get(MUST_NOT, {}).get(EXISTS, {}).get(FIELD) + if query_field != 'embedded.@type.raw': + cls._check_and_remove(compare_field, facet_filters, active_filter, query_field, filter_type) + + @classmethod + def _check_and_remove_match_from_should(cls, query_options, facet_filters, active_filter, query_field, + filter_type): + """ Helper function that searches a MATCH query for the given query_field, removing the + active filter if found. + """ + for inner_query in query_options: + if MATCH in inner_query: + for field in inner_query.get(MATCH, {}).keys(): # should be only one per block + if cls._check_and_remove(field, facet_filters, active_filter, query_field, + filter_type): + return + else: + search_log(log_handler=log, msg='Encountered a unexpected nested structure in ' + 'query: %s' % inner_query) + + @classmethod + def _check_and_remove_nested(cls, facet_filters, active_filter, query_field, filter_type): + """ Helper function for _remove_from_active_filters that handles filter removal for nested query + Reminder that this code is responsible for constructing the aggregation filter, hence the desire + to omit selections on the field we are aggregating on. 
+ """ + nested_sub_query = active_filter[NESTED][QUERY] + + # For No value searches + if EXISTS in nested_sub_query: + field = nested_sub_query.get(EXISTS, {}).get(FIELD) + cls._check_and_remove(field, facet_filters, active_filter, query_field, filter_type) + + # For all other searches + elif BOOL in nested_sub_query: + for inner_filter_type in [MUST, MUST_NOT]: + for nested_option in nested_sub_query[BOOL].get(inner_filter_type, []): + if isinstance(nested_option, dict): + + # For structure like this: + # {'bool': {'must': [{'match': {'embedded.hg19.hg19_hgvsg.raw': 'NC_000001.11:g.12185956del'}] + if MATCH in nested_option: + for field in nested_option.get(MATCH, {}).keys(): # should only be one per block + if cls._check_and_remove(field, facet_filters, active_filter, query_field, filter_type): + break + + # For structure like this: + # {'bool': {'should': + # [{'match': {'embedded.variant.genes.genes_most_severe_consequence.coding_effect.raw': + # 'Missense'}}, + # {'match': {'embedded.variant.genes.genes_most_severe_consequence.coding_effect.raw': + # 'Synonymous'}}]}} + elif BOOL in nested_option: + inner_inner_bool = nested_option[BOOL] + if SHOULD in inner_inner_bool: + cls._check_and_remove_match_from_should(inner_inner_bool[SHOULD], facet_filters, + active_filter, + query_field, filter_type) + + # For structure like this: + # {'bool': {'must': {'bool': {'should': + # [{'match': {'embedded.hg19.hg19_hgvsg.raw': 'NC_000001.11:g.12185956del'}}, + # {'match': {'embedded.hg19.hg19_hgvsg.raw': 'NC_000001.11:g.11901816A>T'}}]}}}} + elif isinstance(nested_option, str): + inner_bool = nested_sub_query[BOOL].get(inner_filter_type, {}) + if SHOULD in inner_bool: + cls._check_and_remove_match_from_should(inner_bool[SHOULD], facet_filters, active_filter, + query_field, filter_type) + + # For structure like this: + # {'bool': {'should': [ + # {'match': {'embedded.variant.genes.genes_most_severe_consequence.impact.raw': 'MODIFIER'}}, + # {'match': {'embedded.variant.genes.genes_most_severe_consequence.impact.raw': 'LOW'}}]}} + elif BOOL in inner_bool: + inner_inner_bool = inner_bool[BOOL] + if SHOULD in inner_inner_bool: + cls._check_and_remove_match_from_should(inner_inner_bool[SHOULD], facet_filters, active_filter, + query_field, filter_type) + + else: + search_log(log_handler=log, msg='Encountered a unexpected nested structure at top level: %s' + % nested_sub_query[BOOL]) + + @classmethod + def _remove_from_active_filters(cls, facet_filters, query_field, active_filter, filter_type): + """ Helper function for generate_filters_for_terms_agg_from_search_filters + Modifies facet_filters in place to remove the active_filter if it matches + the given query field. + This function is intended to be called on every "sub part" of the base query + for every aggregation. 
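+
+ e.g. (hypothetical values) active_filter={'terms': {'embedded.status.raw': ['released']}} is removed
+ from facet_filters when query_field == 'embedded.status.raw', so that facet still reports counts
+ for the other status terms.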
+ + TODO: Optimize this - it is inefficient application side regardless of the dominating ES cost + + :param facet_filters: intended filter block to be used on the aggregation + :param query_field: field that we are aggregating on + :param active_filter: which "sub part" of the facet filters we are examining + :param filter_type: one of MUST or MUST_NOT + """ + if BOOL in active_filter and SHOULD in active_filter[BOOL]: + cls._check_and_remove_bool_should(facet_filters, active_filter, query_field, filter_type) + elif TERMS in active_filter: + cls._check_and_remove_terms(facet_filters, active_filter, query_field, filter_type) + elif RANGE in active_filter: + cls._check_and_remove_range(facet_filters, active_filter, query_field, filter_type) + elif NESTED in active_filter: + cls._check_and_remove_nested(facet_filters, active_filter, query_field, filter_type) + + @classmethod + def generate_filters_for_terms_agg_from_search_filters(cls, query_field, search_filters, string_query): + """ + We add a copy of our filters to each facet, minus that of + facet's field itself so that we can get term counts for other terms filters. + And be able to filter w/ it. + Remove filters from fields they apply to. + For example, the 'biosource_type' aggs should not have any + biosource_type filter in place. + Handle 'must' and 'must_not' filters separately + + :param query_field: field terms agg is on + :param search_filters: intermediary format prior to any valid lucene representing the search_filters + from the front-end + :param string_query: query string if provided + :return: Copy of search_filters, minus filter for current query_field (if one set). + """ + if not search_filters or BOOL not in search_filters: # a sane default if this happens -Will 11/17/20 + log.error('Encountered an unexpected query format: %s' % search_filters) + return {BOOL: {MUST: [{TERMS: {'principals_allowed.view': ['system.Everyone']}}]}} + + facet_filters = deepcopy(search_filters[BOOL]) + + for filter_type in [MUST, MUST_NOT]: + # active_filter => e.g. { 'terms' : { 'embedded.@type.raw': ['ExperimentSetReplicate'] } } + for active_filter in search_filters[BOOL][filter_type]: + cls._remove_from_active_filters(facet_filters, query_field, active_filter, filter_type) + + # add the string_query, if present, to the bool term with facet_filters + if string_query and string_query[MUST]: + # combine statements within 'must' for each + facet_filters[MUST].append(string_query[MUST]) + + return {BOOL: facet_filters} + + @staticmethod + def set_additional_aggregations(search_as_dict, request, doc_types, extra_aggregations=None): + """ + Per-type aggregations may be defined in schemas. Apply them OUTSIDE of globals so they act on our + current search filters. Warning: `search_as_dict` is modified IN PLACE. 
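+
+ A schema-defined aggregation might look roughly like this (name and field are hypothetical):
+ "aggregations": {"coverage_histogram": {"histogram": {"field": "embedded.coverage", "interval": 10}}}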
+ + :param search_as_dict: elasticsearch_dsl object converted to_dict() + :param request: current request + :param doc_types: types we are searching on + :param extra_aggregations: aggregations to add + :return: search_as_dict, same as originally passed in, but modified in this function + """ + + types = request.registry[TYPES] + schema = types[doc_types[0]].schema + + if schema.get('aggregations'): + for schema_agg_name in schema['aggregations'].keys(): + if schema_agg_name == 'all_items': + raise QueryConstructionException( + query_type='aggregations', + func='set_additional_aggregations', + msg='all_items is a reserved agg name and not allowed as an extra aggregation name.') + search_as_dict['aggs'][schema_agg_name] = schema['aggregations'][schema_agg_name] + + if extra_aggregations: + for extra_agg_name in extra_aggregations.keys(): + if extra_agg_name == 'all_items': + raise QueryConstructionException( + query_type='extra_aggregations', + func='set_additional_aggregations', + msg='all_items is a reserved agg name and not allowed as an extra aggregation name.') + search_as_dict['aggs'][extra_agg_name] = extra_aggregations[extra_agg_name] + + return search_as_dict + + @staticmethod + def _build_nested_aggregation(sub_query, nested_path, requested=None): + """ Builds a nested aggregation. + + :param sub_query: query to use as the 'primary_agg' + :param nested_path: path to nested object we are searching on + :param requested: requested agg, if any + :returns: the nested form of sub_query + """ + if requested: + return { + NESTED: { + PATH: nested_path + }, + AGGS: { + 'primary_agg': sub_query, + 'requested_agg': requested + } + } + else: + return { + NESTED: { + PATH: nested_path + }, + AGGS: { + 'primary_agg': sub_query + } + } + + @classmethod + def _add_stats_aggregation(cls, field, facet, field_schema, query_field, search_filters, string_query, + nested_path, aggs, agg_name): + """ Builds a stats aggregation, adding it to the given aggs. 
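+
+ The 'primary_agg' added here is a plain ES stats aggregation, roughly {'stats': {'field': 'embedded.AF'}}
+ (illustrative field), wrapped in a nested aggregation when nested_path is set.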
+ + :param field: raw field name we are searching on (ex: AF) + :param facet: facet metadata + :param field_schema: schema for the field we are searching on + :param query_field: ES document field we are searching on (ex: embedded.AF) + :param search_filters: filters we are searching on + :param string_query: simple query string if specified + :param nested_path: path to nested object we are aggregating on + :param aggs: the aggregation object we are building + :param agg_name: name of the aggregation we are building + """ + is_date_field = field_schema and determine_if_is_date_field(field, field_schema) + is_numerical_field = field_schema and field_schema['type'] in ("integer", "float", "number") + if is_date_field: + facet['field_type'] = 'date' + elif is_numerical_field: + facet["field_type"] = field_schema['type'] or "number" + if "number_step" not in facet: + if "number_step" in field_schema: + facet["number_step"] = field_schema['number_step'] + elif facet["field_type"] == "integer": + facet["number_step"] = 1 + else: # Default + facet["number_step"] = "any" + facet_filters = cls.generate_filters_for_terms_agg_from_search_filters(query_field, search_filters, + string_query) + # stats aggregations could be nested too + stats_agg = { + STATS: { + 'field': query_field + } + } + if nested_path: + facet['aggregation_type'] = 'nested:stats' + aggs[facet['aggregation_type'] + ':' + agg_name] = { + AGGS: { + 'primary_agg': cls._build_nested_aggregation(stats_agg, nested_path) + }, + FILTER: facet_filters + } + + else: + aggs[facet['aggregation_type'] + ":" + agg_name] = { + AGGS: { + 'primary_agg': stats_agg + }, + FILTER: facet_filters + } + + @classmethod + def _build_range_aggregation(cls, query_field, ranges): + """ Builds a range aggregation. + Detects when 0-0 range is specified and replaces 'to' with the + smallest IEEE 32 value such that the bucket effectively only captures + the value 0. + """ + for r in ranges: + if 'from' in r and 'to' in r: + if r['from'] == 0 and r['to'] == 0: + r['to'] = cls.SMALLEST_NONZERO_IEEE_32 + if 'to' in r and r['to'] != cls.SMALLEST_NONZERO_IEEE_32: + r['to'] += cls.SMALLEST_NONZERO_IEEE_32 + return { + RANGE: { + FIELD: query_field, + 'ranges': ranges + } + } + + @classmethod + def _add_range_aggregation(cls, facet, query_field, search_filters, string_query, nested_path, aggs, agg_name): + """ Builds a range aggregation utilizing the ranges defined on schema facet, adding it to the given aggs. + + :param facet: facet metadata + :param query_field: field we are searching on. 
+ :param search_filters: filters we are searching on + :param string_query: simple query string if specified + :param nested_path: path to nested object we are aggregating on + :param aggs: the aggregation object we are building + :param agg_name: name of the aggregation we are building + """ + facet_filters = cls.generate_filters_for_terms_agg_from_search_filters(query_field, search_filters, + string_query) + ranges = [{k: v for k, v in r.items() if k in ['from', 'to']} for r in facet['ranges']] + range_agg = cls._build_range_aggregation(query_field, ranges) + if nested_path: + facet['aggregation_type'] = 'nested:range' + field = facet['aggregation_type'] + ':' + agg_name + range_agg = cls._build_nested_aggregation(range_agg, nested_path) + else: + facet['aggregation_type'] = RANGE + field = facet['aggregation_type'] + ':' + agg_name + aggs[field] = { + AGGS: { + 'primary_agg': range_agg + }, + FILTER: facet_filters + } + + @staticmethod + def _build_terms_aggregation(query_field, facet, requested_values=None, nested=False): + """ Builds a terms aggregation, specifically requesting counts for any selected values. """ + agg = { + TERMS: { + 'size': MAX_FACET_COUNTS, + 'field': query_field, + 'missing': facet.get('missing_value_replacement', 'No value') + } + } + if requested_values: # getall returns [], not None + agg[TERMS]['include'] = requested_values + if nested: + agg[AGGS] = { + 'primary_agg_reverse_nested': { + 'reverse_nested': {} + } + } + return agg + + @classmethod + def _add_terms_aggregation(cls, facet, query_field, search_filters, string_query, nested_path, aggs, agg_name, + requested_values): + """ Builds a standard terms aggregation, setting a nested identifier to be repaired later + by elasticsearch_dsl, adding it to the given aggs. + + :param facet: facet metadata + :param query_field: field we are searching on. + :param search_filters: filters we are searching on + :param string_query: simple query string if specified + :param nested_path: path to nested object we are aggregating on + :param aggs: the aggregation object we are building + :param agg_name: name of the aggregation we are building + :param requested_values: values for this terms agg we requested (to be explicitly included) + """ + is_nested = nested_path is not None + if is_nested: + facet['aggregation_type'] = NESTED # special in that it is used to identify (broken) facets - Will 11/17/20 + else: + facet['aggregation_type'] = TERMS + + facet_filters = cls.generate_filters_for_terms_agg_from_search_filters(query_field, search_filters, + string_query) + terms_aggregation = cls._build_terms_aggregation(query_field, facet, None, is_nested) + + # NOTE: if we requested values for this field, we must expand to do two aggregations + # Unfortunately when you pass "include" to a terms aggregation it acts as a hard filter, + # not a "force bucket", which makes implementing this very tricky. 
To get around this we + # expand to 2 aggregations - one for the requested field and one for the remaining top fields + if requested_values: + terms_aggregation_requested = cls._build_terms_aggregation(query_field, facet, requested_values, + is_nested) + if nested_path: + aggs[facet['aggregation_type'] + ":" + agg_name] = { + AGGS: {'primary_agg': + cls._build_nested_aggregation(terms_aggregation, nested_path, + terms_aggregation_requested), + }, + FILTER: facet_filters, + } + else: + aggs[facet['aggregation_type'] + ":" + agg_name] = { + AGGS: { + 'primary_agg': terms_aggregation_requested, + 'requested_agg': terms_aggregation + }, + FILTER: facet_filters, + } + + else: + if nested_path: + aggs[facet['aggregation_type'] + ":" + agg_name] = { + AGGS: {'primary_agg': + cls._build_nested_aggregation(terms_aggregation, nested_path), + }, + FILTER: facet_filters, + } + else: + aggs[facet['aggregation_type'] + ":" + agg_name] = { + AGGS: { + 'primary_agg': terms_aggregation + }, + FILTER: facet_filters, + } + + @classmethod + def build_facets(cls, query, facets, search_filters, string_query, request, doc_types, + custom_aggregations=None, size=25, from_=0, es_mapping=None): + """ + Sets facets in the query as ElasticSearch aggregations, with each aggregation to be + filtered by search_filters minus filter affecting facet field in order to get counts + for other facet term options. + ES5 - simply sets aggs by calling update_from_dict after adding them in + :param facets: Facet field (0) in object dot notation, and a dict or OrderedDict with title property (1). + :type facets: List of tuples. + :param search_filters: Dict of filters which are set for the ES query in build_filters + :param string_query: Dict holding the query_string used in the search + """ + if from_ != 0: + return query + + aggs = OrderedDict() + for field, facet in facets: # E.g. 'type','experimentset_type','experiments_in_set.award.project', ... + field_schema = schema_for_field(field, request, doc_types, should_log=True) + query_field = get_query_field(field, facet) + nested_path = find_nested_path(query_field, es_mapping) + requested_values = request.params.getall(field) + + # Build the aggregation based on its type (by side-effect) - stats, range or terms + agg_name = field.replace('.', '-') + facet_type = facet.get('aggregation_type') + if facet_type in ['stats', 'nested:stats']: + cls._add_stats_aggregation(field, facet, field_schema, query_field, search_filters, string_query, + nested_path, aggs, agg_name) + elif facet_type in ['range', 'nested:range']: + cls._add_range_aggregation(facet, query_field, search_filters, string_query, nested_path, + aggs, agg_name) + else: # assume terms + cls._add_terms_aggregation(facet, query_field, search_filters, string_query, nested_path, + aggs, agg_name, requested_values) + + # Update facet with title, description from field_schema, if missing. + if facet.get('title') is None and field_schema and 'title' in field_schema: + facet['title'] = field_schema['title'] + if facet.get('description') is None and field_schema and 'description' in field_schema: + facet['description'] = field_schema['description'] + + # to achieve OR behavior within facets, search among GLOBAL results, + # not just returned ones. 
to do this, wrap aggs in ['all_items'] + # and add "global": {} to top level aggs query + # see elasticsearch global aggs for documentation (should be ES5 compliant) + query['aggs'] = { + 'all_items': { + 'global': {}, + 'aggs': aggs + } + } + + if size == 0: + # Only perform aggs if size==0 requested, to improve performance for search page queries. + # We do currently have (hidden) monthly date histogram facets which may yet to be utilized for common size!=0 agg use cases. + cls.set_additional_aggregations(query, request, doc_types, custom_aggregations) + + return query + + @staticmethod + def verify_search_has_permissions(request, query): + """ + Inspects the search object to ensure permissions are still present on the query + This method depends on the query structure defined in 'build_filters'. + + :param request: the current request + :param query: search query object to inspect + :raises: HTTPBadRequest if permissions not present + """ + effective_principals_on_query = None + found = False # set to True if we found valid 'principals_allowed.view' + try: + for boolean_clause in [query['query']['bool']['filter']]: # should always be present + if 'bool' in boolean_clause and 'must' in boolean_clause['bool']: # principals_allowed.view is on 'must' + possible_permission_block = boolean_clause['bool']['must'] + for entry in possible_permission_block: + if 'terms' in entry: + if 'principals_allowed.view' in entry['terms']: + effective_principals_on_query = entry['terms']['principals_allowed.view'] + if effective_principals_on_query != request.effective_principals: + raise QueryConstructionException( + query_type='principals', + func='verify_search_has_permissions', + msg='principals_allowed was modified - see application logs') + else: + found = True + break + except QueryConstructionException: + search_log(log_handler=log, msg='Detected URL query param manipulation, principals_allowed.view was' + ' modified from %s to %s' % (request.effective_principals, + effective_principals_on_query)) + raise HTTPBadRequest('The search failed - the DCIC team has been notified.') + except KeyError: + search_log(log_handler=log, msg='Malformed query detected while checking for principals_allowed') + raise HTTPBadRequest('The search failed - the DCIC team has been notified.') + if not found: + search_log(log_handler=log, msg='Did not locate principals_allowed.view on search query body: %s' + % query) + raise HTTPBadRequest('The search failed - the DCIC team has been notified.') + + @classmethod + def compound_search(cls, sub_queries, intersect=False): + """ Takes an array of sub-queries and merges them into one query + + :param sub_queries: list of query to be combined, typically starting with "bool" + :param intersect: whether or not to intersect the sub-queries + :return: lucene query combining the sub_queries with OR + """ + if not intersect: + key = SHOULD + else: + key = MUST + + query = { + 'query': { + 'bool': { + key: [] + } + } + } + for q in sub_queries: + query['query']['bool'][key].append(q) + return query diff --git a/snovault/search/search.py b/snovault/search/search.py new file mode 100644 index 000000000..55a0072f4 --- /dev/null +++ b/snovault/search/search.py @@ -0,0 +1,1307 @@ +import re +import math +import itertools +import uuid +import structlog +from pyramid.view import view_config +from webob.multidict import MultiDict +from functools import reduce +from pyramid.httpexceptions import HTTPBadRequest +from urllib.parse import urlencode +from collections import OrderedDict +from copy 
import deepcopy +from snovault import ( + AbstractCollection, + TYPES, + COLLECTIONS, + STORAGE +) +from snovault.elasticsearch import ELASTIC_SEARCH +from snovault.util import ( + debug_log, +) +from snovault.elasticsearch.indexer_utils import get_namespaced_index +from snovault.typeinfo import AbstractTypeInfo +from ..util import is_admin_request +from .lucene_builder import LuceneBuilder +from .search_utils import ( + find_nested_path, schema_for_field, get_es_index, get_es_mapping, is_date_field, is_numerical_field, + is_array_of_numerical_field, + execute_search, make_search_subreq, build_sort_dicts, + NESTED, COMMON_EXCLUDED_URI_PARAMS, MAX_FACET_COUNTS, +) + + +log = structlog.getLogger(__name__) + + +def includeme(config): + config.add_route('search', '/search{slash:/?}') + config.scan(__name__) + + +sanitize_search_string_re = re.compile(r'[\\\+\-\&\|\!\(\)\{\}\[\]\^\~\:\/\\\*\?]') + + +class SearchBuilder: + """ A monolithic object that encapsulates information needed to perform searches. + The purpose of this class is to organize state + methods used for bootstrapping + the search. + + Static methods in this class serve on of two purposes: + 1. Take state and use it to generate new state in the initializer. + 2. Functions that operate at the "leaf" and do not require state. + + The point is to split apart logic needed for query construction with logic needed for the + API itself. Search is by far our most complicated API, thus there is a lot of state. + """ + DEFAULT_SEARCH_FRAME = 'embedded' + DEFAULT_HIDDEN = 'default_hidden' # facet is hidden by default + ADDITIONAL_FACETS = 'additional_facet' # specifies an aggregation to compute in addition + RESCUE_TERMS = 'rescue_terms' # special facet field that contains terms that should always have buckets + DEBUG = 'debug' # search debug parameter + CARDINALITY_RANGE = '-3.4028E38-*' + PAGINATION_SIZE = 10 # for ECS, 10 is much better than 25, and may even do better when lowered + MISSING = object() + SEARCH_INFO_HEADER_TYPES = [ + 'Workflow' # TODO: add types here as needed + ] + + def __init__(self, context, request, search_type=None, return_generator=False, forced_type='Search', + custom_aggregations=None, skip_bootstrap=False): + self.context = context # request context + self.request = request # request who requested a search + self.response = {} + + # setup needed regardless of whether we are building from a base query or building a new one + # from params + self.types = self.request.registry[TYPES] # all types in the system + self.doc_types = self.set_doc_types(self.request, self.types, search_type) # doc_types for this search + self.es = self.request.registry[ELASTIC_SEARCH] # handle to remote ES + self.es_index = get_es_index(self.request, self.doc_types) # what index we are searching on + self.query = {} # new search object, just raw lucene - NOT ElasticSearch-DSL + + # skip setup needed for building the query, if desired + if not skip_bootstrap: + self._bootstrap_query(search_type, return_generator, forced_type, custom_aggregations) + + # To be computed later, initialized to None here + self.result = None + self.from_ = None + self.size = None + self.facets = None + self.search_session_id = None + self.string_query = None + self.facet_order_overrides = {} + + def _get_es_mapping_if_necessary(self): + """ Looks in the registry to see if the single doc_type mapping is cached in the registry, which it + should be - thus saving us some time from external API calls at the expense of application memory. 
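+ Falls back to fetching the mapping directly from Elasticsearch (get_es_mapping) when the namespaced
+ index is not present in the cached registry mappings, e.g. for an item type added after the last
+ cache update.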
+ """ + if len(self.doc_types) == 1: # extract mapping from storage if we're searching on a single doc type + item_type_snake_case = ''.join(['_' + c.lower() if c.isupper() else c for c in self.doc_types[0]]).lstrip('_') + mappings = self.request.registry[STORAGE].read.mappings.get() + if get_namespaced_index(self.request, item_type_snake_case) == self.es_index and self.es_index in mappings: + return mappings[self.es_index]['mappings']['properties'] + else: # new item was added after last cache update, get directly via API + return get_es_mapping(self.es, self.es_index) + return {} + + def _bootstrap_query(self, search_type=None, return_generator=False, forced_type='Search', + custom_aggregations=None): + """ Helper method that will bootstrap metadata necessary for building a search query. """ + self.return_generator = return_generator # whether or not this search should return a generator + self.custom_aggregations = custom_aggregations # any custom aggregations on this search + self.forced_type = forced_type # (mostly deprecated) search type + self.principals = self.request.effective_principals # permissions to apply to this search + + # Initialized via outside function call + # schemas for doc_types + self.schemas = [ + self.types[item_type].schema + for item_type in self.doc_types + ] + # item_type hierarchy we are searching on + self.search_types = self.build_search_types(self.types, self.doc_types) + [ + self.forced_type + ] + self.search_base = self.normalize_query(self.request, self.types, self.doc_types) + self.search_frame = self.request.normalized_params.get('frame', self.DEFAULT_SEARCH_FRAME) # embedded + self.prepared_terms = self.prepare_search_term(self.request) + self.additional_facets = self.request.normalized_params.getall(self.ADDITIONAL_FACETS) + self.debug_is_active = self.request.normalized_params.getall(self.DEBUG) # only used if admin + self.source_fields = sorted(self.list_source_fields()) + + # Can potentially make an outside API call, but ideally is cached + # Only needed if searching on a single item type + self.item_type_es_mapping = self._get_es_mapping_if_necessary() + + @property + def forced_type_token(self): + """ Do any processing needed to be applied to self.forced_type """ + return self.forced_type.lower() + + @classmethod + def from_search(cls, context, request, s, return_generator=False): + """ Builds a SearchBuilder object with a pre-built search by skipping the bootstrap + initialization and setting self.query directly. 
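+
+ Typical usage (illustrative query body):
+ SearchBuilder.from_search(context, request, {'query': {'bool': {'should': [...]}}})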
+ + :param context: context from request + :param request: current request + :param s: dictionary of lucene query + :return: instance of this class with the search query dropped in + """ + search_builder_instance = cls(context, request, return_generator=return_generator, skip_bootstrap=True) # bypass (most of) bootstrap + search_builder_instance.query = s # parse compound query + return search_builder_instance + + @staticmethod + def build_search_types(types, doc_types): + """ Builds search_types based on the given doc_types + :param types: TypesTool from the registry + :param doc_types: Type names we would like to search on + :return: search_types, or a list of 'SearchResults' type candidates + """ + search_types = [] + if len(doc_types) == 1: # if we have one, add it and its base_type + ti = types[doc_types[0]] + search_types.append(ti.name + "SearchResults") + if hasattr(ti, 'base_types'): + for base_type in ti.base_types: + search_types.append(base_type + "SearchResults") + + # If we have more than one, compute and add common ancestors to search_types + # TODO: handle more than 2 common ancestors + else: + base_types = [] + for ti in doc_types: + if hasattr(types[ti], 'base_types'): + base_types.append(set(types[ti].base_types)) + common_ancestors = reduce(lambda x, y: x & y, base_types) + if not common_ancestors: + raise HTTPBadRequest("Tried to search on types with no common ancestor. This should never happen.") + + for ancestor in common_ancestors: + if ancestor != "Item": + search_types.append(ancestor + "SearchResults") + search_types.append("ItemSearchResults") + return search_types + + @staticmethod + def normalize_query(request, types, doc_types): + """ + Normalize the query by calculating and setting request.normalized_params + (a webob MultiDict) that is derived from custom query rules and also + the list of doc_types specified by set_doc_types(). The normalize_param + helper function finds field_schema for each query parameter and enforces + a set of rules (see below). If the query item types differ from doc_types, + override with doc_types + + :param request: the current Request + :param types: registry[TYPES] + :param doc_types: item_types to use for the search + + :returns: query string built from normalized params + """ + + # TODO: Optimize method structure here, see C4-71 PR comments -Will 6/24/2020 + def normalize_param(key, val): + """ + Process each key/val in the original query param. As part of this, + obtain the field schema for each parameter. Changes the query string + to redirect the search to the normalized parameters + Current rules: + - for 'type', get name from types (from the registry) + - append '.display_title' to any terminal linkTo query field + - append '.display_title' to sorts on linkTo fields + """ + # type param is a special case. 
use the name from TypeInfo + if key == 'type' and val in types: + return key, types[val].name + + # if key is sort, pass val as the key to this function + # if it appends display title we know its a linkTo and + # should be treated as such + if key == 'sort': + # do not use '-' if present + sort_val = val[1:] if val.startswith('-') else val + new_val, _ = normalize_param(sort_val, None) + if new_val != sort_val: + val = val.replace(sort_val, new_val) + return key, val + + # find schema for field parameter and drill down into arrays/subobjects + field_schema = schema_for_field(key, request, doc_types) + while field_schema and ('items' in field_schema or 'properties' in field_schema): + try: + field_schema = field_schema['items'] + except KeyError: + pass + try: + field_schema = field_schema['properties'] + except KeyError: + pass + if field_schema and 'linkTo' in field_schema: + # add display_title to terminal linkTo query fields + if key.endswith('!'): # handle NOT + return key[:-1] + '.display_title!', val + return key + '.display_title', val + else: + return key, val + + # use a MultiDict to emulate request.params + # TODO: Evaluate whether or not MultiDict is really useful here -Will 6/24/2020 + normalized_params = MultiDict( + normalize_param(k, v) + for k, v in request.params.items() + ) + # overwrite 'type' if not equal to doc_types to ensure consistency + if set(normalized_params.getall('type')) != set(doc_types): + if 'type' in normalized_params: + del normalized_params['type'] + for dtype in doc_types: + normalized_params.add('type', dtype) + + # add the normalized params to the request + # these will be used in place of request.params for the rest of search + setattr(request, 'normalized_params', normalized_params) + + # the query string of the normalized search + qs = '?' + urlencode([ # XXX: do we actually need to encode k,v individually? -Will 6/24/2020 + (k.encode('utf-8'), v.encode('utf-8')) + for k, v in request.normalized_params.items() + ]) + return qs + + def prepare_search_term(self, request): + """ + Prepares search terms by making a dictionary where the keys are fields and the values are arrays + of query strings. This is an intermediary format which will be modified when constructing the + actual search query. + + Ignore certain keywords, such as type, format, and field + + :param request: current request + :return: dictionary mapping field --> query strings + """ + prepared_terms = {} + for field, val in request.normalized_params.items(): + if (field.startswith('validation_errors') or + field.startswith('aggregated_items') or + field == self.ADDITIONAL_FACETS): + continue + elif field == 'q': # searched string has field 'q' + # people shouldn't provide multiple queries, but if they do, + # combine them with AND logic + if 'q' in prepared_terms: + join_list = [prepared_terms['q'], val] + prepared_terms['q'] = ' AND '.join(join_list) + else: + prepared_terms['q'] = val + elif field not in COMMON_EXCLUDED_URI_PARAMS + ['type']: + if 'embedded.' + field not in prepared_terms: + prepared_terms['embedded.' + field] = [] + prepared_terms['embedded.' + field].append(val) + return prepared_terms + + @staticmethod + def set_doc_types(request, types, search_type): + """ + Set the type of documents resulting from the search; order and check for + invalid types as well. If a forced search_type is enforced, use that; + otherwise, set types from the query params. Default to Item if none set. 
+ + :param request: the current Request + :param types: registry[TYPES] + :param search_type: forced search item type + + :returns: list: the string item types to use for the search + :raises: HTTPBadRequest: if an invalid item type is supplied + """ + if search_type is None: + doc_types = request.params.getall('type') + if '*' in doc_types: + doc_types = ['Item'] + else: + doc_types = [search_type] + # Normalize to item_type + try: + doc_types = sorted({types[name].name for name in doc_types}) + except KeyError: + # Check for invalid types + bad_types = [t for t in doc_types if t not in types] + msg = "Invalid type: {}".format(', '.join(bad_types)) + raise HTTPBadRequest(explanation=msg) + if len(doc_types) == 0: + doc_types = ['Item'] + return doc_types + + def add_search_header_if_needed(self): + """ + Get static section (if applicable) when searching a single item type + Note: Because we rely on 'source', if the static_section hasn't been indexed + into Elasticsearch it will not be loaded + + Only check for this if the item type is declared to have one in + the class constant SEARCH_INFO_HEADER_TYPES + """ + if (len(self.doc_types) == 1 and 'Item' not in self.doc_types and + self.doc_types[0] in self.SEARCH_INFO_HEADER_TYPES): + search_term = 'search-info-header.' + self.doc_types[0] + # XXX: this could be cached application side as well + try: + static_section = self.request.registry['collections']['StaticSection'].get(search_term) + except Exception: # NotFoundError not caught, search could fail + static_section = None + if static_section and hasattr(static_section.model, 'source'): # extract from ES structure + item = static_section.model.source['object'] + self.response['search_header'] = {} + self.response['search_header']['content'] = item.get('content', 'Content Missing') + self.response['search_header']['title'] = item.get('title', item['display_title']) + self.response['search_header']['filetype'] = item.get('filetype', 'No filetype') + elif static_section and hasattr(static_section.model, 'data'): # extract form DB structure + item = static_section.upgrade_properties() + self.response['search_header'] = {} + self.response['search_header']['content'] = item.get('body', 'Content Missing') + self.response['search_header']['title'] = item.get('title', 'No title') + self.response['search_header']['filetype'] = item.get('filetype', 'No filetype') + else: + pass # no static header found + + def set_pagination(self): + """ + Fill from_ and size parameters for search if given in the query string + """ + from_ = self.request.normalized_params.get('from', 0) + size = self.request.normalized_params.get('limit', self.PAGINATION_SIZE) + if size in ('all', ''): + size = "all" + else: + try: + size = int(size) + except ValueError: + size = self.PAGINATION_SIZE + try: + from_ = int(from_) + except ValueError: + from_ = 0 + self.from_, self.size = from_, size + + def build_type_filters(self): + """ + Set the type filters for the search. If no doc_types, default to Item. + This also sets the facet filter override, allowing you to apply custom facet ordering + by specifying the FACET_ORDER_OVERRIDE field on the type definition. See VariantSample + or _sort_custom_facets for examples. 
+ """ + if not self.doc_types: + self.doc_types = ['Item'] + else: + for item_type in self.doc_types: + ti = self.types[item_type] + if hasattr(ti, 'factory'): # if not abstract + self.facet_order_overrides.update(getattr(ti.factory, 'FACET_ORDER_OVERRIDE', {})) + + qs = urlencode([ + (k.encode('utf-8'), v.encode('utf-8')) + for k, v in self.request.normalized_params.items() + if k != "limit" and k != "from" and not (k == 'type' and self.types.all.get('Item' if v == '*' else v) is ti) + ]) + self.response['filters'].append({ + 'field': 'type', + 'term': ti.name, + 'remove': '{}?{}'.format(self.request.path, qs) + }) + + def clear_filters_setup(self): + """ + Clear Filters URI path + + Make a URI path that clears all non-datatype filters + and leaves in `q` (search query) params, if present. + Also preserves currentAction=selection, if is set. + + :returns: A URL path + """ + seach_query_specs = self.request.normalized_params.getall('q') + seach_query_url = urlencode([("q", seach_query) for seach_query in seach_query_specs]) + # types_url will always be present (always >=1 doc_type) + types_url = urlencode([("type", typ) for typ in self.doc_types]) + current_action = self.request.normalized_params.get('currentAction') + + clear_qs = types_url or '' + if seach_query_url: + clear_qs += '&' + seach_query_url + if current_action == 'selection': + clear_qs += '¤tAction=selection' + current_search_sort = self.request.normalized_params.getall('sort') + current_search_sort_url = urlencode([("sort", s) for s in current_search_sort]) + if current_search_sort_url: + clear_qs += '&' + current_search_sort_url + return self.request.route_path(self.forced_type_token, slash='/') + (('?' + clear_qs) if clear_qs else '') + + def initialize_search_response(self): + """ Initializes the search response """ + self.response = { + '@context': self.request.route_path('jsonld_context'), + '@id': '/' + self.forced_type_token + '/' + self.search_base, + '@type': self.search_types, + 'title': self.forced_type, + 'filters': [], + 'facets': [], + '@graph': [], + 'notification': '', + 'sort': {}, + 'clear_filters': self.clear_filters_setup() + } + self.add_search_header_if_needed() + self.set_pagination() + self.build_type_filters() + + def list_source_fields(self): + """ + Returns set of fields that are requested by user or default fields. + These fields are used to further limit the results from the search. + Note that you must provide the full fieldname with embeds, such as: + 'field=biosample.biosource.individual.organism.name' and not just + 'field=name' + """ + fields_requested = self.request.normalized_params.getall('field') + if fields_requested: + fields = ['embedded.@id', 'embedded.@type'] + for field in fields_requested: + fields.append('embedded.' + field) + elif self.search_frame == 'embedded': + fields = [self.search_frame + '.*'] + elif self.search_frame in ['object', 'raw']: + # frame=raw corresponds to 'properties' in ES + if self.search_frame == 'raw': + frame = 'properties' + else: + frame = self.search_frame + # let embedded be searched as well (for faceting) + fields = ['embedded.*', frame + '.*'] + else: + fields = ['embedded.*'] + return fields + + def build_query(self): + """ + Bootstraps our query format, building the q= part of the query if one is specified. + If multiple are specified the first one that occurs in the URL will be used. 
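+
+ With q=cancer (hypothetical search term), the resulting body is roughly:
+ {'query': {'bool': {'must': {'simple_query_string': {'query': 'cancer', 'fields': ['full_text'],
+ 'lenient': True, 'default_operator': 'AND'}}}}}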
+ """ + query_info = {} + string_query = None + query_dict = {'query': {'bool': {}}} + # handle field, frame + self.query['_source'] = self.source_fields + # locate for 'q' query, if any + for field, value in self.prepared_terms.items(): + if field == 'q': + query_info['query'] = value + query_info['lenient'] = True + query_info['default_operator'] = 'AND' + query_info['fields'] = ['full_text'] + string_query = {'must': {'simple_query_string': query_info}} + query_dict = {'query': {'bool': string_query}} + break + self.query.update(query_dict) + self.string_query = string_query + + def set_sort_order(self): + """ + sets sort order for elasticsearch results + example: /search/?type=Biosource&sort=display_title + will sort by display_title in ascending order. To set descending order, + use the "-" flag: sort_by=-date_created. + Sorting is done alphatbetically, case sensitive by default. + + ES5: simply pass in the sort OrderedDict into search.sort + """ + + # Prefer sort order specified in request, if any + requested_sorts = self.request.normalized_params.getall('sort') + text_search = self.prepared_terms.get('q') + sort, result_sort = build_sort_dicts(requested_sorts, self.request, self.doc_types, text_search) + + # Otherwise we use a default sort only when there's no text search to be ranked + if not sort and text_search and text_search != '*': + # Multi-level sort. See http://www.elastic.co/guide/en/elasticsearch/guide/current/_sorting.html#_multilevel_sorting & https://stackoverflow.com/questions/46458803/python-elasticsearch-dsl-sorting-with-multiple-fields + self.query['sort'] = [{'_score': {"order": "desc"}}, + {'embedded.date_created.raw': {'order': 'desc', 'unmapped_type': 'keyword'}, + 'embedded.label.raw': {'order': 'asc', 'unmapped_type': 'keyword', 'missing': '_last'}}, + {'_id': {'order': 'asc'}} # ES7 - _uid removed, now use _id + ] + # 'embedded.uuid.raw' (instd of _id) sometimes results in 400 bad request : 'org.elasticsearch.index.query.QueryShardException: No mapping found for [embedded.uuid.raw] in order to sort on' + + self.response['sort'] = result_sort = {'_score': {"order": "desc"}} + + if sort and result_sort: + self.response['sort'] = result_sort + self.query['sort'] = sort + + def _initialize_additional_facets(self, facets_so_far, current_type_schema): + """ Helper function for below method that handles additional_facets URL param + + :param facets_so_far: list to add additional_facets to + :param current_type_schema: schema of the item we are faceting on + """ + for extra_facet in self.additional_facets: + aggregation_type = 'terms' # default + + # determine if nested + if self.item_type_es_mapping and find_nested_path(extra_facet, self.item_type_es_mapping): + aggregation_type = 'nested' # handle nested + + # check if defined in facets + if 'facets' in current_type_schema: + schema_facets = current_type_schema['facets'] + if extra_facet in schema_facets: + if not schema_facets[extra_facet].get('disabled', False): + facets_so_far.append((extra_facet, schema_facets[extra_facet])) + continue # if we found the facet, always continue from here + + # not specified as facet - infer range vs. term based on schema + field_definition = schema_for_field(extra_facet, self.request, self.doc_types) + if not field_definition: # if not on schema, try "terms" + facets_so_far.append(( + extra_facet, {'title': extra_facet.title()} + )) + else: + t = field_definition.get('type', None) + if not t: + log.error('Encountered an additional facet that has no type! 
%s' % field_definition) + continue # drop this facet + + # terms for string + if t == 'string': + facets_so_far.append(( + extra_facet, {'title': extra_facet.title(), 'aggregation_type': aggregation_type} + )) + else: # try stats + aggregation_type = 'stats' + facets_so_far.append(( + extra_facet, { + 'title': field_definition.get('title', extra_facet.title()), + 'aggregation_type': aggregation_type, + 'number_step': 'any' + } + )) + + def initialize_facets(self): + """ + Initialize the facets used for the search. If searching across multiple + doc_types, only use the default 'Data Type' and 'Status' facets. + Add facets for custom url filters whether or not they're in the schema. + TODO: clean up this method - see comments in C4-71 PR + + :param doc_types: Item types (@type) for which we are performing a search for. + :param prepared_terms: terms to match in ES, keyed by ES field name. + :param schemas: List of OrderedDicts of schemas for doc_types. + + :returns: list: tuples containing (0) ElasticSearch-formatted field name (e.g. `embedded.status`) + and (1) list of terms for it. + """ + if len(self.doc_types) > 1: # only provide this if we are searching on more than one type + facets = [ + # More facets will be appended to this list from item schema plus from any currently-active filters (as requested in URI params). + ('type', {'title': 'Data Type'}) + ] + else: + facets = [] + + append_facets = [ + # Facets which will be appended after those which are in & added to `facets` + # ('status', {'title': 'Status'}), XXX: uncomment this if you want status facet + + # TODO: Re-enable below line if/when 'range' URI param queries for date & numerical fields are implemented. + # ('date_created', {'title': 'Date Created', 'hide_from_view' : True, 'aggregation_type' : 'date_histogram' }) + ] + validation_error_facets = [ + ('validation_errors.name', { 'title': 'Validation Errors', 'order': 999 }) + ] + current_type_schema = self.request.registry[TYPES][self.doc_types[0]].schema + self._initialize_additional_facets(append_facets, current_type_schema) + # hold disabled facets from schema; we also want to remove these from the prepared_terms facets + disabled_facet_fields = set() + + # Add facets from schema if one Item type is defined. + # Also, conditionally add extra appendable facets if relevant for type from schema. + if len(self.doc_types) == 1 and self.doc_types[0] != 'Item': + if 'facets' in current_type_schema: + schema_facets = OrderedDict(current_type_schema['facets']) + for schema_facet in schema_facets.items(): + if schema_facet[1].get('disabled', False) or schema_facet[1].get(self.DEFAULT_HIDDEN, False): + disabled_facet_fields.add(schema_facet[0]) + continue # Skip disabled facets. + facets.append(schema_facet) + + # Add facets for any non-schema ?field=value filters requested in the search (unless already set, via used_facet_fields) + used_facet_fields = set() + used_facet_titles = set() + for facet in facets + append_facets: + used_facet_fields.add(facet[0]) + if 'title' in facet[1]: + used_facet_titles.add(facet[1]['title']) + + for field in self.prepared_terms: + if field.startswith('embedded'): + + # Will become, e.g. ['embedded', 'experiments_in_set', 'files', 'file_size', 'from'] + split_field = field.strip().split('.') + use_field = '.'.join(split_field[1:]) # e.g. 
"experiments_in_set.files.file_size.from" + + if use_field in used_facet_fields or use_field in disabled_facet_fields: + # Cancel if already in facets or is disabled (first check, before more broad check re: agg_type:stats, etc) + continue + + # Use the last part of the split field to get the field title + title_field = split_field[-1] + + # workaround: if query has a '!=' condition, title_field ends with '!'. This prevents to find the proper display title. + # TODO: instead of workaround, '!' could be excluded while generating query results + if use_field.endswith('!'): + use_field = use_field[:-1] + title_field = title_field[:-1] + + # if searching for a display_title, use the title of parent object + # use `is_object_title` to keep track of this + if title_field == 'display_title' and len(split_field) > 1: + title_field = split_field[-2] + is_object_title = True + else: + is_object_title = False + + + # 'terms' is the default per-term bucket aggregation for all non-schema facets + if self.item_type_es_mapping and find_nested_path(field, self.item_type_es_mapping): + aggregation_type = 'nested' + else: + aggregation_type = 'terms' + + # If we have a range filter in the URL, strip out the ".to" and ".from" + if title_field == 'from' or title_field == 'to': + if len(split_field) >= 3: + f_field = ".".join(split_field[1:-1]) + field_schema = schema_for_field(f_field, self.request, self.doc_types) + + if field_schema: + # field could be a date, numerical, or array of numerical + if (is_date_field(field, field_schema) or + is_numerical_field(field_schema) or + is_array_of_numerical_field(field_schema)): + title_field = field_schema.get("title", f_field) + use_field = f_field + aggregation_type = 'stats' + + # At moment is equivalent to `if aggregation_type == 'stats'` until/unless more agg types are added for _facets_. + if aggregation_type == 'stats': + # Remove completely if duplicate (e.g. don't need to have` .from` and `.to` both present) + if use_field in used_facet_fields or use_field in disabled_facet_fields: + continue + # Facet would be otherwise added twice if both `.from` and `.to` are requested. + + for schema in self.schemas: + if title_field in schema['properties']: + title_field = schema['properties'][title_field].get('title', title_field) + # see if the title field conflicts for is_object_title facets + if is_object_title and title_field in used_facet_titles: + title_field += ' (Title)' + break + + used_facet_fields.add(use_field) + facets.append(( + use_field, + { 'title': title_field, 'aggregation_type': aggregation_type } + )) + + # Append additional facets (status, validation_errors, ...) at the end of + # list unless were already added via schemas, etc. + used_facet_fields = { facet[0] for facet in facets } # Reset this + for ap_facet in append_facets + validation_error_facets: + if ap_facet[0] not in used_facet_fields: + used_facet_fields.add(ap_facet[0]) + facets.append(ap_facet) + return facets + + def assure_session_id(self): + """ Add searchSessionID information if not part of a sub-request, a generator or a limit=all search """ + if ( + self.request.__parent__ is None and + not getattr(self, "return_generator", None) and + getattr(self, "size", 25) != "all" + ): # Probably unnecessary, but skip for non-paged, sub-reqs, etc. 
+ self.search_session_id = self.request.cookies.get('searchSessionID', 'SESSION-' + str(uuid.uuid1())) + + def build_search_query(self): + """ Builds the search query utilizing a combination of helper methods within this class + to build intermediary structures and LuceneBuilder function calls to handle building + the actual Elasticsearch query. + """ + self.build_query() + self.set_sort_order() + + # Transform into filtered search + self.query, query_filters = LuceneBuilder.build_filters(self.request, self.query, self.response, + self.principals, self.doc_types, + self.item_type_es_mapping) + # Prepare facets in intermediary structure + self.facets = self.initialize_facets() + + # Transform filter search into filter + faceted search + self.query = LuceneBuilder.build_facets(self.query, self.facets, query_filters, self.string_query, + self.request, self.doc_types, self.custom_aggregations, self.size, + self.from_, self.item_type_es_mapping) + + # Add preference from session, if available + # This just sets the value on the class - it is passed to execute_search later + self.assure_session_id() + + @staticmethod + def fix_and_replace_nested_doc_count(result_facet, aggregations, full_agg_name): + """ + 3 things must happen here (all occurring by side-effect, not value): + 1. front-end does not care about 'nested', only what the inner thing is, so lets pretend (so it doesn't break) + 2. We must overwrite the "second level" doc_count with the "third level" because the "third level" + is the 'root' level doc_count, which is what we care about, NOT the nested doc count + 3. We must then re-sort the aggregations so they show up in from greatest to least doc_count wrt the root + level count instead of the "old" nested doc count. + + :param result_facet: facet to be created - 'aggregation_type' is overwritten as 'terms' + :param aggregations: handle to all aggregations that we can access based on name + :param full_agg_name: full name of the aggregation + """ + result_facet['aggregation_type'] = 'terms' + term_to_bucket = {} # so we can deduplicate keys + source_aggregation = aggregations[full_agg_name]['primary_agg'] + primary_buckets = source_aggregation['primary_agg']['buckets'] + for bucket in primary_buckets: + if 'primary_agg_reverse_nested' in bucket: + bucket['doc_count'] = bucket['primary_agg_reverse_nested']['doc_count'] + if bucket['key'] not in term_to_bucket: + term_to_bucket[bucket['key']] = bucket + if 'requested_agg' in source_aggregation: + requested_buckets = source_aggregation['requested_agg']['buckets'] + for bucket in requested_buckets: + if 'primary_agg_reverse_nested' in bucket: + bucket['doc_count'] = bucket['primary_agg_reverse_nested']['doc_count'] + if bucket['key'] not in term_to_bucket: + term_to_bucket[bucket['key']] = bucket + + result_facet['terms'] = sorted(list(term_to_bucket.values()),key=lambda d: d['primary_agg_reverse_nested']['doc_count'], reverse=True) + + def format_facets(self, es_results): + """ + This method processes the 'aggregations' component of the ES response. + It does this by creating result_facet frames for all facets retrieved from ES + and populating the frame with the relevant aggregation info depending on it's type. + + Format the facets for the final results based on the es results. + Sort based off of the 'order' of the facets + These are stored within 'aggregations' of the result. 
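+
+        Each entry of the returned list is roughly of the form (illustrative values):
+            {'field': 'status', 'title': 'Status', 'total': 42, 'aggregation_type': 'terms',
+             'terms': [{'key': 'released', 'doc_count': 40}, ...]}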
+ + If the frame for the search != embedded, return no facets + """ + result = [] + if self.search_frame != 'embedded': + return result + + # Loading facets in to the results + if 'aggregations' not in es_results: + return result + + aggregations = es_results['aggregations']['all_items'] + + # Sort facets by order (ascending). + # If no order is provided, assume 0 to + # retain order of non-explicitly ordered facets + for field, facet in sorted(self.facets, key=lambda fct: fct[1].get('order', 10000)): + if facet.get(self.DEFAULT_HIDDEN, False) and field not in self.additional_facets: # skip if specified + continue + + # Build result frame for the front-end + result_facet = { + 'field': field, + 'title': facet.get('title', field), + 'total': 0 + # To be added depending on facet['aggregation_type']: 'terms', 'min', 'max', 'min_as_string', 'max_as_string', ... + } + + result_facet.update({k: v for k, v in facet.items() if k not in result_facet.keys()}) + field_agg_name = field.replace('.', '-') + full_agg_name = facet['aggregation_type'] + ':' + field_agg_name + + if full_agg_name in aggregations: # found an agg for this field + + # process stats agg + if facet['aggregation_type'] == 'stats': + result_facet['total'] = aggregations[full_agg_name]['doc_count'] + # Used for fields on which can do range filter on, to provide min + max bounds + for k in aggregations[full_agg_name]['primary_agg'].keys(): + result_facet[k] = aggregations[full_agg_name]['primary_agg'][k] + + # nested stats aggregations have a second "layer" for reverse_nested + elif facet['aggregation_type'] == 'nested:stats': + result_facet['total'] = aggregations[full_agg_name]['primary_agg']['doc_count'] + for k in aggregations[full_agg_name]['primary_agg']['primary_agg'].keys(): + result_facet[k] = aggregations[full_agg_name]['primary_agg']['primary_agg'][k] + + # process range agg + elif facet['aggregation_type'] in ['range', 'nested:range']: + # Shift the bucket location + bucket_location = aggregations[full_agg_name]['primary_agg'] + if 'buckets' not in bucket_location: # account for nested structure + bucket_location = bucket_location['primary_agg'] + + # TODO - refactor ? 
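+                    # e.g. (illustrative) a facet range declared as {'from': 0, 'to': 100} takes its
+                    # doc_count from the ES bucket whose 'from'/'to' bounds match it exactly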
+ # merge bucket labels from ranges into buckets + for r in result_facet['ranges']: + for b in bucket_location['buckets']: + if (r.get('from', self.MISSING) == b.get('from', self.MISSING) and + r.get('to', self.MISSING) == b.get('to', self.MISSING)): + r['doc_count'] = b['doc_count'] + break + + # process terms agg + else: + # do the below, except account for nested agg structure + if facet['aggregation_type'] == NESTED: + self.fix_and_replace_nested_doc_count(result_facet, aggregations, full_agg_name) + result.append(result_facet) + continue + + def extract_buckets(path): + if 'buckets' not in path: + path = path['primary_agg'] + if 'buckets' not in path: + raise Exception('No buckets found on terms agg!') + return path['buckets'] + + term_to_bucket = {} # so we can deduplicate keys + source_aggregation = aggregations[full_agg_name] + if 'requested_agg' in source_aggregation: + for bucket in extract_buckets(source_aggregation['requested_agg']): + if bucket['key'] not in term_to_bucket: + term_to_bucket[bucket['key']] = bucket + + # always present + for bucket in extract_buckets(source_aggregation['primary_agg']): + if bucket['key'] not in term_to_bucket: + term_to_bucket[bucket['key']] = bucket + + # bring in rescue terms + if self.RESCUE_TERMS in facet: + for rescue_term in facet[self.RESCUE_TERMS]: + if rescue_term not in term_to_bucket: + term_to_bucket[rescue_term] = { + 'key': rescue_term, + 'doc_count': 0 + } + + result_facet['terms'] = list(term_to_bucket.values()) + + # XXX: not clear this functions as intended - Will 2/17/2020 + if len(aggregations[full_agg_name].keys()) > 2: + result_facet['extra_aggs'] = {k: v for k, v in aggregations[full_agg_name].items() if + k not in ('doc_count', 'primary_agg')} + + # Minor improvement for UI/UX -- if no 'validation errors' facet is included + # but only 1 "No value" term, then exclude the facet so is not shown + if field == "validation_errors.name": + validation_errors_terms_len = len(result_facet["terms"]) + if ( + validation_errors_terms_len == 0 or + (validation_errors_terms_len == 1 and result_facet["terms"][0]["key"] == "No value") + ): + continue + + result.append(result_facet) + + # TODO ALEX: Client will reject 'nested:stats' so overwritten here. + # Ideally, the client should accept 'stats', 'terms', 'nested:terms', 'nested:stats', + # and just treat the nested aggs exactly the same. 
+ for facet in result: + agg_type = facet.get("aggregation_type") + override = None + if agg_type == 'nested:stats': + override = 'stats' + elif agg_type == 'nested:range': + override = 'range' + + # apply override + if override is not None: + facet["aggregation_type"] = override + + return result + + @staticmethod + def format_extra_aggregations(es_results): + """ + Extracts any extra aggregations results returned from elasticsearch + + :param es_results: dictionary response from es + :return: dictionary mapping field -> agg_value, varies based on type of aggregation + """ + if 'aggregations' not in es_results: + return {} + return {k: v for k, v in es_results['aggregations'].items() + if k != 'all_items'} + + def get_collection_actions(self): + """ + Use this method to see actions available on an item type (collection) in the request context + + :return: actions available for this collection at this time + """ + type_info = self.types[self.doc_types[0]] # only care about the "first" collection + collection = self.request.registry[COLLECTIONS].get(type_info.name) + if hasattr(collection, 'actions'): + return collection.actions(self.request) + else: + return None + + @staticmethod + def build_initial_columns(used_type_schemas): + + columns = OrderedDict() + + # Add title column, at beginning always + columns['display_title'] = { + "title": "Title", + "order": -1000 + } + + for schema in used_type_schemas: + if 'columns' in schema: + schema_columns = OrderedDict(schema['columns']) + # Add all columns defined in schema + for name, obj in schema_columns.items(): + if name not in columns: + columns[name] = obj + else: + # If @type or display_title etc. column defined in schema, then override defaults. + columns[name].update(schema_columns[name]) + + # Add status column, if not present, at end. + if 'status' not in columns: + columns['status'] = { + "title": "Status", + "default_hidden": True, + "order": 980 + } + + # Add date column, if not present, at end. + if 'date_created' not in columns: + columns['date_created'] = { + "title": "Date Created", + "colTitle": "Created", + "default_hidden": True, + "order": 1000 + } + + return columns + + def build_table_columns(self): + """ Constructs an ordered dictionary of column information to be rendered by + the front-end. If this functionality is needed outside of general search, this + method should be moved to search_utils.py. + """ + + columns = SearchBuilder.build_initial_columns(self.schemas) + + if self.request.normalized_params.get('currentAction') in ('selection', 'multiselect'): + return columns + + any_abstract_types = 'Item' in self.doc_types + if not any_abstract_types: # Check explictly-defined types to see if any are abstract. + type_infos = [self.request.registry[TYPES][t] for t in self.doc_types if t != 'Item'] + for ti in type_infos: + # We use `type` instead of `isinstance` since we don't want to catch subclasses. 
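+                # (illustrative) e.g. a concrete TypeInfo, which subclasses AbstractTypeInfo, should not flag the search as abstract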
+ if type(ti) == AbstractTypeInfo: + any_abstract_types = True + break + + # Add type column if any abstract types in search + if any_abstract_types: + columns['@type'] = { + "title": "Item Type", + "colTitle": "Type", + "order": -980, + "description": "Type or category of Item", + # Alternative below, if we want type column to be available but hidden by default in selection mode: + # "default_hidden": request.normalized_params.get('currentAction') == 'selection' + } + + return columns + + def _format_results(self, hits): + """ + Loads results to pass onto UI + Will retrieve the desired frame from the search hits and automatically + add 'validation_errors' and 'aggregated_items' frames if they are present + """ + fields_requested = self.request.normalized_params.getall('field') + if fields_requested: + frame = 'embedded' + elif self.search_frame: + frame = self.search_frame + else: + frame = 'embedded' + + if frame in ['embedded', 'object', 'raw']: + # transform 'raw' to 'properties', which is what is stored in ES + if frame == 'raw': + frame = 'properties' + for hit in hits: + frame_result = hit['_source'][frame] + if 'validation_errors' in hit['_source'] and 'validation_errors' not in frame_result: + frame_result['validation_errors'] = hit['_source']['validation_errors'] + if 'aggregated_items' in hit['_source'] and 'aggregated_items' not in frame_result: + frame_result['aggregated_items'] = hit['_source']['aggregated_items'] + yield frame_result + return + + def get_all_subsequent_results(self, extra_requests_needed_count, size_increment): + """ + Calls `execute_search` in paginated manner iteratively until all results have been yielded. + """ + from_ = 0 + while extra_requests_needed_count > 0: + # print(str(extra_requests_needed_count) + " requests left to get all results.") + from_ = from_ + size_increment + subsequent_search_result = execute_search(es=self.es, query=self.query, index=self.es_index, + from_=from_, size=size_increment, + session_id=self.search_session_id) + extra_requests_needed_count -= 1 + for hit in subsequent_search_result['hits'].get('hits', []): + yield hit + + def execute_search_for_all_results(self, size_increment=100): + """ + Returns a generator that iterates over _all_ results for search. + Calls `execute_search` in paginated manner iteratively until all results have been yielded + via `get_all_subsequent_results`. + + :param size_increment: number of results to get per page, default 100 + :return: all es_results that matched the given query + """ + es_result = execute_search(es=self.es, query=self.query, index=self.es_index, from_=0, size=size_increment, + session_id=self.search_session_id) + + total_results_expected = es_result['hits'].get('total', {}).get('value', 0) + + # Decrease by 1 (first es_result already happened) + extra_requests_needed_count = math.ceil(total_results_expected / size_increment) - 1 + + if extra_requests_needed_count > 0: + # Returns a generator as value of es_result['hits']['hits'] + # Will be returned directly if self.return_generator is true + # or converted to list if meant to be HTTP response. + # Theoretical but unnecessary future: Consider allowing to return HTTP Stream of results w. return_generator=true (?) 
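+            # e.g. (illustrative) 450 total hits with size_increment=100 -> ceil(450 / 100) - 1 = 4
+            # additional paged requests, chained lazily after the first page below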
+ es_result['hits']['hits'] = itertools.chain( + es_result['hits']['hits'], + self.get_all_subsequent_results(extra_requests_needed_count, size_increment) + ) + return es_result + + def execute_search(self): + """ Executes the search, accounting for size if necessary """ + LuceneBuilder.verify_search_has_permissions(self.request, self.query) + if self.size == 'all': + es_results = self.execute_search_for_all_results() + else: # from_, size are integers + es_results = execute_search(es=self.es, query=self.query, index=self.es_index, + from_=self.from_, size=self.size, + session_id=self.search_session_id) + return es_results + + def format_results(self, es_results): + """ Takes es_results from Elasticsearch and populates a response object based on + on the given results. + + :param es_results: search results (from elasticsearch-dsl) + """ + # Response formatting + self.response['notification'] = 'Success' + self.response['total'] = es_results['hits']['total']['value'] + self.response['facets'] = self.format_facets(es_results) + self.response['aggregations'] = self.format_extra_aggregations(es_results) + self.response['actions'] = self.get_collection_actions() + columns = self.build_table_columns() + if columns: + self.response['columns'] = columns + + if self.size not in (None, 'all') and self.size < self.response['total']: + params = [(k, v) for k, v in self.request.normalized_params.items() if k != 'limit'] + params.append(('limit', 'all')) + # do not check presence of this field, as it triggers an ES len call! - Will 30 March 2022 + # if self.context: + # self.response['all'] = '%s?%s' % (self.request.resource_path(self.context), urlencode(params)) + + # `graph` below is a generator. + # `es_results['hits']['hits']` will contain a generator instead of list + # if limit=all was requested. `self._format_results` will always return a generator + # that iterates over es_results['hits']['hits'] regardless of its structure. + graph = self._format_results(es_results['hits']['hits']) + + if self.return_generator: + # Preserve `graph` as generator. + self.response['@graph'] = graph + else: + # Convert it into list as we assume we're responding to a HTTP request by default. + self.response['@graph'] = list(graph) + + # Save session ID for re-requests / subsequent pages. + if self.search_session_id: # Is 'None' if e.g. limit=all + self.request.response.set_cookie('searchSessionID', self.search_session_id) + + def _sort_custom_facets(self): + """ Applies custom sort to facets based on a dictionary provided on the type definition + + Specify a 2-tiered dictionary mapping field names to dictionaries of key -> weight + mappings that allow us to sort generally like this: + sorted(unsorted_terms, key=lambda d: field_terms_override_order.get(d['key'], default)) + ex: + { + { + facet_field_name: { + key1: weight, + key2: weight, + key3: weight + '_default': default_weight + } + } + } + If you had field name and wanted to force a facet ordering, you + could add this to the type definition: + FACET_ORDER_OVERRIDE = { + 'name': { + 'Will': 1, + 'Bob': 2, + 'Alice': 3, + '_default': 4, + } + } + When faceting on the 'name' field, the ordering now will always be Will -> Bob -> Alice -> anything else + regardless of the actual facet counts. Note that if no default is specified weight 101 + will be assigned (MAX_FACET_COUNTS + 1). 
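+        As a worked example (illustrative), with the override above and unsorted 'name' terms
+        [{'key': 'Zed', ...}, {'key': 'Alice', ...}, {'key': 'Will', ...}], the terms come back
+        ordered Will (weight 1), Alice (3), Zed (_default 4).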
+ """ + if 'facets' in self.response: + for entry in self.response['facets']: + field = entry.get('field') + if field in self.facet_order_overrides: + field_terms_override_order = self.facet_order_overrides[field] + default = field_terms_override_order.get('_default', MAX_FACET_COUNTS + 1) + unsorted_terms = entry.get('terms', []) + entry['terms'] = sorted(unsorted_terms, key=lambda d: field_terms_override_order.get(d['key'], + default)) + + def get_response(self): + """ Gets the response for this search, setting 404 status if necessary. """ + if not self.response: + return {} # XXX: rather than raise exception? -Will + + # add query if an admin asks for it + if self.debug_is_active and is_admin_request(self.request): + self.response['query'] = self.query + + # If we got no results, return 404 or [] + if not self.response['total']: + # http://googlewebmastercentral.blogspot.com/2014/02/faceted-navigation-best-and-5-of-worst.html + self.request.response.status_code = 404 + self.response['notification'] = 'No results found' + self.response['@graph'] = [] + return self.response if not self.return_generator else [] + + # if this is a subrequest/gen request, return '@graph' directly + if self.request.__parent__ is not None or self.return_generator: + if self.return_generator: + # If self.return_generator, then self.response['@graph'] will + # contain a generator rather than a list via `self.format_results` + # TODO: Move that functionality into this method instead? + return self.response['@graph'] + + # apply custom facet filtering + self._sort_custom_facets() + + # otherwise just hand off response + return self.response + + def _build_query(self): + """ Builds the query, setting self.query """ + self.initialize_search_response() + self.build_search_query() + + def get_query(self): + """ Grabs the query object, now a dictionary """ + return self.query + + def _search(self): + """ Executes the end-to-end search. + + :returns: a search response (based on the __init__ parameters) + """ + self._build_query() + es_results = self.execute_search() + self.format_results(es_results) + return self.get_response() + + +@view_config(route_name='search', request_method='GET', permission='search') +@debug_log +def search(context, request, search_type=None, return_generator=False, forced_type='Search', custom_aggregations=None): + """ Search view connects to ElasticSearch and returns the results """ + search_builder = SearchBuilder(context, request, search_type, return_generator, forced_type, custom_aggregations) + return search_builder._search() + + +@view_config(context=AbstractCollection, permission='list', request_method='GET') +@debug_log +def collection_view(context, request): + """ + Simply use search results for collections views (e.g./biosamples/) + This is a redirect directly to the search page + """ + return search(context, request, context.type_info.name, False, forced_type='Search') + + +def get_iterable_search_results(request, search_path='/search/', param_lists=None, inherit_user=True, **kwargs): + ''' + Loops through search results, returns 100 (or search_results_chunk_row_size) results at a time. Pass it through itertools.chain.from_iterable to get one big iterable of results. + Potential TODO: Move to search_utils or other file, and have this (or another version of this) handle compound filter_sets. + + :param request: Only needed to pass to do_subreq to make a subrequest with. + :param search_path: Root path to call, defaults to /search/. 
+ :param param_lists: Dictionary of param:lists_of_vals which is converted to URL query. + :param search_results_chunk_row_size: Amount of results to get per chunk. Default should be fine. + ''' + param_lists = deepcopy(param_lists) + param_lists['limit'] = ['all'] + param_lists['from'] = [0] + param_lists['sort'] = param_lists.get('sort', 'uuid') + subreq = make_search_subreq(request, '{}?{}'.format(search_path, urlencode(param_lists, True)), inherit_user=inherit_user) + if not inherit_user: + # Perform request as if an admin. + subreq.remote_user = "UPGRADE" + if 'HTTP_COOKIE' in subreq.environ: + del subreq.environ['HTTP_COOKIE'] + return iter_search_results(None, subreq, **kwargs) + + +# Update? used in ./batch_download.py +def iter_search_results(context, request, **kwargs): + return search(context, request, return_generator=True, **kwargs) + + +_ASSEMBLY_MAPPER = { + 'GRCh38-minimal': 'hg38', + 'GRCh38': 'hg38', + 'GRCh37': 'hg19', + 'GRCm38': 'mm10', + 'GRCm37': 'mm9', + 'BDGP6': 'dm4', + 'BDGP5': 'dm3', + 'WBcel235': 'WBcel235' +} + +hgConnect = 'http://genome.ucsc.edu/cgi-bin/hgTracks?hubClear=' diff --git a/snovault/search/search_utils.py b/snovault/search/search_utils.py new file mode 100644 index 000000000..0ff0286a8 --- /dev/null +++ b/snovault/search/search_utils.py @@ -0,0 +1,477 @@ +import structlog +from collections import OrderedDict +from elasticsearch import ( + TransportError, + RequestError, + ConnectionTimeout +) +from pyramid.httpexceptions import HTTPBadRequest +from snovault import TYPES +from snovault.util import crawl_schema, find_collection_subtypes +from snovault.embed import make_subrequest +from snovault.elasticsearch.indexer_utils import get_namespaced_index +from snovault.elasticsearch.create_mapping import determine_if_is_date_field +from dcicutils.misc_utils import deduplicate_list + + +log = structlog.getLogger(__name__) + +# Constants + +# from now on, use these constants when referring to elastic search +# query keywords when writing elastic search queries - Will 3-20-2020 +QUERY = 'query' +FILTER = 'filter' +MUST = 'must' +MUST_NOT = 'must_not' +BOOL = 'bool' +MATCH = 'match' +SHOULD = 'should' +EXISTS = 'exists' +FIELD = 'field' +NESTED = 'nested' +PATH = 'path' +TERMS = 'terms' +RANGE = 'range' +STATS = 'stats' +AGGS = 'aggs' +REVERSE_NESTED = 'reverse_nested' +# just for book-keeping/readability but is 'unused' for now +# ie: it should be obvious when you are 'effectively' writing lucene +ELASTIC_SEARCH_QUERY_KEYWORDS = [ + QUERY, FILTER, MUST, MUST_NOT, BOOL, MATCH, SHOULD, EXISTS, FIELD, NESTED, PATH, TERMS, RANGE, AGGS, REVERSE_NESTED, +] + + +COMMON_EXCLUDED_URI_PARAMS = [ + # Difference of this and URL params should result in all fields/filters. 
+ 'frame', 'format', 'limit', 'sort', 'from', 'field', + 'mode', 'redirected_from', 'datastore', 'referrer', + 'currentAction', 'additional_facet', 'debug' +] +MAX_FACET_COUNTS = 100 +RAW_FIELD_AGGREGATIONS = [ + 'stats', 'nested:stats', 'date_histogram', 'histogram', 'range', 'nested:range', +] + + +# Exception Classes + + +class SearchException(Exception): + """ Base Search Exception - not meant to be used directly """ + def __init__(self, *, func, msg=None): + if msg is None: + msg = 'Exception occurred in search code at stage %s' % func + super(SearchException, self).__init__(msg) + self.func = func + + +class QueryConstructionException(SearchException): + """ + Query construction exception - throw this if we should throw an exception in query building + due to invalid query params + """ + def __init__(self, *, query_type, func, msg=None): + if msg is None: + msg = 'Exception occurred during query building at query type %s in func %s' % (query_type, func) + super(QueryConstructionException, self).__init__(func=func, msg=msg) + self.query_type = query_type + + +# Functions + + +def search_log(*, log_handler, msg, error=True): + """ Utility function intended to prepend SEARCH to all log messages. All log messages originating + in search code should use this method to log. + + :param log_handler: log handler to use + :param msg: msg to log + :param error: whether or not to log to error log. Default True, otherwise goes to DEBUG + """ + if error: + log_handler.error('SEARCH: ' + msg) + else: + log_handler.debug('SEARCH: ' + msg) + + +def convert_search_to_dictionary(search): + """ Converts the given search to a dictionary. Useful in mocking queries from dictionaries in testing. + + :param search: elasticsearch_dsl object to convert + :return: query in dictionary form + """ + return search.to_dict() + + +def find_nested_path(field, es_mapping): + """ + Returns path to 'highest' level nested field, in other words the first field mapped with type=nested + found by traversing the given field from the *top* level. + + This function relies on information about the structure of the es_mapping to extract + the *path to the object who's mapping is nested*. The comments within this function try to explain + that structure. This information is needed to construct nested queries (it is the path). + It returns None if the given (sub)field is not a member of a type=nested mapped field. + + :param field: the *full path* to the field we are filtering/aggregating on. + For example: "experiments_in_set.biosample.biosource.individual.organism.name" + :param es_mapping: dictionary representation of the es_mapping of the type we are searching on + :return: path for nested query or None + """ + location = es_mapping + possible_nested_paths = [] + path = [] + for cursor in field.split('.'): + if cursor == 'raw': # if we get to this point we're definitely at a leaf and should stop + break + if cursor not in location: # its possible we are at a sub-embedded object boundary. Check if it has properties. + if 'properties' not in location: # if it doesn't have properties, there's nowhere to go, so return None. 
+                return None
+            location = location['properties']  # else move location forward, but do not add it to the PATH
+            if cursor not in location:  # if we still don't see our 'level', we are not a nested field
+                break  # accumulated path will be discarded (not added to possible_nested_paths)
+        location = location[cursor]
+        path.append(cursor)
+        if location.get('type', None) == 'nested':  # this could be a path
+            possible_nested_paths.append('.'.join(path))
+    # the last path added is the closest in proximity to the field and thus is correct
+    return possible_nested_paths[-1] if possible_nested_paths else None
+
+
+def is_schema_field(field):
+    """ Returns whether or not we should expect a schema to be found for the given field.
+        Currently this only applies to validation_errors and aggregated_items.
+
+    :param field: field name to check
+    :return: False if this field doesn't have a schema, True otherwise
+    """
+    # XXX: Consider doing this with regex? - Will 6/11/2020
+    if field.startswith('validation_errors') or field.startswith('aggregated_items'):  # note that trailing '.' is gone
+        return False
+    return True
+
+
+def extract_field_name(field):
+    """ Pre-processes 'field' from URL query params. Solely handles converting 'type' to '@type' and
+        discarding the not (!) qualifier.
+
+    :param field: field name to process
+    :return: correct field_name to search on
+    """
+    use_field = '@type' if field == 'type' else field  # 'type' field is really '@type' in the schema
+    return use_field[:-1] if use_field.endswith('!') else use_field
+
+
+def schema_for_field(field, request, doc_types, should_log=False):
+    """
+    Find the schema for the given field (in embedded '.' format). Uses
+    crawl_schema from snovault.util and logs any cases where there is an
+    error finding the field from the schema. Caches results based off of field
+    and doc types used
+
+    :param field: embedded field path, separated by '.'
+    :param request: current Request object
+    :param doc_types (list): @types for the search
+    :param should_log (bool): logging will only occur if set to True
+
+    :returns: Dictionary schema for the field, or None if not found
+    """
+    types = request.registry[TYPES]
+    schemas = [types[dt].schema for dt in doc_types]
+    field_schema = None
+
+    # We cannot hash dict by list (of doc_types) so we convert to unique ordered string
+    doc_type_string = ','.join(sorted(doc_types))  # use default sort
+
+    # Check cache, initializing if necessary
+    cache = getattr(request, '_field_schema_cache', None)
+    cache_key = (field, doc_type_string)
+    if cache is None:
+        request._field_schema_cache = cache = {}
+    if cache_key in cache:
+        return cache[cache_key]
+
+    # for 'validation_errors.*' and 'aggregated_items.*',
+    # schema will never be found and logging isn't helpful
+    if schemas and is_schema_field(field):
+        use_field = extract_field_name(field)
+        for schema in schemas:
+            try:
+                field_schema = crawl_schema(types, use_field, schema)
+            except Exception as exc:  # cannot find schema. Log and Return None
+                if should_log:
+                    log.warning('Cannot find schema in search.py. Type: %s. Field: %s'
+                                % (doc_types[0], field), field=field, error=str(exc))
+            else:
+                if field_schema is not None:
+                    break
+
+    # Cache result, even if not found, for this request.
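+    # e.g. (illustrative) key ('biosample.biosource.individual.organism.name', 'Biosample') -> schema dict or None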
+ cache[cache_key] = field_schema + + return field_schema + + +def get_query_field(field, facet): + """ + Converts a field from its generic field name to a more specific field name referencing its embedded nature + + :param field: generic field name, such as 'files.accession' + :param facet: facet on this field + :return: full path to field on ES mapping + """ + if field == 'type': + return 'embedded.@type.raw' + elif not is_schema_field(field): + return field + '.raw' + elif facet.get('aggregation_type') in RAW_FIELD_AGGREGATIONS: + return 'embedded.' + field + else: + return 'embedded.' + field + '.raw' + + +def find_index_by_doc_types(request, doc_types, ignore): + """ + Find the correct index(es) to be search given a list of doc_types. + The types in doc_types are the item class names, formatted like + 'Experiment HiC' and index names are the item types, formatted like + 'experiment_hi_c'. + Ignore any collection names provided in the ignore param, an array. + Formats output indexes as a string usable by elasticsearch + """ + indexes = [] + for doc_type in doc_types: + if doc_type in ignore: + continue + else: + result = find_collection_subtypes(request.registry, doc_type) + namespaced_results = map(lambda t: get_namespaced_index(request, t), result) + indexes.extend(namespaced_results) + indexes = deduplicate_list(indexes) + index_string = ','.join(indexes) + return index_string + + +def get_es_index(request, doc_types): + """ + Gets ES index based on doc_type (one type per index) + if doc_type is item, search all indexes by setting es_index to None + If multiple, search all specified + + :param request: current request, to be passed + :param doc_types: item types we are searching on + :return: index name + """ + if 'Item' in doc_types: + return get_namespaced_index(request, '*') + else: + return find_index_by_doc_types(request, doc_types, ['Item']) + + +def get_es_mapping(es, es_index): + """ + Get es mapping for given doc type (so we can handle type=nested) + Note this is the mechanism by which we "enable" the ability to do nested searches + ie: only enabled on single index searches. You could feasibly add more criteria. + + :param es: elasticsearch client + :param es_index: index to get mapping from + :return: the mapping for this item type or {} if we are not doing a single index search + """ + if '*' in es_index or ',' in es_index: # no type=nested searches can be done on * or multi-index + return {} + else: + index = es.indices.get(es_index) + return index[es_index]['mappings']['properties'] + + +def get_search_fields(request, doc_types): + """ + Returns set of columns that are being searched and highlights + XXX: Unused + """ + fields = {'uuid'} + highlights = {} + types = request.registry[TYPES] + for doc_type in doc_types: + type_info = types[doc_type] + for value in type_info.schema.get('boost_values', ()): + fields.add('embedded.' + value) + highlights['embedded.' + value] = {} + return fields, highlights + + +def is_linkto_or_object_array_root_field(field, types, doc_types): + """ + Not used currently. 
+ + :param field: field to check + :param types: registry types + :param doc_types: types we are searching on + :return: infer whether or not this field is mapped with type=nested based on the schema alone + """ + schema = types[doc_types[0]].schema + field_root = field.split('.')[0] + fr_schema = (schema and schema.get('properties', {}).get(field_root, None)) or None + if fr_schema and fr_schema['type'] == 'array' and (fr_schema['items'].get('linkTo') is not None or fr_schema['items']['type'] == 'object'): + return True + return False + + +def execute_search(*, es, query, index, from_, size, session_id=None): + """ + Execute the given Elasticsearch-dsl search. Raise HTTPBadRequest for any + exceptions that arise. + + :param es: handle to es + :param query: dictionary representing ES query + :param index: index to search + :param from_: search start index + :param size: # of records to return + :param session_id: session if we are paginating + :returns: Dictionary search results + """ + err_exp = None + es_results = None + try: + # set timeout + es_results = es.search(index=index, body=query, from_=from_, size=size, timeout='30s', preference=session_id) + except ConnectionTimeout: + err_exp = 'The search failed due to a timeout. Please try a different query.' + except RequestError as exc: + # try to get a specific error message. May fail in some cases + try: + err_detail = str(exc.info['error']['root_cause'][0]['reason']) + except Exception: + err_detail = str(exc) + err_exp = 'The search failed due to a request error: ' + err_detail + except TransportError as exc: + # most general exception + exc_status = getattr(exc, 'status_code') + if exc_status == 'TIMEOUT': + err_exp = 'The search failed due to a timeout. Please try a different query.' + else: + err_exp = 'The search failed due to a transport error: ' + str(exc) + except Exception as exc: + err_exp = str(exc) # XXX: We should revisit if we think this is always safe... -Will 4-23-2020 + if err_exp: + raise HTTPBadRequest(explanation=err_exp) + return es_results + + +def make_search_subreq(request, path, **kwargs): + subreq = make_subrequest(request, path, **kwargs) + if hasattr(request, "_stats"): + subreq._stats = request._stats + subreq.registry = request.registry + if hasattr(request, "context"): + subreq.context = request.context + else: + subreq.context = None + subreq.headers['Accept'] = 'application/json' + return subreq + + +def is_numerical_field(field_schema): + """ Helper method that checks field_schemas type and determines if it is a 'numerical' field. """ + return field_schema.get('type', 'n/a') in ("integer", "float", "number") + + +def is_array_of_numerical_field(field_schema): + """ Helper method that checks if field schema is a numerical array field. """ + items = field_schema.get('items', None) + if items: + return is_numerical_field(items) + return False + + +def is_date_field(field, field_schema): + """ Helper method that determines if field_schema is """ + return determine_if_is_date_field(field, field_schema) + + +def build_sort_dicts(requested_sorts, request, doc_types=[], text_search=None): + ''' + `text_search` not applicable for compound filtersets atm.. afaik... maybe we handle it later. 
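+
+    For example (a sketch, assuming the type schema marks date_created as a date field):
+    requested_sorts=['-date_created'] yields
+        sort == {'embedded.date_created.raw': {'order': 'desc', 'unmapped_type': 'date', 'missing': '_last'}}
+        result_sort == {'date_created': {'order': 'desc', 'unmapped_type': 'date', 'missing': '_last'}}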
+ ''' + + sort = OrderedDict() + result_sort = OrderedDict() + + if len(doc_types) == 1: + type_schema = request.registry[TYPES][doc_types[0]].schema + else: + type_schema = None + + def add_to_sort_dict(requested_sort): + if requested_sort.startswith('-'): + name = requested_sort[1:] + order = 'desc' + else: + name = requested_sort + order = 'asc' + + sort_schema = schema_for_field(name, request, doc_types) + + if sort_schema: + sort_type = sort_schema.get('type') + else: + sort_type = 'string' + + # ES type != schema types + if sort_type == 'integer': + sort['embedded.' + name] = result_sort[name] = { + 'order': order, + 'unmapped_type': 'long', + 'missing': '_last' + } + elif sort_type == 'number': + sort['embedded.' + name] = result_sort[name] = { + 'order': order, + 'unmapped_type': 'float', + 'missing': '_last' + } + elif sort_schema and determine_if_is_date_field(name, sort_schema): + sort['embedded.' + name + '.raw'] = result_sort[name] = { + 'order': order, + 'unmapped_type': 'date', + 'missing': '_last' + } + else: + # fallback case, applies to all string type:string fields + sort['embedded.' + name + '.lower_case_sort'] = result_sort[name] = { + 'order': order, + 'unmapped_type': 'keyword', + 'missing': '_last' + } + + + # Prefer sort order specified in request, if any + if requested_sorts: + for rs in requested_sorts: + add_to_sort_dict(rs) + + # Otherwise we use a default sort only when there's no text search to be ranked + if not sort and (text_search == '*' or not text_search): + # If searching for a single type, look for sort options in its schema + if type_schema: + if 'sort_by' in type_schema: + for k, v in type_schema['sort_by'].items(): + # Should always sort on raw field rather than analyzed field + # OR search on lower_case_sort for case insensitive results + sort['embedded.' 
+ k + '.lower_case_sort'] = result_sort[k] = v + # Default is most recent first, then alphabetical by label + if not sort: + sort['embedded.date_created.raw'] = result_sort['date_created'] = { + 'order': 'desc', + 'unmapped_type': 'keyword', + } + sort['embedded.label.raw'] = result_sort['label'] = { + 'order': 'asc', + 'missing': '_last', + 'unmapped_type': 'keyword', + } + + return (sort, result_sort) diff --git a/snovault/server_defaults.py b/snovault/server_defaults.py new file mode 100644 index 000000000..99be08ce5 --- /dev/null +++ b/snovault/server_defaults.py @@ -0,0 +1,89 @@ +import random +import uuid + +from dcicutils.misc_utils import exported, utc_now_str +from jsonschema_serialize_fork import NO_DEFAULT +from pyramid.path import DottedNameResolver +from pyramid.threadlocal import get_current_request +from snovault.schema_utils import server_default +from .interfaces import COLLECTIONS # , ROOT +from string import digits # , ascii_uppercase +from .project_app import app_project +from .server_defaults_misc import add_last_modified, get_now +from .server_defaults_user import _userid, get_userid, get_user_resource + +exported( + COLLECTIONS, + add_last_modified, + get_now, + get_userid, + get_user_resource +) + + + +ACCESSION_FACTORY = __name__ + ':accession_factory' +ACCESSION_PREFIX = app_project().ACCESSION_PREFIX +ACCESSION_TEST_PREFIX = 'TST' + + +def includeme(config): + accession_factory = config.registry.settings.get('accession_factory') + if accession_factory: + factory = DottedNameResolver().resolve(accession_factory) + else: + factory = enc_accession + config.registry[ACCESSION_FACTORY] = factory + + +# XXX: This stuff is all added based on the serverDefault identifier in the schemas +# removing it altogether will totally break our code + + +@server_default +def userid(instance, subschema): # args required by jsonschema-serialize-fork + return _userid() + + +@server_default +def now(instance, subschema): # args required by jsonschema-serialize-fork + return utc_now_str() + + +@server_default +def uuid4(instance, subschema): + return str(uuid.uuid4()) + + +@server_default +def accession(instance, subschema): + if 'external_accession' in instance: + return NO_DEFAULT + request = get_current_request() + factory = request.registry[ACCESSION_FACTORY] + # With 17 576 000 options + ATTEMPTS = 10 + for attempt in range(ATTEMPTS): + new_accession = factory(subschema['accessionType']) + if new_accession in request.root: + continue + return new_accession + raise AssertionError("Free accession not found in %d attempts" % ATTEMPTS) + + +#FDN_ACCESSION_FORMAT = (digits, digits, digits, ascii_uppercase, ascii_uppercase, ascii_uppercase) +FDN_ACCESSION_FORMAT = ['ABCDEFGHIJKLMNOPQRSTUVWXYZ123456789']*7 + +def enc_accession(accession_type): + random_part = ''.join(random.choice(s) for s in FDN_ACCESSION_FORMAT) + return ACCESSION_PREFIX + accession_type + random_part + + +TEST_ACCESSION_FORMAT = (digits, ) * 7 + + +def test_accession(accession_type): + """ Test accessions are generated on test.encodedcc.org + """ + random_part = ''.join(random.choice(s) for s in TEST_ACCESSION_FORMAT) + return 'TST' + accession_type + random_part diff --git a/snovault/server_defaults_misc.py b/snovault/server_defaults_misc.py new file mode 100644 index 000000000..37b30b049 --- /dev/null +++ b/snovault/server_defaults_misc.py @@ -0,0 +1,41 @@ +from dcicutils.misc_utils import utc_now_str +from jsonschema_serialize_fork import NO_DEFAULT +from .server_defaults_user import get_userid + + +def get_now(): + 
""" Wrapper for the server_default 'now' above so it is not called through SERVER_DEFAULTS in our code """ + return utc_now_str() + + +def add_last_modified(properties, **kwargs): + """ + Uses the above two functions to add the last_modified information to the item + May have no effect + Allow someone to override the request userid (none in this case) by passing in a different uuid + CONSIDER: `last_modified` (and `last_text_edited`) are not really 'server defaults' but rather system-managed fields. + """ + + userid = kwargs.get("userid", None) + field_name_portion = kwargs.get("field_name_portion", "modified") + + last_field_name = "last_" + field_name_portion # => last_modified + by_field_name = field_name_portion + "_by" # => modified_by + date_field_name = "date_" + field_name_portion # => date_modified + + try: + last_modified = { + by_field_name: get_userid(), + date_field_name: get_now(), + } + except AttributeError: # no request in scope ie: we are outside the core application. + if userid: + last_modified = { + by_field_name: userid, + date_field_name: get_now(), + } + properties[last_field_name] = last_modified + else: + # get_userid returns NO_DEFAULT if no userid + if last_modified[by_field_name] != NO_DEFAULT: + properties[last_field_name] = last_modified diff --git a/snovault/server_defaults_user.py b/snovault/server_defaults_user.py new file mode 100644 index 000000000..1a304d8a5 --- /dev/null +++ b/snovault/server_defaults_user.py @@ -0,0 +1,28 @@ +# Factored out of server_defaults to avoid circular dependencies now that get_userid is +# used in resources.Item.is_update_by_admin_user (commonized from fourfront/cgap-portal), +# since server_defaults imports schema_utils which imports resources which wants get_userid. + +from jsonschema_serialize_fork import NO_DEFAULT +from pyramid.threadlocal import get_current_request +from .interfaces import COLLECTIONS + + +def _userid(): + request = get_current_request() + for principal in request.effective_principals: + if principal.startswith('userid.'): + return principal[7:] + return NO_DEFAULT + + +def get_userid(): + """ Wrapper for the server_default 'userid' above so it is not called through SERVER_DEFAULTS in our code """ + return _userid() + + +def get_user_resource(): + request = get_current_request() + userid_found = _userid() + if userid_found == NO_DEFAULT: + return NO_DEFAULT + return request.registry[COLLECTIONS]['user'][userid_found] diff --git a/snovault/sqlalchemy_tools.py b/snovault/sqlalchemy_tools.py new file mode 100644 index 000000000..ff6910d31 --- /dev/null +++ b/snovault/sqlalchemy_tools.py @@ -0,0 +1,59 @@ +import contextlib +import transaction + +from snovault import DBSESSION +from snovault.storage import Base +from sqlalchemy import MetaData +from zope.sqlalchemy import mark_changed + + +# Once debugged, this support probably wants to move to snovault. 
+ +class PyramidAppManager: + + def __init__(self, app): + self.session = app.registry[DBSESSION] + self._meta = None + self._ordered_table_names = None + + def _reflect(self): + if self._meta is None: + meta = MetaData(bind=self.session.connection()) + meta.reflect() + self._meta = meta + + @property + def meta(self): + self._reflect() + return self._meta + + @property + def ordered_table_names(self): + ordered_names = self._ordered_table_names + if ordered_names is None: + self._reflect() + self._ordered_table_names = ordered_names = reversed(Base.metadata.sorted_tables) + return ordered_names + + @contextlib.contextmanager + def connection(self, as_transaction=False): + """ + Context manager executes a body of code with a connection object to the database. + + :param as_transaction: If the action is expected to be read-only, this can be false. + If there will be modifications that need to be committed, specify as_transaction=True. + + """ + connection = self.session.connection().connect() + if not transaction: + yield connection + else: + try: + yield connection + except Exception: + transaction.abort() + else: + # commit all changes to DB + self.session.flush() + mark_changed(self.session()) + transaction.commit() diff --git a/snovault/test_schemas/TestingDownload.json b/snovault/test_schemas/TestingDownload.json index a6a64f0dc..4dc135a4c 100644 --- a/snovault/test_schemas/TestingDownload.json +++ b/snovault/test_schemas/TestingDownload.json @@ -1,5 +1,9 @@ { "type": "object", + "mixinProperties": [ + { "$ref": "mixins.json#/status" }, + { "$ref": "mixins.json#/submitted" } + ], "properties": { "attachment": { "type": "object", diff --git a/snovault/test_schemas/mixins.json b/snovault/test_schemas/mixins.json index 78cd3cf21..6efea1d08 100644 --- a/snovault/test_schemas/mixins.json +++ b/snovault/test_schemas/mixins.json @@ -18,5 +18,38 @@ "deleted" ] } + }, + "submitted": { + "date_created": { + "rdfs:subPropertyOf": "dc:created", + "title": "Date Created", + "lookup": 1000, + "exclude_from": [ + "FFedit-create" + ], + "type": "string", + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + } + ], + "serverDefault": "now", + "permission": "restricted_fields" + }, + "submitted_by": { + "rdfs:subPropertyOf": "dc:creator", + "title": "Submitted By", + "exclude_from": [ + "FFedit-create" + ], + "type": "string", + "linkTo": "User", + "lookup": 1000, + "serverDefault": "userid", + "permission": "restricted_fields" + } } } diff --git a/snovault/tests/authentication.py b/snovault/tests/authentication.py deleted file mode 100644 index 44ecf8c27..000000000 --- a/snovault/tests/authentication.py +++ /dev/null @@ -1,304 +0,0 @@ -import base64 -import os - -from dcicutils.misc_utils import ignored -from operator import itemgetter -from passlib.context import CryptContext -from pyramid.authentication import ( - BasicAuthAuthenticationPolicy as _BasicAuthAuthenticationPolicy, - CallbackAuthenticationPolicy -) -from pyramid.httpexceptions import HTTPForbidden, HTTPFound -from pyramid.path import caller_package, DottedNameResolver -from pyramid.security import ( - # Authenticated, - # Everyone, - forget, - NO_PERMISSION_REQUIRED, - remember, - # principals_allowed_by_permission, -) -from pyramid.settings import asbool # , aslist -from pyramid.view import view_config -from ..interfaces import ROOT, COLLECTIONS -from ..storage import User -from ..calculated import calculate_properties -from ..validation import ValidationFailure -from ..validators import no_validate_item_content_post - 
- -CRYPT_CONTEXT = __name__ + ':crypt_context' - - -def includeme(config): - config.include('.snowflake_hash') - setting_prefix = 'passlib.' - passlib_settings = { - k[len(setting_prefix):]: v - for k, v in config.registry.settings.items() - if k.startswith(setting_prefix) - } - if not passlib_settings: - passlib_settings = {'schemes': 'snowflake_hash, unix_disabled'} - crypt_context = CryptContext(**passlib_settings) - config.registry[CRYPT_CONTEXT] = crypt_context - - # basic login route - config.add_route('login', '/login') - config.add_route('logout', '/logout') - config.add_route('impersonate-user', '/impersonate-user') - config.add_route('session-properties', '/session-properties') - config.scan(__name__) - - -class NamespacedAuthenticationPolicy(object): - """ Wrapper for authentication policy classes - - As userids are included in the list of principals, it seems good practice - to namespace them to avoid clashes. - - Constructor Arguments - - ``namespace`` - - The namespace used (string). - - ``base`` - - The base authentication policy (class or dotted name). - - Remaining arguments are passed to the ``base`` constructor. - - Example - - To make a ``REMOTE_USER`` 'admin' be 'user.admin' - - .. code-block:: python - - policy = NamespacedAuthenticationPolicy('user', - 'pyramid.authentication.RemoteUserAuthenticationPolicy') - """ - - def __new__(cls, namespace, base, *args, **kw): - # Dotted name support makes it easy to configure with pyramid_multiauth - name_resolver = DottedNameResolver(caller_package()) - base = name_resolver.maybe_resolve(base) - # Dynamically create a subclass - name = 'Namespaced_%s_%s' % (namespace, base.__name__) - klass = type(name, (cls, base), {'_namespace_prefix': namespace + '.'}) - return super(NamespacedAuthenticationPolicy, klass).__new__(klass) - - def __init__(self, namespace, base, *args, **kw): - ignored(namespace, base) - super(NamespacedAuthenticationPolicy, self).__init__(*args, **kw) - - def unauthenticated_userid(self, request): - userid = super(NamespacedAuthenticationPolicy, self) \ - .unauthenticated_userid(request) - if userid is not None: - userid = self._namespace_prefix + userid - return userid - - def remember(self, request, principal, **kw): - if not principal.startswith(self._namespace_prefix): - return [] - principal = principal[len(self._namespace_prefix):] - return super(NamespacedAuthenticationPolicy, self) \ - .remember(request, principal, **kw) - - -class BasicAuthAuthenticationPolicy(_BasicAuthAuthenticationPolicy): - def __init__(self, check, *args, **kw): - # Dotted name support makes it easy to configure with pyramid_multiauth - name_resolver = DottedNameResolver(caller_package()) - check = name_resolver.maybe_resolve(check) - super(BasicAuthAuthenticationPolicy, self).__init__(check, *args, **kw) - - -class LoginDenied(HTTPForbidden): - title = 'Login failure' - - -_fake_user = object() - - -class WebUserAuthenticationPolicy(CallbackAuthenticationPolicy): - - login_path = '/login' - method = 'POST' - - def unauthenticated_userid(self, request): - """ - So basically this is used to do a login, instead of the actual - login view... not sure why, but yeah.. 
- """ - # if we aren't posting to login just return None - if request.method != self.method or request.path != self.login_path: - return None - - # otherwise do a login, if we aren't already logged in - cached = getattr(request, '_webuser_authenticated', _fake_user) - if cached is not _fake_user: - return cached - - login = request.json.get("username") - password = request.json.get("password") - if not User.check_password(login, password): - request._webuser_authenticated = None - return None - - request._webuser_authenticated = login - return login - - @classmethod - def remember(cls, request, principal, **kw): - ignored(request, principal, kw) - return [] - - @classmethod - def forget(cls, request): - ignored(request) - return [] - - -@view_config(route_name='login', request_method='POST', - permission=NO_PERMISSION_REQUIRED) -def login(request): - login = request.authenticated_userid - if login is None: - namespace = userid = None - else: - namespace, userid = login.split('.', 1) - - if namespace != 'webuser': - request.session.invalidate() - request.response.headerlist.extend(forget(request)) - raise LoginDenied() - - request.session.invalidate() - request.session.get_csrf_token() - request.response.headerlist.extend(remember(request, 'mailto.' + userid)) - - properties = request.embed('/session-properties', as_user=userid) - if 'auth.userid' in request.session: - properties['auth.userid'] = request.session['auth.userid'] - return properties - - -@view_config(route_name='logout', - permission=NO_PERMISSION_REQUIRED, http_cache=0) -def logout(request): - """View to forget the user""" - request.session.invalidate() - request.session.get_csrf_token() - request.response.headerlist.extend(forget(request)) - if asbool(request.params.get('redirect', True)): - raise HTTPFound(location=request.resource_path(request.root)) - return {} - - -@view_config(route_name='session-properties', request_method='GET', - permission=NO_PERMISSION_REQUIRED) -def session_properties(request): - for principal in request.effective_principals: - if principal.startswith('userid.'): - break - else: - return {} - - namespace, userid = principal.split('.', 1) - user = request.registry[COLLECTIONS]['user'][userid] - user_actions = calculate_properties(user, request, category='user_action') - - properties = { - 'user': request.embed(request.resource_path(user)), - 'user_actions': [v for k, v in sorted(user_actions.items(), key=itemgetter(0))] - } - - if 'auth.userid' in request.session: - properties['auth.userid'] = request.session['auth.userid'] - - return properties - - -def webuser_check(username, password, request): - ignored(request) - # webusers have email address for username, thus, make sure we have an email address - if '@' not in username: - return None - if not User.check_password(username, password): - return None - return [] - - -def basic_auth_check(username, password, request): - # We may get called before the context is found and the root set - root = request.registry[ROOT] - collection = root['access-keys'] - try: - access_key = collection[username] - except KeyError: - return None - - properties = access_key.properties - hash = properties['secret_access_key_hash'] - - crypt_context = request.registry[CRYPT_CONTEXT] - valid = crypt_context.verify(password, hash) - if not valid: - return None - - # valid, new_hash = crypt_context.verify_and_update(password, hash) - # if new_hash: - # replace_user_hash(user, new_hash) - - return [] - - -@view_config(route_name='impersonate-user', request_method='POST', - 
validators=[no_validate_item_content_post], - permission='impersonate') -def impersonate_user(request): - """As an admin, impersonate a different user.""" - userid = request.validated['userid'] - users = request.registry[COLLECTIONS]['user'] - - try: - user = users[userid] - except KeyError: - raise ValidationFailure('body', ['userid'], 'User not found.') - - if user.properties.get('status') != 'current': - raise ValidationFailure('body', ['userid'], 'User is not enabled.') - - request.session.invalidate() - request.session.get_csrf_token() - request.response.headerlist.extend(remember(request, 'mailto.' + userid)) - user_properties = request.embed('/session-properties', as_user=userid) - if 'auth.userid' in request.session: - user_properties['auth.userid'] = request.session['auth.userid'] - - return user_properties - - -def generate_user(): - """ Generate a random user name with 64 bits of entropy - used to generate access_key - remove @ to ensure differentiation from web users, see webuser_check - """ - # Take a random 5 char binary string (80 bits of - # entropy) and encode it as upper cased base32 (8 chars) - random_bytes = os.urandom(5) - user = base64.b32encode(random_bytes).decode('ascii').rstrip('=').upper() - user = user.replace("@", "") - return user - - -def generate_password(): - """ Generate a password with 80 bits of entropy - """ - # Take a random 10 char binary string (80 bits of - # entropy) and encode it as lower cased base32 (16 chars) - random_bytes = os.urandom(10) - password = base64.b32encode(random_bytes).decode('ascii').rstrip('=').lower() - return password diff --git a/snovault/tests/authorization.py b/snovault/tests/authorization.py deleted file mode 100644 index 0795f250b..000000000 --- a/snovault/tests/authorization.py +++ /dev/null @@ -1,63 +0,0 @@ -from ..interfaces import COLLECTIONS - - -def groupfinder(login, request): - if '.' 
not in login: - return None - namespace, localname = login.split('.', 1) - user = None - - collections = request.registry[COLLECTIONS] - - if namespace == 'remoteuser': - if localname in ['EMBED', 'INDEXER']: - return [] - elif localname in ['TEST', 'IMPORT', 'UPGRADE']: - return ['group.admin'] - elif localname in ['TEST_SUBMITTER']: - return ['group.submitter'] - elif localname in ['TEST_AUTHENTICATED']: - return ['viewing_group.SNOWFLAKE'] - - if namespace in ('mailto', 'remoteuser', 'webuser'): - users = collections.by_item_type['user'] - try: - user = users[localname] - except KeyError: - return None - - elif namespace == 'accesskey': - access_keys = collections.by_item_type['access_key'] - try: - access_key = access_keys[localname] - except KeyError: - return None - - if access_key.properties.get('status') in ('deleted', 'disabled'): - return None - - userid = access_key.properties['user'] - user = collections.by_item_type['user'][userid] - - if user is None: - return None - - user_properties = user.properties - - if user_properties.get('status') in ('deleted', 'disabled'): - return None - - principals = ['userid.%s' % user.uuid] - lab = user_properties.get('lab') - if lab: - principals.append('lab.%s' % lab) - submits_for = user_properties.get('submits_for', []) - principals.extend('lab.%s' % lab_uuid for lab_uuid in submits_for) - principals.extend('submits_for.%s' % lab_uuid for lab_uuid in submits_for) - if submits_for: - principals.append('group.submitter') - groups = user_properties.get('groups', []) - principals.extend('group.%s' % group for group in groups) - viewing_groups = user_properties.get('viewing_groups', []) - principals.extend('viewing_group.%s' % group for group in viewing_groups) - return principals diff --git a/snovault/tests/conftest.py b/snovault/tests/conftest.py index 52b611736..c3460af5f 100644 --- a/snovault/tests/conftest.py +++ b/snovault/tests/conftest.py @@ -1,10 +1,7 @@ -# import os -# import time -# import tempfile -import pytest import logging -# import subprocess +import pytest +from ..project_defs import C4ProjectRegistry # noQA from ..elasticsearch.indexer_queue import QueueManager diff --git a/snovault/tests/data/inserts/README.rst b/snovault/tests/data/inserts/README.rst new file mode 100644 index 000000000..e69de29bb diff --git a/snovault/tests/data/master-inserts/README.rst b/snovault/tests/data/master-inserts/README.rst new file mode 100644 index 000000000..e69de29bb diff --git a/snovault/tests/search.py b/snovault/tests/search.py deleted file mode 100644 index 216ad819d..000000000 --- a/snovault/tests/search.py +++ /dev/null @@ -1,1435 +0,0 @@ -import itertools -import math -import re -import structlog -import uuid - -from collections import OrderedDict -from copy import deepcopy -from dcicutils.misc_utils import ignored -from elasticsearch import TransportError, RequestError, ConnectionTimeout -from elasticsearch_dsl import Search -from pyramid.httpexceptions import HTTPBadRequest -from pyramid.view import view_config -from urllib.parse import urlencode -from webob.multidict import MultiDict - -from ..elasticsearch import ELASTIC_SEARCH -from ..elasticsearch.create_mapping import determine_if_is_date_field -from ..elasticsearch.indexer_utils import get_namespaced_index -from ..embed import make_subrequest -from ..interfaces import TYPES, COLLECTIONS -from ..resources import AbstractCollection -from ..typeinfo import AbstractTypeInfo -from ..util import find_collection_subtypes, crawl_schema - - -log = structlog.getLogger(__name__) - - -def 
includeme(config): - config.add_route('search', '/search{slash:/?}') - config.add_route('browse', '/browse{slash:/?}') - config.scan(__name__) - - -sanitize_search_string_re = re.compile(r'[\\\+\-\&\|\!\(\)\{\}\[\]\^\~\:\/\\\*\?]') - - -COMMON_EXCLUDED_URI_PARAMS = [ - 'frame', 'format', 'limit', 'sort', 'from', 'field', - 'mode', 'redirected_from', 'datastore', 'referrer', - 'currentAction' -] - - -@view_config(route_name='search', request_method='GET', permission='search') -def search(context, request, search_type=None, return_generator=False, forced_type='Search', custom_aggregations=None): - """ - Search view connects to ElasticSearch and returns the results - """ - types = request.registry[TYPES] - # list of item types used from the query - doc_types = set_doc_types(request, types, search_type) - # sets request.normalized_params - search_base = normalize_query(request, types, doc_types) - # == INITIALIZE RESULT == - result = { - '@context': request.route_path('jsonld_context'), - '@id': '/' + forced_type.lower() + '/' + search_base, - '@type': [forced_type], - 'title': forced_type, - 'filters': [], - 'facets': [], - '@graph': [], - 'notification': '', - 'sort': {} - } - principals = request.effective_principals - es = request.registry[ELASTIC_SEARCH] - - from_, size = get_pagination(request) - - # get desired frame for this search - search_frame = request.normalized_params.get('frame', 'embedded') - - # == PREPARE SEARCH TERM == - prepared_terms = prepare_search_term(request) - - schemas = [types[item_type].schema for item_type in doc_types] - - # set ES index based on doc_type (one type per index) - # if doc_type is item, search all indexes by setting es_index to None - # If multiple, search all specified - namespaced_star = get_namespaced_index(request, '*') - if 'Item' in doc_types: - es_index = namespaced_star - else: - es_index = find_index_by_doc_types(request, doc_types, ['Item']) - - # establish elasticsearch_dsl class that will perform the search - search = Search(using=es, index=es_index) - - # set up clear_filters path - result['clear_filters'] = clear_filters_setup(request, doc_types, forced_type) - - # == SET TYPE FILTERS == - build_type_filters(result, request, doc_types, types) - - # get the fields that will be used as source for the search - # currently, supports frame=raw/object but live faceting does not work - # this is okay because the only non-embedded access will be programmatic - source_fields = sorted(list_source_fields(request, doc_types, search_frame)) - - # == GET FILTERED QUERY == - # Builds filtered query which supports multiple facet selection - search, string_query = build_query(search, prepared_terms, source_fields) - - # == Set sort order == - search = set_sort_order(request, search, prepared_terms, types, doc_types, result) - # TODO: implement BOOST here? - - # == Set filters == - search, query_filters = set_filters(request, search, result, principals, doc_types) - - # == Set starting facets == - facets = initialize_facets(request, doc_types, prepared_terms, schemas) - - # == Adding facets, plus any optional custom aggregations. == - # Uses 'size' and 'from_' to conditionally skip (no facets if from > 0; no aggs if size > 0). 
- search = set_facets(search, facets, query_filters, string_query, request, doc_types, custom_aggregations, size, from_) - - # == Add preference from session, if available == - search_session_id = None - if request.__parent__ is None and not return_generator and size != 'all': # Probably unnecessary, but skip for non-paged, sub-reqs, etc. - search_session_id = request.cookies.get('searchSessionID', 'SESSION-' + str(uuid.uuid1())) - search = search.params(preference=search_session_id) - - # == Execute the query == - if size == 'all': - es_results = execute_search_for_all_results(search) - else: - size_search = search[from_:from_ + size] - es_results = execute_search(size_search) - - # == Record total number of hits == - result['total'] = total = es_results['hits']['total']['value'] - result['facets'] = format_facets(es_results, facets, total, search_frame) - result['aggregations'] = format_extra_aggregations(es_results) - - # Add batch actions - # TODO: figure out exactly what this does. Provide download URLs? - # Implement later - # result.update(search_result_actions(request, doc_types, es_results)) - - # == Add all link for collections == - if size not in (None, 'all') and size < result['total']: - params = [(k, v) for k, v in request.normalized_params.items() if k != 'limit'] - params.append(('limit', 'all')) - if context: - result['all'] = '%s?%s' % (request.resource_path(context), urlencode(params)) - - # add actions (namely 'add') - result['actions'] = get_collection_actions(request, types[doc_types[0]]) - - if not result['total']: - # http://googlewebmastercentral.blogspot.com/2014/02/faceted-navigation-best-and-5-of-worst.html - request.response.status_code = 404 - result['notification'] = 'No results found' - result['@graph'] = [] - return result if not return_generator else [] - - columns = build_table_columns(request, schemas, doc_types) - if columns: - result['columns'] = columns - - result['notification'] = 'Success' - - # == Format results for JSON-LD == - graph = format_results(request, es_results['hits']['hits'], search_frame) - - if request.__parent__ is not None or return_generator: - if return_generator: - return graph - else: - result['@graph'] = list(graph) - return result - - result['@graph'] = list(graph) - if search_session_id: # Is 'None' if e.g. limit=all - request.response.set_cookie('searchSessionID', search_session_id) # Save session ID for re-requests / subsequent pages. 
- return result - - -@view_config(route_name='browse', request_method='GET', permission='search') -def browse(context, request, search_type='ExperimentSetReplicate', return_generator=False): - """ - Simply use search results for browse view - """ - return search(context, request, search_type, return_generator, forced_type='Browse') - - -@view_config(context=AbstractCollection, permission='list', request_method='GET') -def collection_view(context, request): - """ - Simply use search results for collections views (e.g./biosamples/) - This is a redirect directly to the search page - """ - return search(context, request, context.type_info.name, False, forced_type='Search') - - -def get_collection_actions(request, type_info): - collection = request.registry[COLLECTIONS].get(type_info.name) - if collection and hasattr(collection, 'actions'): - return collection.actions(request) - else: - return None - - -def get_pagination(request): - """ - Fill from_ and size parameters for search if given in the query string - """ - from_ = request.normalized_params.get('from', 0) - size = request.normalized_params.get('limit', 25) - if size in ('all', ''): - size = "all" - else: - try: - size = int(size) - except ValueError: - size = 25 - try: - from_ = int(from_) - except ValueError: - size = 0 - return from_, size - - -def get_all_subsequent_results(initial_search_result, search, extra_requests_needed_count, size_increment): - from_ = 0 - while extra_requests_needed_count > 0: - # print(str(extra_requests_needed_count) + " requests left to get all results.") - from_ = from_ + size_increment - subsequent_search = search[from_:from_ + size_increment] - subsequent_search_result = execute_search(subsequent_search) - extra_requests_needed_count -= 1 - for hit in subsequent_search_result['hits'].get('hits', []): - yield hit - - -def execute_search_for_all_results(search): - size_increment = 100 # Decrease this to like 5 or 10 to test. - - first_search = search[0:size_increment] # get aggregations from here - es_result = execute_search(first_search) - - total_results_expected = es_result['hits'].get('total', 0) - extra_requests_needed_count = int(math.ceil(total_results_expected / size_increment)) - 1 # Decrease by 1 (first es_result already happened) - - if extra_requests_needed_count > 0: - es_result['hits']['hits'] = itertools.chain(es_result['hits']['hits'], get_all_subsequent_results(es_result, search, extra_requests_needed_count, size_increment)) - return es_result - - -def normalize_query(request, types, doc_types): - """ - Normalize the query by calculating and setting request.normalized_params - (a webob MultiDict) that is derived from custom query rules and also - the list of doc_types specified by set_doc_types(). The normalize_param - helper function finds field_schema for each query parameter and enforces - a set of rules (see below). If the query item types differ from doc_types, - override with doc_types - - Args: - request: the current Request - types: registry[TYPES] - doc_types (list): item_types to use for the search - - Returns: - string: query string built from normalized params - """ - def normalize_param(key, val): - """ - Process each key/val in the original query param. As part of this, - obtain the field schema for each parameter. - Current rules: - - for 'type', get name from types (from the registry) - - append '.display_title' to any terminal linkTo query field - """ - # type param is a special case. 
use the name from TypeInfo - if key == 'type' and val in types: - return (key, types[val].name) - - # find schema for field parameter and drill down into arrays/subobjects - field_schema = schema_for_field(key, request, doc_types) - while field_schema and ('items' in field_schema or 'properties' in field_schema): - try: - field_schema = field_schema['items'] - except KeyError: - pass - try: - field_schema = field_schema['properties'] - except KeyError: - pass - if field_schema and 'linkTo' in field_schema: - # add display_title to terminal linkTo query fields - if key.endswith('!'): # handle NOT - return (key[:-1] + '.display_title!', val) - return (key + '.display_title', val) - else: - return (key, val) - - normalized_params = ( - normalize_param(k, v) - for k, v in request.params.items() - ) - # use a MultiDict to emulate request.params - normalized_params = MultiDict(normalized_params) - # overwrite 'type' if not equal to doc_types - if set(normalized_params.getall('type')) != set(doc_types): - if 'type' in normalized_params: - del normalized_params['type'] - for dtype in doc_types: - normalized_params.add('type', dtype) - # add the normalized params to the request - # these will be used in place of request.params for the rest of search - setattr(request, 'normalized_params', normalized_params) - # the query string of the normalized search - qs = '?' + urlencode([ - (k.encode('utf-8'), v.encode('utf-8')) - for k, v in request.normalized_params.items() - ]) - return qs - - -def clear_filters_setup(request, doc_types, forced_type): - """ - Clear Filters URI path - - Make a URI path that clears all non-datatype filters - and leaves in `q` (search query) params, if present. - Also preserves currentAction=selection, if is set. - - Returns: - A URL path - """ - seach_query_specs = request.normalized_params.getall('q') - seach_query_url = urlencode([("q", seach_query) for seach_query in seach_query_specs]) - # types_url will always be present (always >=1 doc_type) - types_url = urlencode([("type", typ) for typ in doc_types]) - current_action = request.normalized_params.get('currentAction') - - clear_qs = types_url or '' - if seach_query_url: - clear_qs += '&' + seach_query_url - if current_action == 'selection': - clear_qs += '¤tAction=selection' - current_search_sort = request.normalized_params.getall('sort') - current_search_sort_url = urlencode([("sort", s) for s in current_search_sort]) - if current_search_sort_url: - clear_qs += '&' + current_search_sort_url - return request.route_path(forced_type.lower(), slash='/') + (('?' + clear_qs) if clear_qs else '') - - -def build_type_filters(result, request, doc_types, types): - """ - Set the type filters for the search. If no doc_types, default to Item - """ - if not doc_types: - doc_types = ['Item'] - else: - for item_type in doc_types: - ti = types[item_type] - qs = urlencode([ - (k.encode('utf-8'), v.encode('utf-8')) - for k, v in request.normalized_params.items() if not (k == 'type' and types.all.get('Item' if v == '*' else v) is ti) - ]) - result['filters'].append({ - 'field': 'type', - 'term': ti.name, - 'remove': '{}?{}'.format(request.path, qs) - }) - - -def prepare_search_term(request): - """ - Prepares search terms by making a dictionary where the keys are fields - and the values are arrays of query strings - Ignore certain keywords, such as type, format, and field - """ - prepared_terms = {} - # prepared_vals = [] - # In case it helps, I think request.normalized_params is a MultiDict. 
-kmp 7-Aug-2022 - for field, val in request.normalized_params.items(): # was .iteritems(), but that went away in Python 3 - if field.startswith('validation_errors') or field.startswith('aggregated_items'): - continue - elif field == 'q': # searched string has field 'q' - # people shouldn't provide multiple queries, but if they do, - # combine them with AND logic - if 'q' in prepared_terms: - join_list = [prepared_terms['q'], val] - prepared_terms['q'] = ' AND '.join(join_list) - else: - prepared_terms['q'] = val - elif field not in COMMON_EXCLUDED_URI_PARAMS + ['type']: - if 'embedded.' + field not in prepared_terms.keys(): - prepared_terms['embedded.' + field] = [] - prepared_terms['embedded.' + field].append(val) - return prepared_terms - - -def set_doc_types(request, types, search_type): - """ - Set the type of documents resulting from the search; order and check for - invalid types as well. If a forced search_type is enforced, use that; - otherwise, set types from the query params. Default to Item if none set. - - Args: - request: the current Request - types: registry[TYPES] - search_type (str): forced search item type - - Returns: - list: the string item types to use for the search - - Raises: - HTTPBadRequest: if an invalid item type is supplied - """ - doc_types = [] - if search_type is None: - doc_types = request.params.getall('type') - if '*' in doc_types: - doc_types = ['Item'] - else: - doc_types = [search_type] - # Normalize to item_type - try: - doc_types = sorted({types[name].name for name in doc_types}) - except KeyError: - # Check for invalid types - bad_types = [t for t in doc_types if t not in types] - msg = "Invalid type: {}".format(', '.join(bad_types)) - raise HTTPBadRequest(explanation=msg) - if len(doc_types) == 0: - doc_types = ['Item'] - return doc_types - - -def get_search_fields(request, doc_types): - """ - Returns set of columns that are being searched and highlights - """ - fields = {'uuid'} - highlights = {} - types = request.registry[TYPES] - for doc_type in doc_types: - type_info = types[doc_type] - for value in type_info.schema.get('boost_values', ()): - fields.add('embedded.' + value) - highlights['embedded.' + value] = {} - return fields, highlights - - -def list_source_fields(request, doc_types, frame): - """ - Returns set of fields that are requested by user or default fields. - These fields are used to further limit the results from the search. - Note that you must provide the full fieldname with embeds, such as: - 'field=biosample.biosource.individual.organism.name' and not just - 'field=name' - """ - fields_requested = request.normalized_params.getall('field') - if fields_requested: - fields = ['embedded.@id', 'embedded.@type'] - for field in fields_requested: - fields.append('embedded.' + field) - elif frame in ['embedded', 'object', 'raw']: - if frame != 'embedded': - # frame=raw corresponds to 'properties' in ES - if frame == 'raw': - frame = 'properties' - # let embedded be searched as well (for faceting) - fields = ['embedded.*', frame + '.*'] - else: - fields = [frame + '.*'] - else: - fields = ['embedded.*'] - return fields - - -def build_query(search, prepared_terms, source_fields): - """ - Prepare the query within the Search object. 
- """ - query_info = {} - string_query = None - # set _source fields for the search - search = search.source(list(source_fields)) - # prepare the query from prepared_terms - for field, value in prepared_terms.items(): - if field == 'q': - query_info['query'] = value - query_info['lenient'] = True - query_info['default_operator'] = 'AND' - query_info['fields'] = ['_all'] - break - if query_info != {}: - string_query = {'must': {'simple_query_string': query_info}} - query_dict = {'query': {'bool': string_query}} - else: - query_dict = {'query': {'bool': {}}} - search.update_from_dict(query_dict) - return search, string_query - - -def set_sort_order(request, search, search_term, types, doc_types, result): - """ - sets sort order for elasticsearch results - example: /search/?type=Biosource&sort=display_title - will sort by display_title in ascending order. To set descending order, - use the "-" flag: sort_by=-date_created. - Sorting is done alphatbetically, case sensitive by default. - TODO: add a schema flag for case sensitivity/insensitivity? - - ES5: simply pass in the sort OrderedDict into search.sort - """ - sort = OrderedDict() - result_sort = OrderedDict() - if len(doc_types) == 1: - type_schema = types[doc_types[0]].schema - else: - type_schema = None - - def add_to_sort_dict(requested_sort): - if requested_sort.startswith('-'): - name = requested_sort[1:] - order = 'desc' - else: - name = requested_sort - order = 'asc' - sort_schema = type_schema.get('properties', {}).get(name) if type_schema else None - if sort_schema: - sort_type = sort_schema.get('type') - else: - sort_type = 'string' - - # ES type != schema types - if sort_type == 'integer': - sort['embedded.' + name] = result_sort[name] = { - 'order': order, - 'unmapped_type': 'long', - 'missing': '_last' - } - elif sort_type == 'number': - sort['embedded.' + name] = result_sort[name] = { - 'order': order, - 'unmapped_type': 'float', - 'missing': '_last' - } - else: - # fallback case, applies to all string type:string fields - sort['embedded.' + name + '.lower_case_sort'] = result_sort[name] = { - 'order': order, - 'unmapped_type': 'keyword', - 'missing': '_last' - } - - # Prefer sort order specified in request, if any - requested_sorts = request.normalized_params.getall('sort') - if requested_sorts: - for rs in requested_sorts: - add_to_sort_dict(rs) - - text_search = search_term.get('q') - - # Otherwise we use a default sort only when there's no text search to be ranked - if not sort and (text_search == '*' or not text_search): - # If searching for a single type, look for sort options in its schema - if type_schema: - if 'sort_by' in type_schema: - for k, v in type_schema['sort_by'].items(): - # Should always sort on raw field rather than analyzed field - # OR search on lower_case_sort for case insensitive results - sort['embedded.' + k + '.lower_case_sort'] = result_sort[k] = v - # Default is most recent first, then alphabetical by label - if not sort: - sort['embedded.date_created.raw'] = result_sort['date_created'] = { - 'order': 'desc', - 'unmapped_type': 'keyword', - } - sort['embedded.label.raw'] = result_sort['label'] = { - 'order': 'asc', - 'missing': '_last', - 'unmapped_type': 'keyword', - } - elif not sort and text_search and text_search != '*': - search = search.sort( # Multi-level sort. 
See http://www.elastic.co/guide/en/elasticsearch/guide/current/_sorting.html#_multilevel_sorting & https://stackoverflow.com/questions/46458803/python-elasticsearch-dsl-sorting-with-multiple-fields - {'_score': {"order": "desc"}}, - {'embedded.date_created.raw': {'order': 'desc', 'unmapped_type': 'keyword'}, - 'embedded.label.raw': {'order': 'asc', 'unmapped_type': 'keyword', 'missing': '_last'}}, - {'_uid': {'order': 'asc'}} # 'embedded.uuid.raw' (instd of _uid) sometimes results in 400 bad request : 'org.elasticsearch.index.query.QueryShardException: No mapping found for [embedded.uuid.raw] in order to sort on' - ) - result['sort'] = result_sort = {'_score': {"order": "desc"}} - return search - - if sort and result_sort: - result['sort'] = result_sort - search = search.sort(sort) - return search - - -def set_filters(request, search, result, principals, doc_types): - """ - Sets filters in the query - """ - - # these next two dictionaries should each have keys equal to query_field - # and values: must_terms: [], must_not_terms: [], add_no_value: True/False/None - field_filters = { - 'principals_allowed.view': { - 'must_terms': principals, - 'must_not_terms': [], - 'add_no_value': None - }, - 'embedded.@type.raw': { - 'must_terms': doc_types, - 'must_not_terms': [], - 'add_no_value': None - }, - 'embedded.status.raw': { - 'must_terms': [], - 'must_not_terms': [], - 'add_no_value': None - } - } - - range_filters = {} - - # Exclude status=deleted Items unless explicitly requested/filtered-in. - if 'deleted' not in request.normalized_params.getall('status'): - field_filters['embedded.status.raw']['must_not_terms'].append('deleted') - if 'replaced' not in request.normalized_params.getall('status'): - field_filters['embedded.status.raw']['must_not_terms'].append('replaced') - - # Exclude type=TrackingItem and type=OntologyTerm from results unless are explictly specified - if 'TrackingItem' not in doc_types: - field_filters['embedded.@type.raw']['must_not_terms'].append('TrackingItem') - if 'OntologyTerm' not in doc_types: - field_filters['embedded.@type.raw']['must_not_terms'].append('OntologyTerm') - - for field, term in request.normalized_params.items(): - not_field = False # keep track if query is NOT (!) 
- exists_field = False # keep track of null values - range_type = False # If we determine is a range request (field.to, field.from), will be populated with string 'date' or 'numerical' - range_direction = None - if field in COMMON_EXCLUDED_URI_PARAMS + ['q']: - continue - elif field == 'type' and term != 'Item': - continue - elif term == 'No value': - exists_field = True - - # Check for date or numerical range filters - if (len(field) > 3 and field[-3:] == '.to') or (len(field) > 5 and field[-5:] == '.from'): - if field[-3:] == '.to': - f_field = field[:-3] - range_direction = "lte" - else: - f_field = field[:-5] - range_direction = "gte" - - # If schema for field is not found (and range_type thus not set), - # then treated as ordinary term filter (likely will get 0 results) - field_schema = schema_for_field(f_field, request, doc_types) - if field_schema: - range_type = 'date' if determine_if_is_date_field(f_field, field_schema) else 'numerical' - - # Add filter to result - qs = urlencode([ - (k.encode('utf-8'), v.encode('utf-8')) - for k, v in request.normalized_params.items() - if (k != field or v != term) - ]) - remove_path = '{}?{}'.format(request.path, qs) - - # default to searching type=Item rather than empty filter path - if remove_path[-1] == '?': - remove_path += 'type=Item' - - result['filters'].append({ - 'field': field, - 'term': term, - 'remove': remove_path - }) - - # handle NOT - if field.endswith('!'): - field = field[:-1] - not_field = True - - # Add filter to query - if range_type and f_field and range_type in ('date', 'numerical'): - query_field = 'embedded.' + f_field - elif field.startswith('validation_errors') or field.startswith('aggregated_items'): - query_field = field + '.raw' - elif field == 'type': - query_field = 'embedded.@type.raw' - else: - query_field = 'embedded.' + field + '.raw' - - if range_type: - if query_field not in range_filters: - range_filters[query_field] = {} - if range_type == 'date': - range_filters[query_field]['format'] = 'yyyy-MM-dd HH:mm' - - if range_direction in ('gt', 'gte', 'lt', 'lte'): - if len(term) == 10: - # Correct term to have hours, e.g. 00:00 or 23:59, if not otherwise supplied. - if range_direction == 'gt' or range_direction == 'lte': - term += ' 23:59' - elif range_direction == 'gte' or range_direction == 'lt': - term += ' 00:00' - - if range_filters[query_field].get(range_direction) is None: - range_filters[query_field][range_direction] = term - else: - # If have a value already (e.g. multiple ranges selected), choose the widening option. 
- if range_direction == 'gt' or range_direction == 'gte': - if term < range_filters[query_field][range_direction]: - range_filters[query_field][range_direction] = term - elif range_direction == 'lt' or range_direction == 'lte': - if term > range_filters[query_field][range_direction]: - range_filters[query_field][range_direction] = term - else: - if query_field not in field_filters: - field_filters[query_field] = { - 'must_terms': [], - 'must_not_terms': [], - 'add_no_value': None - } - - # handle case of filtering for null values - if exists_field: - # the value below is True when we want to include 'No value' as a filter - field_filters[query_field]['add_no_value'] = False if not_field else True - continue - - if not_field: - field_filters[query_field]['must_not_terms'].append(term) - else: - field_filters[query_field]['must_terms'].append(term) - - must_filters = [] - must_not_filters = [] - for query_field, filters in field_filters.items(): - must_terms = {'terms': {query_field: filters['must_terms']}} if filters['must_terms'] else {} - must_not_terms = {'terms': {query_field: filters['must_not_terms']}} if filters['must_not_terms'] else {} - if filters['add_no_value'] is True: - # add to must_not in an OR case, which is equivalent to filtering on 'No value' - should_arr = [must_terms] if must_terms else [] - should_arr.append({'bool': {'must_not': {'exists': {'field': query_field}}}}) - must_filters.append({'bool': {'should': should_arr}}) - elif filters['add_no_value'] is False: - # add to must_not in an OR case, which is equivalent to filtering on '! No value' - should_arr = [must_terms] if must_terms else [] - should_arr.append({'exists': {'field': query_field}}) - must_filters.append({'bool': {'should': should_arr}}) - else: # no filtering on 'No value' - if must_terms: - must_filters.append(must_terms) - if must_not_terms: - must_not_filters.append(must_not_terms) - - # lastly, add range limits to filters if given - for range_field, range_def in range_filters.items(): - must_filters.append({ - 'range': {range_field: range_def} - }) - - # To modify filters of elasticsearch_dsl Search, must call to_dict(), - # modify that, then update from the new dict - prev_search = search.to_dict() - # initialize filter hierarchy - final_filters = {'bool': {'must': must_filters, 'must_not': must_not_filters}} - prev_search['query']['bool']['filter'] = final_filters - search.update_from_dict(prev_search) - - return search, final_filters - - -def initialize_facets(request, doc_types, prepared_terms, schemas): - """ - Initialize the facets used for the search. If searching across multiple - doc_types, only use the default 'Data Type' and 'Status' facets. - Add facets for custom url filters whether or not they're in the schema - - Args: - doc_types (list): Item types (@type) for which we are performing a search for. - prepared_terms (dict): terms to match in ES, keyed by ES field name. - schemas (list): List of OrderedDicts of schemas for doc_types. - - Returns: - list: tuples containing (0) ElasticSearch-formatted field name (e.g. `embedded.status`) and (1) list of terms for it. - """ - - facets = [ - # More facets will be appended to this list from item schema plus from any currently-active filters (as requested in URI params). 
- ('type', {'title': 'Data Type'}) - ] - append_facets = [ - # Facets which will be appended after those which are in & added to `facets` - ('status', {'title': 'Status'}), - - # TODO: Re-enable below line if/when 'range' URI param queries for date & numerical fields are implemented. - # ('date_created', {'title': 'Date Created', 'hide_from_view' : True, 'aggregation_type' : 'date_histogram' }) - ] - validation_error_facets = [ - ('validation_errors.name', {'title': 'Validation Errors', 'order': 999}) - ] - # hold disabled facets from schema; we also want to remove these from the prepared_terms facets - disabled_facets = [] - - # Add facets from schema if one Item type is defined. - # Also, conditionally add extra appendable facets if relevant for type from schema. - if len(doc_types) == 1 and doc_types[0] != 'Item': - current_type_schema = request.registry[TYPES][doc_types[0]].schema - if 'facets' in current_type_schema: - schema_facets = OrderedDict(current_type_schema['facets']) - for schema_facet in schema_facets.items(): - if schema_facet[1].get('disabled', False): - disabled_facets.append(schema_facet[0]) - continue # Skip disabled facets. - facets.append(schema_facet) - - # == Add facets for any non-schema ?field=value filters requested in the search (unless already set) == - used_facets = [facet[0] for facet in facets + append_facets] - used_facet_titles = [facet[1]['title'] - for facet in facets + append_facets - if 'title' in facet[1]] - for field in prepared_terms: - if field.startswith('embedded'): - split_field = field.strip().split('.') # Will become, e.g. ['embedded', 'experiments_in_set', 'files', 'file_size', 'from'] - use_field = '.'.join(split_field[1:]) - - # 'terms' is the default per-term bucket aggregation for all non-schema facets - aggregation_type = 'terms' - - # Use the last part of the split field to get the field title - title_field = split_field[-1] - - # if searching for a display_title, use the title of parent object - # use `is_object_title` to keep track of this - if title_field == 'display_title' and len(split_field) > 1: - title_field = split_field[-2] - is_object_title = True - else: - is_object_title = False - - if title_field in used_facets or title_field in disabled_facets: - # Cancel if already in facets or is disabled - continue - - # If we have a range filter in the URL, - if title_field == 'from' or title_field == 'to': - if len(split_field) == 3: - f_field = split_field[-2] - field_schema = schema_for_field(f_field, request, doc_types) - if field_schema: - title_field = f_field - use_field = '.'.join(split_field[1:-1]) - aggregation_type = 'stats' - - for schema in schemas: - if title_field in schema['properties']: - title_field = schema['properties'][title_field].get('title', title_field) - # see if the title field conflicts for is_object_title facets - if is_object_title and title_field in used_facet_titles: - title_field += ' (Title)' - break - - facet_tuple = (use_field, {'title': title_field, 'aggregation_type': aggregation_type}) - - # At moment is equivalent to `if aggregation_type == 'stats'`` until/unless more agg types are added for _facets_. - if aggregation_type != 'terms': - facet_tuple[1]['hide_from_view'] = True # Temporary until we handle these better on front-end. - # Facet would be otherwise added twice if both `.from` and `.to` are requested. - if facet_tuple in facets: - continue - - facets.append(facet_tuple) - - # Append additional facets (status, validation_errors, ...) 
at the end of - # list unless were already added via schemas, etc. - used_facets = [facet[0] for facet in facets] # Reset this var - for ap_facet in append_facets + validation_error_facets: - if ap_facet[0] not in used_facets: - facets.append(ap_facet) - else: # Update with better title if not already defined from e.g. requested filters. - existing_facet_index = used_facets.index(ap_facet[0]) - if facets[existing_facet_index][1].get('title') in (None, facets[existing_facet_index][0]): - facets[existing_facet_index][1]['title'] = ap_facet[1]['title'] - - return facets - - -def schema_for_field(field, request, doc_types, should_log=False): - """ - Find the schema for the given field (in embedded '.' format). Uses - ff_utils.crawl_schema from snovault and logs any cases where there is an - error finding the field from the schema - - Args: - field (string): embedded field path, separated by '.' - request: current Request object - doc_types (list): @types for the search - should_log (bool): logging will only occur if set to True - - Returns: - Dictionary schema for the field, or None if not found - """ - types = request.registry[TYPES] - schemas = [types[dt].schema - for dt in doc_types] - - # We cannot hash dict by list (of doc_types) so we convert to unique ordered string - doc_type_string = ','.join(doc_types) - - cache = getattr(request, '_field_schema_cache', {}) - if (field, doc_type_string) in cache: - return cache[(field, doc_type_string)] - - field_schema = None - - # for 'validation_errors.*' and 'aggregated_items.*', - # schema will never be found and logging isn't helpful - if (schemas - and not field.startswith('validation_errors.') - and not field.startswith('aggregated_items.')): - # 'type' field is really '@type' in the schema - use_field = '@type' if field == 'type' else field - # eliminate '!' from not fields - use_field = use_field[:-1] if use_field.endswith('!') else use_field - for schema in schemas: - try: - field_schema = crawl_schema(types, use_field, schema) - except Exception as exc: # cannot find schema. Log and Return None - if should_log: - log.warning(f'Cannot find schema in search.py. Type: {doc_types[0]}. Field: {field}', - field=field, error=str(exc)) - else: - if field_schema is not None: - break - - # Cache result, even if not found, for this request. - cache[(field, doc_type_string)] = field_schema - if not hasattr(request, '_field_schema_cache'): - setattr(request, '_field_schema_cache', cache) - - return field_schema - - -def is_linkto_or_object_array_root_field(field, types, doc_types): - """Not used currently. May be useful for if we want to enabled "type" : "nested" mappings on lists of dictionaries""" - schema = types[doc_types[0]].schema - field_root = field.split('.')[0] - fr_schema = (schema and schema.get('properties', {}).get(field_root, None)) or None - if fr_schema and fr_schema['type'] == 'array' and (fr_schema['items'].get('linkTo') is not None or fr_schema['items']['type'] == 'object'): - return True - return False - - -def generate_filters_for_terms_agg_from_search_filters(query_field, search_filters, string_query): - """ - We add a copy of our filters to each facet, minus that of - facet's field itself so that we can get term counts for other terms filters. - And be able to filter w/ it. - - Remove filters from fields they apply to. - For example, the 'biosource_type' aggs should not have any - biosource_type filter in place. 
- Handle 'must' and 'must_not' filters separately - - Returns - Copy of search_filters, minus filter for current query_field (if one set). - """ - - facet_filters = deepcopy(search_filters['bool']) - - for filter_type in ['must', 'must_not']: - if not search_filters['bool'][filter_type]: - continue - for active_filter in search_filters['bool'][filter_type]: # active_filter => e.g. { 'terms' : { 'embedded.@type.raw': ['ExperimentSetReplicate'] } } - if 'bool' in active_filter and 'should' in active_filter['bool']: - # handle No value case - inner_bool = None - inner_should = active_filter.get('bool').get('should', []) - for or_term in inner_should: - # this may be naive, but assume first non-terms - # filter is the No value quqery - if 'terms' in or_term: - continue - else: - inner_bool = or_term - break - if 'exists' in inner_bool: - compare_field = inner_bool['exists'].get('field') - else: - # attempt to get the field from the alternative No value syntax - compare_field = inner_bool.get('bool', {}).get('must_not', {}).get('exists', {}).get('field') - if compare_field == query_field and query_field != 'embedded.@type.raw': - facet_filters[filter_type].remove(active_filter) - - if 'terms' in active_filter: - # there should only be one key here - for compare_field in active_filter['terms'].keys(): - # remove filter for a given field for that facet - # skip this for type facet (field = 'type') - # since we always want to include that filter. - if compare_field == query_field and query_field != 'embedded.@type.raw': - facet_filters[filter_type].remove(active_filter) - - elif 'range' in active_filter: - for compare_field in active_filter['range'].keys(): - # Do same as for terms - if compare_field == query_field: - facet_filters[filter_type].remove(active_filter) - - # add the string_query, if present, to the bool term with facet_filters - if string_query and string_query['must']: - # combine statements within 'must' for each - facet_filters['must'].append(string_query['must']) - - return facet_filters - - -def set_facets(search, facets, search_filters, string_query, request, doc_types, custom_aggregations=None, - size=25, from_=0): - """ - Sets facets in the query as ElasticSearch aggregations, with each aggregation to be - filtered by search_filters minus filter affecting facet field in order to get counts - for other facet term options. - ES5 - simply sets aggs by calling update_from_dict after adding them in - - :param facets: Facet field (0) in object dot notation, and a dict or OrderedDict with title property (1). - :type facets: List of tuples. - :param search_filters: Dict of filters which are set for the ES query in set_filters - :param string_query: Dict holding the query_string used in the search - """ - - if from_ != 0: - return search - - aggs = OrderedDict() - - for field, facet in facets: # E.g. 'type','experimentset_type','experiments_in_set.award.project', ... - field_schema = schema_for_field(field, request, doc_types, should_log=True) - is_date_field = field_schema and determine_if_is_date_field(field, field_schema) - is_numerical_field = field_schema and field_schema['type'] in ("integer", "float", "number") - - if field == 'type': - query_field = 'embedded.@type.raw' - elif field.startswith('validation_errors') or field.startswith('aggregated_items'): - query_field = field + '.raw' - elif facet.get('aggregation_type') in ('stats', 'date_histogram', 'histogram', 'range'): - query_field = 'embedded.' + field - else: - query_field = 'embedded.' 
+ field + '.raw' - - # Create the aggregation itself, extend facet with info to pass down to front-end - agg_name = field.replace('.', '-') - - if facet.get('aggregation_type') == 'stats': - - if is_date_field: - facet['field_type'] = 'date' - elif is_numerical_field: - facet['field_type'] = 'number' - - aggs[facet['aggregation_type'] + ":" + agg_name] = { - 'aggs': { - "primary_agg": { - 'stats': { - 'field': query_field - } - } - }, - 'filter': search_filters - } - - else: # Default -- facetable terms - - facet['aggregation_type'] = 'terms' - facet_filters = generate_filters_for_terms_agg_from_search_filters(query_field, search_filters, string_query) - term_aggregation = { - "terms": { - 'size': 100, # Maximum terms returned (default=10); see https://github.com/10up/ElasticPress/wiki/Working-with-Aggregations - 'field': query_field, - 'missing': facet.get("missing_value_replacement", "No value") - } - } - - aggs[facet['aggregation_type'] + ":" + agg_name] = { - 'aggs': { - "primary_agg": term_aggregation - }, - 'filter': {'bool': facet_filters}, - } - - # Update facet with title, description from field_schema, if missing. - if facet.get('title') is None and field_schema and 'title' in field_schema: - facet['title'] = field_schema['title'] - if facet.get('description') is None and field_schema and 'description' in field_schema: - facet['description'] = field_schema['description'] - - # to achieve OR behavior within facets, search among GLOBAL results, - # not just returned ones. to do this, wrap aggs in ['all_items'] - # and add "global": {} to top level aggs query - # see elasticsearch global aggs for documentation (should be ES5 compliant) - search_as_dict = search.to_dict() - search_as_dict['aggs'] = { - 'all_items': { - 'global': {}, - 'aggs': aggs - } - } - - if size == 0: - # Only perform aggs if size==0 requested, to improve performance for search page queries. - # We do currently have (hidden) monthly date histogram facets which may yet to be utilized for common size!=0 agg use cases. - set_additional_aggregations(search_as_dict, request, doc_types, custom_aggregations) - - search.update_from_dict(search_as_dict) - return search - - -def set_additional_aggregations(search_as_dict, request, doc_types, extra_aggregations=None): - """ - Per-type aggregations may be defined in schemas. Apply them OUTSIDE of globals so they act on our current search filters. - Warning: `search_as_dict` is modified IN PLACE. - """ - - types = request.registry[TYPES] - schema = types[doc_types[0]].schema - - if schema.get('aggregations'): - for schema_agg_name in schema['aggregations'].keys(): - if schema_agg_name == 'all_items': - raise Exception('all_items is a reserved agg name and not allowed as an extra aggregation name.') - search_as_dict['aggs'][schema_agg_name] = schema['aggregations'][schema_agg_name] - - if extra_aggregations: - for extra_agg_name in extra_aggregations.keys(): - if extra_agg_name == 'all_items': - raise Exception('all_items is a reserved agg name and not allowed as an extra aggregation name.') - search_as_dict['aggs'][extra_agg_name] = extra_aggregations[extra_agg_name] - - return search_as_dict - - -def execute_search(search): - """ - Execute the given Elasticsearch-dsl search. Raise HTTPBadRequest for any - exceptions that arise. 
- Args: - search: the Elasticsearch-dsl prepared in the search() function - Returns: - Dictionary search results - """ - err_exp = None - try: - es_results = search.execute().to_dict() - except ConnectionTimeout as exc: - ignored(exc) - err_exp = 'The search failed due to a timeout. Please try a different query.' - except RequestError as exc: - # try to get a specific error message. May fail in some cases - try: - err_detail = str(exc.info['error']['root_cause'][0]['reason']) - except Exception: - err_detail = str(exc) - err_exp = 'The search failed due to a request error: ' + err_detail - except TransportError as exc: - # most general exception - exc_status = getattr(exc, 'status_code') - if exc_status == 'TIMEOUT': - err_exp = 'The search failed due to a timeout. Please try a different query.' - else: - err_exp = 'The search failed due to a transport error: ' + str(exc) - except Exception as exc: - ignored(exc) - err_exp = 'The search failed. The DCIC team has been notified.' - if err_exp: - raise HTTPBadRequest(explanation=err_exp) - return es_results # noQA - PyCharm wrongly worries this might not have been set. - - -def format_facets(es_results, facets, total, search_frame='embedded'): - """ - Format the facets for the final results based on the es results. - Sort based off of the 'order' of the facets - These are stored within 'aggregations' of the result. - - If the frame for the search != embedded, return no facets - """ - ignored(total) - result = [] - if search_frame != 'embedded': - return result - - # Loading facets in to the results - if 'aggregations' not in es_results: - return result - - aggregations = es_results['aggregations']['all_items'] - used_facets = set() - - # Sort facets by order (ascending). - # If no order is provided, assume 0 to - # retain order of non-explicitly ordered facets - for field, facet in sorted(facets, key=lambda fct: fct[1].get('order', 0)): - result_facet = { - 'field': field, - 'title': facet.get('title', field), - 'total': 0 - # To be added depending on facet['aggregation_type']: 'terms', 'min', 'max', 'min_as_string', 'max_as_string', ... - } - - result_facet.update({k: v - for k, v in facet.items() - if k not in result_facet.keys()}) - used_facets.add(field) - field_agg_name = field.replace('.', '-') - full_agg_name = facet['aggregation_type'] + ':' + field_agg_name - - if full_agg_name in aggregations: - if facet['aggregation_type'] == 'stats': - result_facet['total'] = aggregations[full_agg_name]['doc_count'] - # Used for fields on which can do range filter on, to provide min + max bounds - for k in aggregations[full_agg_name]["primary_agg"].keys(): - result_facet[k] = aggregations[full_agg_name]["primary_agg"][k] - else: # 'terms' assumed. - # Default - terms, range, or histogram buckets. 
Buckets may not be present - result_facet['terms'] = aggregations[full_agg_name]["primary_agg"]["buckets"] - # Choosing to show facets with one term for summary info on search it provides - if len(result_facet.get('terms', [])) < 1: - continue - - if len(aggregations[full_agg_name].keys()) > 2: - result_facet['extra_aggs'] = {k: v - for k, v in aggregations[field_agg_name].items() - if k not in ('doc_count', "primary_agg")} - - result.append(result_facet) - - return result - - -def format_extra_aggregations(es_results): - if 'aggregations' not in es_results: - return {} - return {k: v - for k, v in es_results['aggregations'].items() - if k != 'all_items'} - - -def format_results(request, hits, search_frame): - """ - Loads results to pass onto UI - Will retrieve the desired frame from the search hits and automatically - add 'validation_errors' and 'aggregated_items' frames if they are present - """ - fields_requested = request.normalized_params.getall('field') - if fields_requested: - frame = 'embedded' - elif search_frame: - frame = search_frame - else: - frame = 'embedded' - - if frame in ['embedded', 'object', 'raw']: - # transform 'raw' to 'properties', which is what is stored in ES - if frame == 'raw': - frame = 'properties' - for hit in hits: - frame_result = hit['_source'][frame] - if 'validation_errors' in hit['_source'] and 'validation_errors' not in frame_result: - frame_result['validation_errors'] = hit['_source']['validation_errors'] - if 'aggregated_items' in hit['_source'] and 'aggregated_items' not in frame_result: - frame_result['aggregated_items'] = hit['_source']['aggregated_items'] - yield frame_result - return - - -def find_index_by_doc_types(request, doc_types, ignore): - """ - Find the correct index(es) to be search given a list of doc_types. - The types in doc_types are the item class names, formatted like - 'Experiment HiC' and index names are the item types, formatted like - 'experiment_hi_c'. - Ignore any collection names provided in the ignore param, an array. - Formats output indexes as a string usable by elasticsearch - """ - indexes = [] - for doc_type in doc_types: - if doc_type in ignore: - continue - else: - result = find_collection_subtypes(request.registry, doc_type) - namespaced_results = map(lambda t: get_namespaced_index(request, t), result) - indexes.extend(namespaced_results) - # remove any duplicates - indexes = list(set(indexes)) - index_string = ','.join(indexes) - return index_string - - -def make_search_subreq(request, path): - subreq = make_subrequest(request, path) - if hasattr(request, "_stats"): - subreq._stats = request._stats - subreq.registry = request.registry - if hasattr(request, "context"): - subreq.context = request.context - else: - subreq.context = None - subreq.headers['Accept'] = 'application/json' - return subreq - - -DEFAULT_BROWSE_PARAM_LISTS = { - 'type': ["ExperimentSetReplicate"], - 'experimentset_type': ['replicate'], - 'award.project': ['4DN'] -} - - -def get_iterable_search_results(request, search_path='/search/', param_lists=None, **kwargs): - """ - Loops through search results, returns 100 (or search_results_chunk_row_size) results at a time. - Pass it through itertools.chain.from_iterable to get one big iterable of results. - TODO: Maybe make 'limit=all', and instead of calling invoke_subrequest(subrequest), instead call iter_search_results! - - :param request: Only needed to pass to do_subreq to make a subrequest with. - :param search_path: Root path to call, defaults to /search/ (can also use /browse/). 
- :param param_lists: Dictionary of param:lists_of_vals which is converted to URL query. - :param search_results_chunk_row_size: Amount of results to get per chunk. Default should be fine. - """ - if param_lists is None: - param_lists = deepcopy(DEFAULT_BROWSE_PARAM_LISTS) - else: - param_lists = deepcopy(param_lists) - param_lists['limit'] = ['all'] - param_lists['from'] = [0] # TODO: Should be ['0'] ?? - param_lists['sort'] = param_lists.get('sort', 'uuid') # TODO: Default should be ['uuid'] ?? - subreq = make_search_subreq(request, f'{search_path}?{urlencode(param_lists, True)}') - return iter_search_results(None, subreq, **kwargs) - - -# Update? used in ./batch_download.py -def iter_search_results(context, request, **kwargs): - return search(context, request, return_generator=True, **kwargs) - - -def build_table_columns(request, schemas, doc_types): - - any_abstract_types = 'Item' in doc_types - if not any_abstract_types: # Check explictly-defined types to see if any are abstract. - type_infos = [request.registry[TYPES][type] for type in doc_types if type != 'Item'] - for ti in type_infos: - # We use `type` instead of `isinstance` since we don't want to catch subclasses. - if type(ti) == AbstractTypeInfo: - any_abstract_types = True - break - - columns = OrderedDict() - - # Add title column, at beginning always - columns['display_title'] = { - "title": "Title", - "order": -100 - } - - # Add type column if any abstract types in search - if any_abstract_types and request.normalized_params.get('currentAction') != 'selection': - columns['@type'] = { - "title": "Item Type", - "colTitle": "Type", - "order": -80, - "description": "Type or category of Item", - # Alternative below, if we want type column to be available but hidden by default in selection mode: - # "default_hidden": request.normalized_params.get('currentAction') == 'selection' - } - - for schema in schemas: - if 'columns' in schema: - schema_columns = OrderedDict(schema['columns']) - # Add all columns defined in schema - for name, obj in schema_columns.items(): - if name not in columns: - columns[name] = obj - else: - # If @type or display_title etc. column defined in schema, then override defaults. - for prop in schema_columns[name]: - columns[name][prop] = schema_columns[name][prop] - # Add description from field schema, if none otherwise. - if not columns[name].get('description'): - field_schema = schema_for_field(name, request, doc_types) - if field_schema: - if field_schema.get('description') is not None: - columns[name]['description'] = field_schema['description'] - - # Add status column, if not present, at end. - if 'status' not in columns: - columns['status'] = { - "title": "Status", - "default_hidden": True, - "order": 501 - } - # Add date column, if not present, at end. 
- if 'date_created' not in columns: - columns['date_created'] = { - "title": "Date Created", - "colTitle": "Created", - "default_hidden": True, - "order": 510 - } - return columns - - -_ASSEMBLY_MAPPER = { - 'GRCh38-minimal': 'hg38', - 'GRCh38': 'hg38', - 'GRCh37': 'hg19', - 'GRCm38': 'mm10', - 'GRCm37': 'mm9', - 'BDGP6': 'dm4', - 'BDGP5': 'dm3', - 'WBcel235': 'WBcel235' -} - -hgConnect = ''.join([ - 'http://genome.ucsc.edu/cgi-bin/hgTracks', - '?hubClear=', -]) diff --git a/snovault/tests/serverfixtures.py b/snovault/tests/serverfixtures.py index f00456c32..392c710a4 100644 --- a/snovault/tests/serverfixtures.py +++ b/snovault/tests/serverfixtures.py @@ -26,6 +26,7 @@ NO_SERVER_FIXTURES = environ_bool("NO_SERVER_FIXTURES") + def pytest_configure(): logging.basicConfig(format='') logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING) diff --git a/snovault/tests/test_authentication.py b/snovault/tests/test_authentication.py index 7f102115c..8b245411f 100644 --- a/snovault/tests/test_authentication.py +++ b/snovault/tests/test_authentication.py @@ -4,7 +4,7 @@ from pyramid.security import Authenticated, Everyone from pyramid.testing import DummyRequest from zope.interface.verify import verifyObject, verifyClass -from .authentication import NamespacedAuthenticationPolicy +from ..authentication import NamespacedAuthenticationPolicy class TestNamespacedAuthenticationPolicy(unittest.TestCase): @@ -31,7 +31,7 @@ def test_unauthenticated_userid_returns_None(self): self.assertEqual(policy.unauthenticated_userid(request), None) def test_unauthenticated_userid(self): - request = DummyRequest(environ={'REMOTE_USER':'fred'}) + request = DummyRequest(environ={'REMOTE_USER': 'fred'}) policy = self._makeOne() self.assertEqual(policy.unauthenticated_userid(request), 'user.fred') @@ -41,7 +41,7 @@ def test_authenticated_userid_None(self): self.assertEqual(policy.authenticated_userid(request), None) def test_authenticated_userid(self): - request = DummyRequest(environ={'REMOTE_USER':'fred'}) + request = DummyRequest(environ={'REMOTE_USER': 'fred'}) policy = self._makeOne() self.assertEqual(policy.authenticated_userid(request), 'user.fred') @@ -51,7 +51,7 @@ def test_effective_principals_None(self): self.assertEqual(policy.effective_principals(request), [Everyone]) def test_effective_principals(self): - request = DummyRequest(environ={'REMOTE_USER':'fred'}) + request = DummyRequest(environ={'REMOTE_USER': 'fred'}) policy = self._makeOne() self.assertEqual(policy.effective_principals(request), [Everyone, Authenticated, 'user.fred']) @@ -63,7 +63,7 @@ def test_remember(self): self.assertEqual(result, []) def test_forget(self): - request = DummyRequest(environ={'REMOTE_USER':'fred'}) + request = DummyRequest(environ={'REMOTE_USER': 'fred'}) policy = self._makeOne() result = policy.forget(request) self.assertEqual(result, []) diff --git a/snovault/tests/test_clear_db_es_contents.py b/snovault/tests/test_clear_db_es_contents.py new file mode 100644 index 000000000..b6dc47af5 --- /dev/null +++ b/snovault/tests/test_clear_db_es_contents.py @@ -0,0 +1,303 @@ +import contextlib +import pytest + +from dcicutils.lang_utils import disjoined_list +from dcicutils.qa_utils import logged_messages, input_mocked +from unittest import mock +from ..commands import clear_db_es_contents as clear_db_es_contents_module +from ..commands.clear_db_es_contents import ( + clear_db_tables, + run_clear_db_es, + main as clear_db_es_contents_main +) + + +pytestmark = [pytest.mark.setone, pytest.mark.working, pytest.mark.indexing] + 
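+
+# Note on the tests in this module (a descriptive summary, not part of the original patch):
+# test_clear_db_tables exercises the real test app (post an item, clear the tables, verify the
+# item is gone), while the remaining tests mock out the destructive helpers (clear_db_tables,
+# run_create_mapping) or run_clear_db_es itself, so no actual wipe or re-mapping is performed.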
+ +def test_clear_db_tables(app, testapp): + # post an item and make sure it's there + post_res = testapp.post_json('/testing-post-put-patch-sno/', {'required': 'abc'}, + status=201) + testapp.get(post_res.location, status=200) + clear_res = clear_db_tables(app) + assert clear_res is True + # item should no longer be present + testapp.get(post_res.location, status=404) + + +_FOURFRONT_PRODUCTION_ENVS = ['fourfront-production-blue', 'fourfront-production-green', 'data', 'staging'] +# Really we only care about the first of these names, but the rest are names that were at one time +# planned to be stg or prd names for cgap, so we'll use them to tell that run_clear_db_es is properly +# skipping any such names. -kmp 4-Jun-2022 +_CGAP_PRODUCTION_ENVS = ['fourfront-cgap', 'fourfront-cgap-green', 'cgap-green', 'fourfront-cgap-blue', 'cgap-blue'] + +_PRODUCTION_ENVS = [_FOURFRONT_PRODUCTION_ENVS] + [_CGAP_PRODUCTION_ENVS] + +TEST_ENV = 'cgap-devtest' + +OTHER_ENV = 'fourfront-foo' + +DECOY_ENV_1 = TEST_ENV + '-decoy-1' +DECOY_ENV_2 = TEST_ENV + '-decoy-2' + + +@contextlib.contextmanager +def local_env_name_registry_setting_for_testing(app, envname): + old_env = app.registry.settings.get('env.name') + print(f"Remembering old env.name = {old_env}") + try: + app.registry.settings['env.name'] = envname + print(f"Set env.name = {envname}") + yield + finally: + if old_env is None: + print(f"Removing env.name") + del app.registry.settings['env.name'] + else: + print(f"Restoring env.name to {old_env}") + app.registry.settings['env.name'] = old_env + + +@pytest.mark.unit +def test_run_clear_db_es_unit(app, testapp): + + with mock.patch.object(clear_db_es_contents_module, "clear_db_tables") as mock_clear_db_tables: + with mock.patch.object(clear_db_es_contents_module, "run_create_mapping") as mock_run_create_mapping: + + def mocked_is_stg_or_prd_env(env): + result = (env in _PRODUCTION_ENVS # really this should be enough + # for pragmatic redundancy since these will match our real production systems, protect them + or env in _CGAP_PRODUCTION_ENVS + or env in _FOURFRONT_PRODUCTION_ENVS + or env.endswith("blue") or env.endswith("green") or env.endswith("cgap")) + print(f"Mocked is_stg_or_prd_env({env}) returning {result}.") + return result + + with mock.patch.object(clear_db_es_contents_module, "is_stg_or_prd_env") as mock_is_stg_or_prd_env: + mock_is_stg_or_prd_env.side_effect = mocked_is_stg_or_prd_env + + expected_db_clears = 0 + expected_es_clears = 0 + + assert mock_clear_db_tables.call_count == expected_db_clears + assert mock_run_create_mapping.call_count == expected_es_clears + + # It works positionally + assert run_clear_db_es(app, None, True) is True + expected_db_clears += 1 + expected_es_clears += 0 + + assert mock_clear_db_tables.call_count == expected_db_clears + assert mock_run_create_mapping.call_count == expected_es_clears + + # It works by keyword argument + assert run_clear_db_es(app, only_envs=None, skip_es=True) is True + expected_db_clears += 1 + expected_es_clears += 0 + + assert mock_clear_db_tables.call_count == expected_db_clears + assert mock_run_create_mapping.call_count == expected_es_clears + + for production_env in _PRODUCTION_ENVS: + with local_env_name_registry_setting_for_testing(app, production_env): + # should never run on production envs env + assert clear_db_es_contents_module.is_stg_or_prd_env(production_env) is True + with logged_messages(module=clear_db_es_contents_module, error=[ + (f'clear_db_es_contents: This action cannot be performed on env {production_env}' + f' 
because it is a production-class (stg or prd) environment.' + f' Skipping the attempt to clear DB.')]): + assert run_clear_db_es(app, only_envs=None, skip_es=True) is False + expected_db_clears += 0 + expected_es_clears += 0 + assert mock_clear_db_tables.call_count == expected_db_clears + assert mock_run_create_mapping.call_count == expected_es_clears + + with local_env_name_registry_setting_for_testing(app, TEST_ENV): + + allowed_envs = [OTHER_ENV] + + # test if we are only running on specific envs + with logged_messages(module=clear_db_es_contents_module, + error=[(f'clear_db_es_contents: The current environment, {TEST_ENV},' + f' is not {disjoined_list(allowed_envs)}.' + f' Skipping the attempt to clear DB.')]): + assert run_clear_db_es(app, only_envs=allowed_envs, skip_es=True) is False + expected_db_clears += 0 + expected_es_clears += 0 + assert mock_clear_db_tables.call_count == expected_db_clears + assert mock_run_create_mapping.call_count == expected_es_clears + + # test again if we are only running on specific envs + with logged_messages(module=clear_db_es_contents_module, + error=[(f'clear_db_es_contents: The current environment, {TEST_ENV},' + f' is not {disjoined_list(allowed_envs)}.' + f' Skipping the attempt to clear DB.')]): + assert run_clear_db_es(app, only_envs=allowed_envs, skip_es=False) is False + expected_db_clears += 0 + expected_es_clears += 0 + assert mock_clear_db_tables.call_count == expected_db_clears + assert mock_run_create_mapping.call_count == expected_es_clears + + # test if we are only running on specific envs + assert run_clear_db_es(app, only_envs=[TEST_ENV], skip_es=True) is True + expected_db_clears += 1 + expected_es_clears += 0 + assert mock_clear_db_tables.call_count == expected_db_clears + assert mock_run_create_mapping.call_count == expected_es_clears + + # test again if we are only running on specific envs + assert run_clear_db_es(app, only_envs=[TEST_ENV], skip_es=False) is True + expected_db_clears += 1 + expected_es_clears += 1 + assert mock_clear_db_tables.call_count == expected_db_clears + assert mock_run_create_mapping.call_count == expected_es_clears + + allowed_envs = [DECOY_ENV_1, DECOY_ENV_2] + # test if we are only running on specific envs + with logged_messages(module=clear_db_es_contents_module, + error=[(f'clear_db_es_contents: The current environment, {TEST_ENV},' + f' is not {disjoined_list(allowed_envs)}.' + f' Skipping the attempt to clear DB.')]): + assert run_clear_db_es(app, only_envs=allowed_envs, skip_es=False) is False + expected_db_clears += 0 + expected_es_clears += 0 + assert mock_clear_db_tables.call_count == expected_db_clears + assert mock_run_create_mapping.call_count == expected_es_clears + + allowed_envs = [DECOY_ENV_1, TEST_ENV] + # test if we are only running on specific envs + assert run_clear_db_es(app, only_envs=allowed_envs, skip_es=False) is True + expected_db_clears += 1 + expected_es_clears += 1 + assert mock_clear_db_tables.call_count == expected_db_clears + assert mock_run_create_mapping.call_count == expected_es_clears + + +@pytest.mark.unit +def test_clear_db_es_contents_main(): + + # It should never get to these first two in this test, but they're ethere for safety. 
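+    # (Descriptive note: FakeApp below is a minimal stand-in for a Pyramid app; mocked_get_app
+    # hands back one distinct instance per (config_uri, appname) pair so the assert_called_with
+    # checks can compare against the same object.)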
+ with mock.patch.object(clear_db_es_contents_module, "clear_db_tables"): + with mock.patch.object(clear_db_es_contents_module, "run_create_mapping"): + + class FakeApp: + + class Registry: + def __init__(self): + self.settings = {} + + def __init__(self, config_uri, appname): + self.appname = appname + self.config_uri = config_uri + self.registry = self.Registry() + + def __str__(self): + return f"" + + def __repr__(self): + return str(self) + + class MockDBSession: + + def __init__(self, app): + self.app = app + + apps = {} + + def mocked_get_app(config_uri, appname): + key = (config_uri, appname) + app = apps.get(key) + if not app: + apps[key] = app = FakeApp(config_uri, appname) + return app + + def mocked_configure_dbsession(app): + return MockDBSession(app) + + with mock.patch.object(clear_db_es_contents_module, "run_clear_db_es") as mock_run_clear_db_es: + with mock.patch.object(clear_db_es_contents_module, "get_app") as mock_get_app: + mock_get_app.side_effect = mocked_get_app + with mock.patch.object(clear_db_es_contents_module, + "configure_dbsession") as mock_configure_dbsession: + mock_configure_dbsession.side_effect = mocked_configure_dbsession + + config_uri = 'production.ini' + appname = "app" + + + with input_mocked( + # We'll be prompted for the environment name to confirm. + "local", + module=clear_db_es_contents_module): + + clear_db_es_contents_main([config_uri]) + mock_run_clear_db_es.assert_called_with(app=mocked_get_app(config_uri, None), + only_envs=[], + skip_es=False, + allow_prod=False) + + with input_mocked( + # No input prompting will occur because --no-confirm was supplied. + module=clear_db_es_contents_module): + + clear_db_es_contents_main([config_uri, "--no-confirm"]) + mock_run_clear_db_es.assert_called_with(app=mocked_get_app(config_uri, None), + only_envs=[], + skip_es=False, + allow_prod=False) + + with input_mocked( + # We'll be prompted for the environment name to confirm. + "local", + module=clear_db_es_contents_module): + + clear_db_es_contents_main([config_uri, "--app-name", appname]) + mock_run_clear_db_es.assert_called_with(app=mocked_get_app(config_uri, appname), + only_envs=[], + skip_es=False, + allow_prod=False) + + with input_mocked( + # We'll be prompted for the environment name to confirm. + "local", + module=clear_db_es_contents_module): + + clear_db_es_contents_main([config_uri, "--app-name", appname, '--skip-es']) + mock_run_clear_db_es.assert_called_with(app=mocked_get_app(config_uri, appname), + only_envs=[], + skip_es=True, + allow_prod=False) + + with input_mocked( + # No input prompting will occur because --only-if-env was supplied. + module=clear_db_es_contents_module): + + clear_db_es_contents_main([config_uri, "--app-name", appname, "--only-if-env", TEST_ENV]) + mock_run_clear_db_es.assert_called_with(app=mocked_get_app(config_uri, appname), + only_envs=[TEST_ENV], + skip_es=False, + allow_prod=False) + + with input_mocked( + # We'll be prompted for the environment name to confirm. + "local", + module=clear_db_es_contents_module): + + clear_db_es_contents_main([config_uri, "--app-name", appname, "--only-if-env", TEST_ENV, + "--confirm"]) + mock_run_clear_db_es.assert_called_with(app=mocked_get_app(config_uri, appname), + only_envs=[TEST_ENV], + skip_es=False, + allow_prod=False) + + with input_mocked( + # No input prompting will occur because --only-if-env was supplied. 
+ module=clear_db_es_contents_module): + + clear_db_es_contents_main([config_uri, "--app-name", appname, + "--only-if-env", f"{TEST_ENV},{OTHER_ENV}"]) + mock_run_clear_db_es.assert_called_with(app=mocked_get_app(config_uri, appname), + only_envs=[TEST_ENV, OTHER_ENV], + skip_es=False, + allow_prod=False) diff --git a/snovault/tests/test_drs.py b/snovault/tests/test_drs.py new file mode 100644 index 000000000..985f4f94d --- /dev/null +++ b/snovault/tests/test_drs.py @@ -0,0 +1,74 @@ +import pytest +from .test_attachment import testing_download # noQA fixture import +from ..drs import REQUIRED_FIELDS + + +class TestDRSAPI: + """ Class for testing the DRS implementation - uses TestingDownload as it implements + the @@download scheme + """ + BASE_URL = 'http://localhost:80/' + + def test_drs_get_object(self, testapp, testing_download): # noQA fixture + """ Tests basic structure about a drs object """ + res = testapp.get(testing_download) + drs_object_uri = res.json['uuid'] + drs_object_1 = testapp.get(f'/ga4gh/drs/v1/objects/{drs_object_uri}').json + for key in REQUIRED_FIELDS: + assert key in drs_object_1 + assert drs_object_1['self_uri'] == f'drs://localhost:80/ga4gh/drs/v1/objects/{drs_object_uri}' + assert (drs_object_1['access_methods'][0]['access_url']['url'] + == f'{self.BASE_URL}{drs_object_uri}/@@download') + + # failure cases + testapp.get(f'/ga4gh/drs/v1/objects/not_a_uri', status=404) + + # @@drs case + drs_object_2 = testapp.get(f'/{drs_object_uri}/@@drs') + for key in REQUIRED_FIELDS: + assert key in drs_object_2 + + def test_drs_get_object_url(self, testapp, testing_download): # noQA fixture + """ Tests extracting URL through ga4gh pathway """ + res = testapp.get(testing_download) + drs_object_uri = res.json['uuid'] + + # standard URI with meaningful access_id, discarded + drs_object_download = testapp.get(f'/ga4gh/drs/v1/objects/{drs_object_uri}/access/https').json + assert drs_object_download == { + 'url': f'{self.BASE_URL}{drs_object_uri}/@@download' + } + + # /access/ method + drs_object_download = testapp.get(f'/ga4gh/drs/v1/objects/{drs_object_uri}/access/').json + assert drs_object_download == { + 'url': f'{self.BASE_URL}{drs_object_uri}/@@download' + } + + # standard URI with nonsense access id, still discarded + drs_object_download = testapp.get(f'/ga4gh/drs/v1/objects/{drs_object_uri}/access/blah').json + assert drs_object_download == { + 'url': f'{self.BASE_URL}{drs_object_uri}/@@download' + } + + # /access method + drs_object_download = testapp.get(f'/ga4gh/drs/v1/objects/{drs_object_uri}/access').json + assert drs_object_download == { + 'url': f'{self.BASE_URL}{drs_object_uri}/@@download' + } + + def test_drs_get_object_failure(self, testapp, testing_download): # noQA fixture + """ Tests a bunch of bunk URLs """ + res = testapp.get(testing_download) + drs_object_uri = res.json['uuid'] + + with pytest.raises(Exception): + testapp.get(f'/ga4gh/drs/v1/objects/not_a_uri/access/https') + with pytest.raises(Exception): + testapp.get(f'/ga4gh/drs/v1/objects/access/https') + with pytest.raises(Exception): + testapp.get(f'/ga4gh/drs/v1/objects/access/') + with pytest.raises(Exception): + testapp.get(f'/ga4gh/drs/v1/objects/access') + with pytest.raises(Exception): + testapp.get(f'/ga4gh/drs/v1/objects/{drs_object_uri}/accesss/https') diff --git a/snovault/tests/test_snowflake_hash.py b/snovault/tests/test_edw_hash.py similarity index 67% rename from snovault/tests/test_snowflake_hash.py rename to snovault/tests/test_edw_hash.py index d4cdcf0d7..a26356ae0 100644 --- 
a/snovault/tests/test_snowflake_hash.py +++ b/snovault/tests/test_edw_hash.py @@ -1,6 +1,9 @@ import pytest -from .snowflake_hash import SNOWHash +from ..edw_hash import EDWHash + + +pytestmark = [pytest.mark.setone, pytest.mark.working] TEST_HASHES = { @@ -11,5 +14,5 @@ @pytest.mark.parametrize(('password', 'pwhash'), TEST_HASHES.items()) -def test_snowflake_hash(password, pwhash): - assert SNOWHash.hash(password) == pwhash +def test_edw_hash(password, pwhash): + assert EDWHash.hash(password) == pwhash diff --git a/snovault/tests/test_embed_utils.py b/snovault/tests/test_embed_utils.py index 18d4ad4d3..c029b3d88 100644 --- a/snovault/tests/test_embed_utils.py +++ b/snovault/tests/test_embed_utils.py @@ -67,7 +67,8 @@ def test_find_default_embeds_and_expand_emb_list(registry): 'attachment', 'attachment.attachment.*', 'attachment.attachment2.*', - 'attachment.principals_allowed.*' + 'attachment.principals_allowed.*', + 'attachment.submitted_by' ] assert set(embs_to_add) == set(expected_to_add) diff --git a/snovault/tests/test_indexing.py b/snovault/tests/test_indexing.py index 52ba1251e..b4d3433ac 100644 --- a/snovault/tests/test_indexing.py +++ b/snovault/tests/test_indexing.py @@ -54,11 +54,13 @@ notice_pytest_fixtures(TestingLinkSourceSno) -pytestmark = [pytest.mark.indexing] +pytestmark = [pytest.mark.indexing, pytest.mark.es] TEST_COLL = '/testing-post-put-patch-sno/' TEST_TYPE = 'testing_post_put_patch_sno' # use one collection for testing +TEST_TYPE_HIDDEN_FACETS = 'testing_hidden_facets' +TEST_TYPE_BUCKET_RANGE = 'testing_bucket_range_facets' # we just need single shard for these tests # XXX: use new type @@ -94,7 +96,7 @@ def app_settings(basic_app_settings, wsgi_server_host_port, elasticsearch_server raise Exception("Bad value of INDEXER_MODE: %s. Possible values are MPINDEX, INDEX, and BOTH." % INDEXER_MODE) -@pytest.fixture(scope='module', params=INDEXER_APP_PARAMS) # must happen AFTER scope='session' moto setup +@pytest.fixture(scope='session', params=INDEXER_APP_PARAMS) # must happen AFTER scope='session' moto setup def app(app_settings, request): old_mpindexer = app_settings['mpindexer'] with override_dict(app_settings, mpindexer=old_mpindexer): # we plan to set it inside here @@ -113,6 +115,7 @@ def app(app_settings, request): # DBSession.bind.engine.pool.dispose() pass + # XXX C4-312: refactor tests so this can be module scope. # Having to have to drop DB tables and re-run create_mapping for every test is slow. 
@pytest.fixture(scope='function', autouse=True) @@ -121,8 +124,9 @@ def setup_and_teardown(app): Run create mapping and purge queue before tests and clear out the DB tables after the test """ - # BEFORE THE TEST - just run CM for the TEST_TYPE by default - create_mapping.run(app, collections=[TEST_TYPE], skip_indexing=True, purge_queue=True) + # BEFORE THE TEST - just run CM for the TEST_TYPEs above by default + create_mapping.run(app, collections=[TEST_TYPE, TEST_TYPE_HIDDEN_FACETS, TEST_TYPE_BUCKET_RANGE], + skip_indexing=True, purge_queue=True) yield # run the test @@ -210,7 +214,7 @@ def test_indexer_namespacing(app, testapp, indexer_testapp): # app.registry.settings['indexer.namespace'] = indexer_namespace # reset indexer_namespace -@pytest.mark.es +# @pytest.mark.es - Specified at top of file for whole file def test_indexer_queue_adds_telemetry_id(app): indexer_queue = app.registry[INDEXER_QUEUE] indexer_queue.clear_queue() @@ -232,7 +236,7 @@ def test_indexer_queue_adds_telemetry_id(app): indexer_queue.delete_messages(received) -@pytest.mark.es +# @pytest.mark.es - Specified at top of file for whole file @pytest.mark.flaky def test_indexer_queue(app): indexer_queue_mirror = app.registry[INDEXER_QUEUE_MIRROR] @@ -1953,6 +1957,364 @@ def test_invalidation_scope_view_error(self, indexer_testapp, req): indexer_testapp.post_json('/compute_invalidation_scope', req) +@pytest.fixture(scope='session') +def hidden_facet_data_one(): + """ Sample TestingHiddenFacets object we are going to facet on """ + return { + 'first_name': 'John', + 'last_name': 'Doe', + 'sid': 1, + 'status': 'current', + 'unfaceted_string': 'hello', + 'unfaceted_integer': 123, + 'disabled_string': 'orange', + 'disabled_integer': 789, + 'unfaceted_object': { + 'mother': 'Anne', + 'father': 'Bob' + }, + 'unfaceted_array_of_objects': [ + { + 'fruit': 'orange', + 'color': 'orange', + 'uid': 1 + }, + { + 'fruit': 'banana', + 'color': 'yellow', + 'uid': 2 + }, + ] + } + + +@pytest.fixture(scope='session') +def hidden_facet_data_two(): + """ A second sample TestingHiddenFacets object we are going to facet on """ + return { + 'first_name': 'Boston', + 'last_name': 'Bruins', + 'sid': 2, + 'status': 'current', + 'unfaceted_string': 'world', + 'unfaceted_integer': 456, + 'disabled_string': 'apple', + 'disabled_integer': 101112, + 'unfaceted_object': { + 'mother': 'Candice', + 'father': 'Doug' + }, + 'unfaceted_array_of_objects': [ + { + 'fruit': 'blueberry', + 'color': 'blue', + 'uid': 3 + }, + { + 'fruit': 'mango', + 'color': 'yellow', + 'uid': 4 + }, + ] + } + + +@pytest.fixture(scope='function') +def hidden_facet_test_data(testapp, hidden_facet_data_one, hidden_facet_data_two): + testapp.post_json('/TestingHiddenFacets', hidden_facet_data_one, status=201) + testapp.post_json('/TestingHiddenFacets', hidden_facet_data_two, status=201) + index_n_items_for_testing(testapp, 2) + + +class TestSearchHiddenAndAdditionalFacets: + """ Encapsulates tests meant for testing behavior associated with default_hidden, hidden + and additional_facets + """ + DEFAULT_FACETS = ['first_name'] # 'validation_errors.name' + DEFAULT_HIDDEN_FACETS = ['last_name', 'sid'] + ADDITIONAL_FACETS = ['unfaceted_string', 'unfaceted_integer'] + DISABLED_FACETS = ['disabled_string', 'disabled_integer'] + + @staticmethod + def check_and_verify_result(facets, desired_facet, number_expected): + """ Helper method for later tests that checks terms count and average. 
""" + for facet in facets: + field = facet['field'] + if field == desired_facet and 'terms' in facet: + assert len(facet['terms']) == number_expected + elif field == facet and 'avg' in facet: + assert facet['avg'] == number_expected + else: + continue + break + + @staticmethod + def assert_facet_set_equal(expected, facets): + """ Takes list of expect results and raw facet response and checks that they + are identical. """ + assert sorted(expected) == sorted([facet['field'] for facet in facets]) + + def test_search_default_hidden_facets_dont_show(self, testapp, hidden_facet_test_data): + facets = testapp.get('/search/?type=TestingHiddenFacets').json['facets'] + self.assert_facet_set_equal(self.DEFAULT_FACETS, facets) + + @pytest.mark.parametrize('facet', ADDITIONAL_FACETS) + def test_search_one_additional_facet(self, testapp, hidden_facet_test_data, facet): + """ Tests that specifying each of the 'additional' facets works correctly """ + facets = testapp.get('/search/?type=TestingHiddenFacets&additional_facet=%s' % facet).json['facets'] + expected = self.DEFAULT_FACETS + [facet] + self.assert_facet_set_equal(expected, facets) + + def test_search_multiple_additional_facets(self, testapp, hidden_facet_test_data): + """ Tests that enabling multiple additional facets works """ + facets = testapp.get('/search/?type=TestingHiddenFacets' + '&additional_facet=unfaceted_string' + '&additional_facet=unfaceted_integer').json['facets'] + expected = self.DEFAULT_FACETS + self.ADDITIONAL_FACETS + self.assert_facet_set_equal(expected, facets) + for facet in facets: # verify facet type + if facet['field'] == 'unfaceted_integer': + assert facet['aggregation_type'] == 'stats' + else: # facet['field'] == 'unfaceted_string' + assert facet['aggregation_type'] == 'terms' + + @pytest.mark.parametrize('facet', DEFAULT_HIDDEN_FACETS) + def test_search_one_additional_default_hidden_facet(self, testapp, hidden_facet_test_data, facet): + """ Tests that passing default_hidden facets to additional_facets works correctly """ + facets = testapp.get('/search/?type=TestingHiddenFacets&additional_facet=%s' % facet).json['facets'] + expected = self.DEFAULT_FACETS + [facet] + self.assert_facet_set_equal(expected, facets) + + def test_search_multiple_additional_default_hidden_facets(self, testapp, hidden_facet_test_data): + """ Tests that passing multiple hidden_facets as additionals works correctly """ + facets = testapp.get('/search/?type=TestingHiddenFacets' + '&additional_facet=last_name' + '&additional_facet=sid').json['facets'] + expected = self.DEFAULT_FACETS + self.DEFAULT_HIDDEN_FACETS + self.assert_facet_set_equal(expected, facets) + for facet in facets: + if facet['field'] == 'sid': + assert facet['aggregation_type'] == 'stats' + else: + assert facet['aggregation_type'] == 'terms' + + @pytest.mark.parametrize('_facets', [ + ['last_name', 'unfaceted_integer'], # second slot holds number field + ['unfaceted_string', 'sid'] + ]) + def test_search_mixing_additional_and_default_hidden(self, testapp, hidden_facet_test_data, _facets): + """ Tests that we can mix additional_facets with those both on and off schema """ + facets = testapp.get('/search/?type=TestingHiddenFacets' + '&additional_facet=%s' + '&additional_facet=%s' % (_facets[0], _facets[1])).json['facets'] + expected = self.DEFAULT_FACETS + _facets + self.assert_facet_set_equal(expected, facets) + for facet in facets: + if facet['field'] == _facets[1]: # second slot holds number field + assert facet['aggregation_type'] == 'stats' + else: + assert 
facet['aggregation_type'] == 'terms' + + @pytest.mark.parametrize('_facet', DISABLED_FACETS) + def test_search_disabled_overrides_additional(self, testapp, hidden_facet_test_data, _facet): + """ Hidden facets should NEVER be faceted on """ + facets = testapp.get('/search/?type=TestingHiddenFacets&additional_facet=%s' % _facet).json['facets'] + field_names = [facet['field'] for facet in facets] + assert _facet not in field_names # always hidden should not be here, even if specified + + @pytest.mark.parametrize('_facets', [ + ('last_name', 'unfaceted_integer', 'disabled_integer'), # default_hidden second + ('sid', 'unfaceted_string', 'disabled_string') # disabled always last + ]) + def test_search_additional_mixing_disabled_default_hidden(self, testapp, hidden_facet_test_data, _facets): + """ Tests that supplying multiple additional facets combined with hidden still respects the + hidden restriction. """ + facets = testapp.get('/search/?type=TestingHiddenFacets' + '&additional_facet=%s' + '&additional_facet=%s' + '&additional_facet=%s' % (_facets[0], _facets[1], _facets[2])).json['facets'] + expected = self.DEFAULT_FACETS + [_facets[0], _facets[1]] # first two should show + self.assert_facet_set_equal(expected, facets) + + @pytest.mark.parametrize('_facet', [ + 'unfaceted_object.mother', + 'unfaceted_object.father' + ]) + def test_search_additional_object_facets(self, testapp, hidden_facet_test_data, _facet): + """ Tests that specifying an object field as an additional_facet works correctly """ + facets = testapp.get('/search/?type=TestingHiddenFacets' + '&additional_facet=%s' % _facet).json['facets'] + expected = self.DEFAULT_FACETS + [_facet] + self.assert_facet_set_equal(expected, facets) + + @pytest.mark.parametrize('_facet, n_expected', [ + ('unfaceted_array_of_objects.fruit', 4), + ('unfaceted_array_of_objects.color', 3), + ('unfaceted_array_of_objects.uid', 2.5) # stats avg + ]) + def test_search_additional_nested_facets(self, testapp, hidden_facet_test_data, _facet, n_expected): + """ Tests that specifying an array of object field mapped with nested as an additional_facet + works correctly. """ + [desired_facet] = [facet for facet in testapp.get('/search/?type=TestingHiddenFacets' + '&additional_facet=%s' % _facet).json['facets'] + if facet['field'] == _facet] + if 'terms' in desired_facet: + assert len(desired_facet['terms']) == n_expected + else: + assert desired_facet['avg'] == n_expected + + @pytest.fixture + def many_non_nested_facets(self, testapp, hidden_facet_test_data): + return testapp.get('/search/?type=TestingHiddenFacets' + '&additional_facet=non_nested_array_of_objects.fruit' + '&additional_facet=non_nested_array_of_objects.color' + '&additional_facet=non_nested_array_of_objects.uid').json['facets'] + + @pytest.mark.parametrize('_facet, n_expected', [ + ('unfaceted_array_of_objects.fruit', 4), + ('unfaceted_array_of_objects.color', 3), + ('unfaceted_array_of_objects.uid', 2.5) # stats avg + ]) + def test_search_additional_non_nested_facets(self, many_non_nested_facets, _facet, n_expected): + """ Tests trying to facet on an array of objects field that is not nested, requesting + all at the same time. + """ + self.check_and_verify_result(many_non_nested_facets, _facet, n_expected) + + +@pytest.fixture(scope='session') +def bucket_range_data_raw(): + """ 9 objects with a numerical field we will bucket on. + 'special_integer' has i in it. 
+ 'special_object_that_holds_integer' holds a single integer field with i as well + 'array_of_objects_that_holds_integer' holds 2 objects that are mirrors of one another + + + 1 object with a value for no_value_integer, to test that filtering on a field that sets + 'add_no_value' to True will not filter documents with 'No value'. + """ + entries = [{ + 'special_integer': i, + 'special_object_that_holds_integer': { + 'embedded_integer': i + }, + 'array_of_objects_that_holds_integer': [ + { + 'embedded_identifier': 'forward', + 'embedded_integer': 0 if i < 5 else 9 + }, + { + 'embedded_identifier': 'reverse', + 'embedded_integer': 9 if i < 5 else 0 + }, + ] + } for i in range(10)] + # set no value int on the last element + entries[-1]['no_value_integer'] = 8 + entries[-1]['no_value_integer_array'] = [8] + return entries + + +@pytest.fixture(scope='function') +def bucket_range_data(testapp, bucket_range_data_raw): + for entry in bucket_range_data_raw: + testapp.post_json('/TestingBucketRangeFacets', entry, status=201) + index_n_items_for_testing(testapp, 10) + + +class TestSearchBucketRangeFacets: + """ Class that encapsulates tests for BucketRanges """ + + @staticmethod + def verify_facet_counts(facets, expected_fields, expected_cardinality, expected_count): + """ Checks for given expected facets, checking bucket cardinality and document count + Note that the actual range properties are trivial (we are not testing elasticsearch) + """ + for facet in facets: + if facet['field'] in expected_fields: + assert len(facet['ranges']) == expected_cardinality + for bucket in facet['ranges']: + assert bucket['doc_count'] == expected_count + + @staticmethod + def verify_counts(response, expected_count): + assert len(response['@graph']) == expected_count + + @staticmethod + def select_facet(facets, facet_name): + result = None + for facet in facets: + if facet['field'] == facet_name: + result = facet + break + return result + + @pytest.mark.parametrize('expected_fields, expected_counts', [ + (['special_integer'], 5), + (['special_object_that_holds_integer.embedded_integer'], 5), + (['array_of_objects_that_holds_integer.embedded_integer'], 10) + ]) + def test_search_bucket_range_simple(self, testapp, bucket_range_data, expected_fields, expected_counts): + """ Tests searching a collection of documents with varying integer field types that + have the same distribution - all of which should give the same results. """ + res = testapp.get('/search/?type=TestingBucketRangeFacets').json['facets'] + self.verify_facet_counts(res, expected_fields, 2, expected_counts) + + # XXX: The following 2 tests don't function correctly because the facet doesn't utilize reverse_nested + @pytest.mark.parametrize('identifier', [ + 'reverse', 'forward' + ]) + def test_search_bucket_range_nested_qualifier(self, testapp, bucket_range_data, identifier): + """ Tests aggregating on a nested field while selecting for a field within the nested object. """ + res = testapp.get('/search/?type=TestingBucketRangeFacets' + '&array_of_objects_that_holds_integer.embedded_identifier=%s' % identifier).json['facets'] + self.verify_facet_counts(res, ['array_of_objects_that_holds_integer.embedded_integer'], + 2, 10) + + @pytest.mark.parametrize('identifier', [ + 'reverse', 'forward' + ]) + def test_search_bucket_range_nested_qualifier_multiple(self, testapp, bucket_range_data, identifier): + """ Tests aggregating on a nested field while selecting for a field within the nested object (no change). 
""" + res = testapp.get('/search/?type=TestingBucketRangeFacets' + '&array_of_objects_that_holds_integer.embedded_integer.from=6' + '&array_of_objects_that_holds_integer.embedded_identifier=%s' % identifier).json['facets'] + self.verify_facet_counts(res, ['array_of_objects_that_holds_integer.embedded_integer'], + 2, 10) + facet_with_labels = self.select_facet(res, 'array_of_objects_that_holds_integer.embedded_integer') + for r in facet_with_labels['ranges']: + assert 'label' in r + assert r['label'] in ['Low', 'High'] + + def test_search_bucket_range_add_no_value(self, testapp, bucket_range_data): + """ Tests that providing a range filter on a field that specifies 'add_no_value' does not + filter documents that have no value for that field. + """ + res = testapp.get('/search/?type=TestingBucketRangeFacets&no_value_integer.from=0').json # should detect + self.verify_counts(res, 10) + testapp.get('/search/?type=TestingBucketRangeFacets&no_value_integer.from=10', status=404) # should not detect + res = testapp.get('/search/?type=TestingBucketRangeFacets&no_value_integer.to=10').json # should detect + self.verify_counts(res, 10) + res = testapp.get('/search/?type=TestingBucketRangeFacets&no_value_integer.from=0' + '&no_value_integer.to=10').json # should detect + self.verify_counts(res, 10) + res = testapp.get('/search/?type=TestingBucketRangeFacets' + '&no_value_integer_array.from=0').json # should detect + self.verify_counts(res, 10) + res = testapp.get('/search/?type=TestingBucketRangeFacets' + '&no_value_integer_array.from=8').json # should detect + self.verify_counts(res, 1) + res = testapp.get('/search/?type=TestingBucketRangeFacets' + '&no_value_integer_array.from=0&no_value_integer_array.to=7').json # should detect + self.verify_counts(res, 9) + res = testapp.get('/search/?type=TestingBucketRangeFacets' + '&no_value_integer_array.from=-1&no_value_integer_array.to=7').json # should detect + self.verify_counts(res, 9) + res = testapp.get('/search/?type=TestingBucketRangeFacets' + '&no_value_integer_array.from=-1&no_value_integer_array.to=9').json # should detect + self.verify_counts(res, 10) + + + def test_assert_transactions_table_is_gone(app): """ A bit of a strange location for this test, but we need the app and diff --git a/snovault/tests/test_ingestion_message_handler_decorator.py b/snovault/tests/test_ingestion_message_handler_decorator.py new file mode 100644 index 000000000..e30e985e3 --- /dev/null +++ b/snovault/tests/test_ingestion_message_handler_decorator.py @@ -0,0 +1,236 @@ +import pytest +import re +from snovault.ingestion.ingestion_message import IngestionMessage +from snovault.ingestion.ingestion_listener_base import IngestionListenerBase +from snovault.ingestion.ingestion_message_handler_decorator import ( + call_ingestion_message_handler, + ingestion_message_handler, + clear_ingestion_message_handlers_for_testing, +) + + +class IngestionListener(IngestionListenerBase): + pass # dummy + + +SOME_UUID = "some-uuid-xyzzy" +INGESTION_LISTENER = IngestionListener() +INGESTION_TYPE_VCF = "vcf" +INGESTION_TYPE_NOVCF = "novcf" +INGESTION_TYPE_OTHER = "other" + + +def isolate_ingestion_message_handler_decorator_test(f): + def wrapper(): + clear_ingestion_message_handlers_for_testing() + f() + clear_ingestion_message_handlers_for_testing() + return wrapper + + +def create_raw_message(ingestion_type: str) -> dict: + return {"Body": f"{{\"uuid\":\"{SOME_UUID}\", \"ingestion_type\":\"{ingestion_type}\"}}"} + + +@isolate_ingestion_message_handler_decorator_test +def 
test_error_decorator_arguments(): + + with pytest.raises(Exception): + @ingestion_message_handler(123) # wrong decorator arg type + def bad_a(message, listener): + pass + + with pytest.raises(Exception): + @ingestion_message_handler("vcf", 123) # too many decorator args + def bad_b(message, listener): + pass + + with pytest.raises(Exception): + @ingestion_message_handler(xyzzy="vcf") # unknown named decorator kwarg + def bad_c(message, listener): + pass + + with pytest.raises(Exception): + @ingestion_message_handler("vcf", ingestion_type="vcf") # too many decorator args + def bad_d(message, listener): + pass + + +@isolate_ingestion_message_handler_decorator_test +def test_error_decorated_function_signature(): + + with pytest.raises(Exception): + @ingestion_message_handler + def bad_a(): # not enough args + pass + + with pytest.raises(Exception): + @ingestion_message_handler + def bad_b(message): # not enough args + pass + + with pytest.raises(Exception): + @ingestion_message_handler + def bad_c(message, listener, extraneous_arg): # too many args + pass + + with pytest.raises(Exception): + @ingestion_message_handler + def bad_d(message: IngestionMessage, listener: str): # wrong type arg + pass + + with pytest.raises(Exception): + @ingestion_message_handler + def bad_e(message: str, listener): # wrong type arg + pass + + with pytest.raises(Exception): + @ingestion_message_handler + def bad_f(message: str, listener: str): # wrong type args + pass + + +@isolate_ingestion_message_handler_decorator_test +def test_error_duplicate_default_handlers_one(): + + with pytest.raises(Exception): + + @ingestion_message_handler + def a(message, listener): + pass + + @ingestion_message_handler # same as above (i.e. default) + def duplicate_a(message, listener): + pass + + +@isolate_ingestion_message_handler_decorator_test +def test_error_duplicate_default_handlers_two(): + + with pytest.raises(Exception): + + @ingestion_message_handler(ingestion_type="default") + def a(message, listener): + pass + + @ingestion_message_handler # same as above (i.e. default) + def duplicate_a(message, listener): + pass + + +@isolate_ingestion_message_handler_decorator_test +def test_error_duplicate_typed_handlers(): + + with pytest.raises(Exception): + + @ingestion_message_handler("some-message-type") + def a(message, listener): + pass + + @ingestion_message_handler("some-message-type") # same as above + def duplicate_a(message, listener): + pass + + +@isolate_ingestion_message_handler_decorator_test +def test_error_undefined_handler(): + + with pytest.raises(Exception): + + @ingestion_message_handler("some-message-type") + def a(message, listener): + pass + + @ingestion_message_handler("some-other-message-type") + def duplicate_a(message, listener): + pass + + ingestion_message = create_raw_message(ingestion_type="some-third-message-type") + # This should throw exception because no relevant handler found. 
+ call_ingestion_message_handler(ingestion_message, INGESTION_LISTENER) + + exception_message = (f".*No.*ingestion.*message.*handler.*defined.*" + f" ->.*{{'uuid': 'some-uuid-xyzzy', 'ingestion_type': 'some-third-message-type'}}.*") + with pytest.raises(Exception, match=re.compile(exception_message)): + call_ingestion_message_handler(ingestion_message, INGESTION_LISTENER) + + +@isolate_ingestion_message_handler_decorator_test +def test_error_invalid_call_arguments(): + + @ingestion_message_handler("some-message-type") + def a(message, listener): + pass + + ingestion_message = create_raw_message(ingestion_type="some-message-type") + with pytest.raises(Exception): + call_ingestion_message_handler(ingestion_message, "wrong-type-should-be-IngestionListenerBase") + + ingestion_message = create_raw_message(ingestion_type="some-message-type") + with pytest.raises(Exception): + call_ingestion_message_handler(ingestion_message, INGESTION_LISTENER, "extra-arg") + + ingestion_message = create_raw_message(ingestion_type="some-message-type") + with pytest.raises(Exception): + call_ingestion_message_handler(ingestion_message) # missing IngestionListenerBase arg + + ingestion_message = create_raw_message(ingestion_type="some-message-type") + with pytest.raises(Exception): + call_ingestion_message_handler() # missing args + + with pytest.raises(Exception): + a(ingestion_message, INGESTION_LISTENER) # wrong first arg type (raw dict rather than IngestionMessage) + + +@isolate_ingestion_message_handler_decorator_test +def test_one(): + + handler_calls = None + + @ingestion_message_handler + def a(message: IngestionMessage, listener: IngestionListener): + this_function_name = "a" + result = f"{this_function_name}/{message.type}" + handler_calls.add(result) + assert not message.is_type(INGESTION_TYPE_VCF) and not message.is_type(INGESTION_TYPE_NOVCF) + assert message.uuid == SOME_UUID + assert listener is INGESTION_LISTENER + return result + + @ingestion_message_handler(ingestion_type=INGESTION_TYPE_VCF) + def b(message: IngestionMessage, listener: IngestionListener) -> str: + this_function_name = "b" + result = f"{this_function_name}/{message.type}" + handler_calls.add(result) + assert message.is_type(INGESTION_TYPE_VCF) + assert message.uuid == SOME_UUID + assert listener is INGESTION_LISTENER + return result + + @ingestion_message_handler(INGESTION_TYPE_NOVCF) + def c(message, listener): + this_function_name = "c" + result = f"{this_function_name}/{message.type}" + handler_calls.add(result) + assert message.is_type(INGESTION_TYPE_NOVCF) + assert message.uuid == SOME_UUID + assert listener is INGESTION_LISTENER + return result + + handler_calls = set() + ingestion_message = create_raw_message(ingestion_type=INGESTION_TYPE_OTHER) + handler_result = call_ingestion_message_handler(ingestion_message, INGESTION_LISTENER) + assert handler_result == f"a/{IngestionMessage(ingestion_message).type}" + assert handler_calls == {handler_result} + + handler_calls = set() + ingestion_message = create_raw_message(ingestion_type=INGESTION_TYPE_VCF) + handler_result = call_ingestion_message_handler(ingestion_message, INGESTION_LISTENER) + assert handler_result == f"b/{IngestionMessage(ingestion_message).type}" + assert handler_calls == {handler_result} + + handler_calls = set() + ingestion_message = create_raw_message(ingestion_type=INGESTION_TYPE_NOVCF) + handler_result = call_ingestion_message_handler(ingestion_message, INGESTION_LISTENER) + assert handler_result == f"c/{IngestionMessage(ingestion_message).type}" + assert 
handler_calls == {handler_result} diff --git a/snovault/tests/test_ingestion_processor.py b/snovault/tests/test_ingestion_processor.py new file mode 100644 index 000000000..8606e7828 --- /dev/null +++ b/snovault/tests/test_ingestion_processor.py @@ -0,0 +1,10 @@ +import pytest +from snovault.ingestion.ingestion_processor_decorator import ingestion_processor +from snovault.types.ingestion import SubmissionFolio + + +def test_error_ingestion_processor(): + with pytest.raises(Exception): + @ingestion_processor('some_unknown_ingestion_type') + def some_processor_for_unknown_ingestion_type(submission: SubmissionFolio): + pass diff --git a/snovault/tests/test_key.py b/snovault/tests/test_key.py index f344121e2..e9aefcf33 100644 --- a/snovault/tests/test_key.py +++ b/snovault/tests/test_key.py @@ -1,8 +1,6 @@ import pytest from dcicutils.qa_utils import notice_pytest_fixtures -from pyramid.config import Configurator -from ..interfaces import DBSESSION # Test for storage.keys @@ -18,16 +16,6 @@ ] -@pytest.fixture(scope='session') -def app(DBSession): - notice_pytest_fixtures(DBSession) - config = Configurator() - config.registry[DBSESSION] = DBSession - config.include('snovault') - config.include('.testing_key') - return config.make_wsgi_app() - - @pytest.fixture def content(testapp): notice_pytest_fixtures(testapp) diff --git a/snovault/tests/test_renderers.py b/snovault/tests/test_renderers.py new file mode 100644 index 000000000..7c68a4cf1 --- /dev/null +++ b/snovault/tests/test_renderers.py @@ -0,0 +1,197 @@ +import pytest +import urllib.parse + +from dcicutils.lang_utils import n_of +from dcicutils.misc_utils import filtered_warnings +from dcicutils.qa_utils import MockResponse +from pyramid.testing import DummyRequest +from unittest import mock +from snovault import renderers +from snovault.renderers import ( + best_mime_type, should_transform, MIME_TYPES_SUPPORTED, MIME_TYPE_DEFAULT, + MIME_TYPE_JSON, MIME_TYPE_HTML, MIME_TYPE_LD_JSON, MIME_TYPE_TRIAGE_MODE, +) + + +pytestmark = [pytest.mark.setone, pytest.mark.working] + + +class DummyResponse(MockResponse): + + def __init__(self, content_type=None, status_code: int = 200, json=None, content=None, url=None, + params=None): + self.params = {} if params is None else params + self.content_type = content_type + super().__init__(status_code=status_code, json=json, content=content, url=url) + + +def test_mime_variables(): + + # Really these don't need testing but it's useful visually to remind us of their values here. + assert MIME_TYPE_HTML == 'text/html' + assert MIME_TYPE_JSON == 'application/json' + assert MIME_TYPE_LD_JSON == 'application/ld+json' + + # The MIME_TYPES_SUPPORTED is a list whose first element has elevated importance as we've structured things. + # First check that it is a list, and that its contents contain the things we support. That isn't controversial. + assert isinstance(MIME_TYPES_SUPPORTED, list) + assert set(MIME_TYPES_SUPPORTED) == {MIME_TYPE_JSON, MIME_TYPE_HTML, MIME_TYPE_LD_JSON} + # Check that the first element is consistent with the MIME_TYPE_DEFAULT. + # It's an accident of history that this next relationship matters, but at this point check for consistency. + assert MIME_TYPE_DEFAULT == MIME_TYPES_SUPPORTED[0] + # Now we concern ourselves with the actual values... + # TODO: I think it's a bug that JSON is at the head of this list (and so the default) in cgap-portal. + # cgap-portal needs to be made to match what Fourfront does to dig it out of a bug I introduced. 
+ # -kmp 29-Jan-2022 + assert MIME_TYPES_SUPPORTED == [MIME_TYPE_JSON, MIME_TYPE_HTML, MIME_TYPE_LD_JSON] + assert MIME_TYPE_DEFAULT == MIME_TYPE_JSON + + # Regardless of whether we're using legacy mode or modern mode, we should get the same result. + assert MIME_TYPE_TRIAGE_MODE in ['legacy', 'modern'] + + +VARIOUS_MIME_TYPES_TO_TEST = ['*/*', 'text/html', 'application/json', 'application/ld+json', 'text/xml', 'who/cares'] + + +def test_best_mime_type(): + + the_constant_answer = MIME_TYPE_DEFAULT + + with filtered_warnings("ignore", category=DeprecationWarning): + # Suppresses this warning: + # DeprecationWarning: The behavior of .best_match for the Accept classes is currently being maintained + # for backward compatibility, but the method will be deprecated in the future, as its behavior is not + # specified in (and currently does not conform to) RFC 7231. + + for requested_mime_type in VARIOUS_MIME_TYPES_TO_TEST: + req = DummyRequest(headers={'Accept': requested_mime_type}) + assert best_mime_type(req, 'legacy') == the_constant_answer + assert best_mime_type(req, 'modern') == the_constant_answer + req = DummyRequest(headers={}) # The Accept header in the request just isn't being consulted + assert best_mime_type(req, 'modern') == the_constant_answer + assert best_mime_type(req, 'modern') == the_constant_answer + + +TYPICAL_URLS = [ + 'http://whatever/foo', + 'http://whatever/foo/', + 'http://whatever/foo.json', + 'http://whatever/foo.html', +] + +ALLOWED_FRAMES_OR_NONE = ['raw', 'page', 'embedded', 'object', 'bad', None] + +SOME_HTTP_METHODS = ['GET', 'POST', 'PUT', 'PATCH', 'DELETE', 'HEAD'] + +ALLOWED_FORMATS_OR_NONE = ['json', 'html', None] + + +def test_should_transform(): + + passed = [] + failed = [] + problem_area = set() + + with filtered_warnings("ignore", category=DeprecationWarning): + # Suppresses this warning: + # DeprecationWarning: The behavior of .best_match for the Accept classes is currently being maintained + # for backward compatibility, but the method will be deprecated in the future, as its behavior is not + # specified in (and currently does not conform to) RFC 7231. + + for method in SOME_HTTP_METHODS: + for _format in ALLOWED_FORMATS_OR_NONE: + for requested_mime_type in VARIOUS_MIME_TYPES_TO_TEST: + for response_content_type in VARIOUS_MIME_TYPES_TO_TEST: + for frame in [None] + ALLOWED_FRAMES_OR_NONE: + for url in TYPICAL_URLS: + + params = {} + if frame is not None: + params['frame'] = frame + if _format is not None: + params['format'] = _format + + req = DummyRequest(headers={'Accept': requested_mime_type}, + method=method, + url=url, + params=params) + resp = DummyResponse(content_type=response_content_type, url=url) + + _should_transform = should_transform(req, resp) + + situation = { + 'method': method, + "format": _format, + "encoded_params": urllib.parse.urlencode(params), + "requested": requested_mime_type, + "response_content_type": response_content_type, + "frame": frame, + "url": url, + "params": params, + } + + if req.method not in ('GET', 'HEAD'): + rule_applied = "method not GET or HEAD" + correct = not _should_transform + elif resp.content_type != 'application/json': + # If the response MIME type is not application/json, + # it just can't be transformed at all. 
+ rule_applied = "content_type is not application/json" + correct = not _should_transform + elif params.get("frame", "page") != 'page': + rule_applied = "?frame=xxx is not page" + correct = not _should_transform + elif _format is not None: + rule_applied = "?format=xxx given but not html" + correct = _should_transform is (_format == 'html') + else: + # TODO: kmp thinks this behavior is a bug. It should default to HTML. -kmp 23-Mar-2021 + correct = _should_transform is False # If no cue is given, default to JSON + + + if correct: + # There are a lot of cases, so we don't print stuff here by default, but + # uncomment to see what cases are passing as they pass: + # print(situation) + # print("=should_transform?=>", _should_transform, "(correct)") + passed.append(situation) + + else: + # There are a lot of cases, so we don't print stuff here by default, but + # uncomment to see what cases are failing as they fail: + # print(situation) + # print("=should_transform?=>", _should_transform, "(WRONG)") + failed.append(situation) + problem_area.add(rule_applied) + + if failed: + # Collect all failures in one place: + print("FAILED:") + for failure in failed: + print(" method=%(method)s format=%(format)s requested=%(requested)s" + " response_content_type=%(response_content_type)s frame=%(frame)s" + " url=%(url)s params=%(encoded_params)s" + % failure) + + n_failed = len(failed) + n_passed = len(passed) + assert not n_failed, ( + "%s passed, %s FAILED (%s: %s)" + % (n_passed, n_failed, n_of(problem_area, "problem area"), ", ".join(problem_area)) + ) + print("\n", n_passed, "combinations tried. ALL PASSED") + + +def test_should_transform_without_best_mime_type(): + + # As we call things now, we really don't need the best_mime_type function because it just returns the + # first element of its first argument. That probably should change. Because it should be a function + # of the request and its Accept offerings. Even so, we test for this now not because this makes programs + # right, but so we notice if/when this truth changes. -kmp 23-Mar-2021 + + with mock.patch.object(renderers, "best_mime_type") as mock_best_mime_type: + + # Demonstrate that best_mime_type(...) could be replaced by MIME_TYPES_SUPPORTED[0] + mock_best_mime_type.return_value = MIME_TYPES_SUPPORTED[0] + + test_should_transform() diff --git a/snovault/tests/test_views.py b/snovault/tests/test_views.py index 6da71c13b..7917ca024 100644 --- a/snovault/tests/test_views.py +++ b/snovault/tests/test_views.py @@ -433,3 +433,19 @@ def test_auth0_config_admin(testapp, registry): def test_auth0_config_anon(anontestapp, registry): """ Tests that acquiring auth0 config gives the expected values from settings for anonymous users. """ _test_auth_config(anontestapp, registry) + + +def _test_recaptcha_config(testapp, registry): + cfg = testapp.get('/recaptcha_config').json + assert cfg['title'] == 'Recaptcha Config' + assert cfg['RecaptchaKey'] == registry.settings['g.recaptcha.key'] + + +def test_recaptcha_config_admin(testapp, registry): + """ Tests that acquiring recaptcha config gives the expected values from settings for admins. """ + _test_recaptcha_config(testapp, registry) + + +def test_recaptcha_config_anon(anontestapp, registry): + """ Tests that acquiring recaptcha config gives the expected values from settings for anonymous users. 
""" + _test_recaptcha_config(anontestapp, registry) diff --git a/snovault/tests/testappfixtures.py b/snovault/tests/testappfixtures.py index c88746b23..53cc1f87d 100644 --- a/snovault/tests/testappfixtures.py +++ b/snovault/tests/testappfixtures.py @@ -16,6 +16,7 @@ 'retry.attempts': 3, 'production': True, 'structlog.dir': '/tmp/', + 'g.recaptcha.key': 'dummy-recaptcha', 'auth0.client': 'dummy-client', 'auth0.domain': 'dummy.domain', 'auth0.options': { @@ -32,21 +33,21 @@ 'github', 'google-oauth2', 'partners' ] }, - 'multiauth.policies': 'session remoteuser accesskey webuser', - 'multiauth.groupfinder': 'snovault.tests.authorization.groupfinder', - 'multiauth.policy.session.use': 'snovault.tests.authentication.NamespacedAuthenticationPolicy', + 'multiauth.policies': 'session remoteuser accesskey auth0', + 'multiauth.groupfinder': 'snovault.authorization.groupfinder', + 'multiauth.policy.session.use': 'snovault.authentication.NamespacedAuthenticationPolicy', 'multiauth.policy.session.base': 'pyramid.authentication.SessionAuthenticationPolicy', 'multiauth.policy.session.namespace': 'mailto', - 'multiauth.policy.remoteuser.use': 'snovault.tests.authentication.NamespacedAuthenticationPolicy', + 'multiauth.policy.remoteuser.use': 'snovault.authentication.NamespacedAuthenticationPolicy', 'multiauth.policy.remoteuser.namespace': 'remoteuser', 'multiauth.policy.remoteuser.base': 'pyramid.authentication.RemoteUserAuthenticationPolicy', - 'multiauth.policy.accesskey.use': 'snovault.tests.authentication.NamespacedAuthenticationPolicy', + 'multiauth.policy.accesskey.use': 'snovault.authentication.NamespacedAuthenticationPolicy', 'multiauth.policy.accesskey.namespace': 'accesskey', - 'multiauth.policy.accesskey.base': 'snovault.tests.authentication.BasicAuthAuthenticationPolicy', - 'multiauth.policy.accesskey.check': 'snovault.tests.authentication.basic_auth_check', - 'multiauth.policy.webuser.use': 'snovault.tests.authentication.NamespacedAuthenticationPolicy', - 'multiauth.policy.webuser.namespace': 'webuser', - 'multiauth.policy.webuser.base': 'snovault.tests.authentication.WebUserAuthenticationPolicy' + 'multiauth.policy.accesskey.base': 'snovault.authentication.BasicAuthAuthenticationPolicy', + 'multiauth.policy.accesskey.check': 'snovault.authentication.basic_auth_check', + 'multiauth.policy.auth0.use': 'snovault.authentication.NamespacedAuthenticationPolicy', + 'multiauth.policy.auth0.namespace': 'auth0', + 'multiauth.policy.auth0.base': 'snovault.authentication.Auth0AuthenticationPolicy', } @@ -93,7 +94,7 @@ def encrypted_testapp(encrypted_app): return webtest.TestApp(encrypted_app, environ) -@pytest.fixture +@pytest.fixture(scope='session') def testapp(app): """ TestApp with JSON accept header. """ environ = { diff --git a/snovault/tests/testing_key.py b/snovault/tests/testing_key.py deleted file mode 100644 index 09d449c48..000000000 --- a/snovault/tests/testing_key.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Test class for keys -""" - -from ..resources import Item -from ..config import collection - - -def includeme(config): - config.scan(__name__) - - -@collection( - 'testing-keys', - properties={ - 'title': 'Test keys', - 'description': 'Testing. Testing. 
1, 2, 3.', - }, - unique_key='testing_accession', -) -class TestingKey(Item): - item_type = 'testing_key' - schema = { - 'type': 'object', - 'properties': { - 'name': { - 'type': 'string', - 'uniqueKey': True, - }, - 'accession': { - 'type': 'string', - 'uniqueKey': 'testing_accession', - }, - } - } diff --git a/snovault/tests/testing_views.py b/snovault/tests/testing_views.py index 1a2930eaf..086a22758 100644 --- a/snovault/tests/testing_views.py +++ b/snovault/tests/testing_views.py @@ -428,6 +428,38 @@ class TestingDownload(ItemWithAttachment): schema = load_schema('snovault:test_schemas/TestingDownload.json') +@view_config(name='drs', context=TestingDownload, request_method='GET', + permission='view', subpath_segments=[0, 1]) +def drs(context, request): + """ Example DRS object implementation. Write this for all object classes that + you want to render a DRS object. This structure is minimally validated by the + downstream API (see drs.py). + """ + rendered_object = request.embed(str(context.uuid), '@@object', as_user=True) + drs_object = { + 'id': rendered_object['@id'], + 'created_time': rendered_object['date_created'], + 'drs_id': rendered_object['uuid'], + 'self_uri': f'drs://{request.host}{request.path}', + 'size': 0, + 'checksums': [ + { + 'checksum': 'something', + 'type': 'md5' + } + ], + 'access_methods': [ + { + 'access_url': { + 'url': f'http://{request.host}/{context.uuid}/@@download' + }, + 'type': 'http' + }, + ] + } + return drs_object + + @collection('testing-link-sources-sno', unique_key='testing_link_sources-sno:name') class TestingLinkSourceSno(Item): item_type = 'testing_link_source_sno' @@ -806,3 +838,226 @@ class TestingBiogroupSno(Item): 'sources.samples.*', # embed everything at top level 'sources.contributor.*' ] + + +@collection( + 'testing-keys', + properties={ + 'title': 'Test keys', + 'description': 'Testing. Testing. 1, 2, 3.', + }, + unique_key='testing_accession', +) +class TestingKey(Item): + item_type = 'testing_key' + schema = { + 'type': 'object', + 'properties': { + 'name': { + 'type': 'string', + 'uniqueKey': True, + }, + 'accession': { + 'type': 'string', + 'uniqueKey': 'testing_accession', + }, + } + } + + +@collection('testing-hidden-facets') +class TestingHiddenFacets(Item): + """ Collection designed to test searching with hidden facets. Yes this is large, but this is a complex feature + with many possible cases. 
""" + item_type = 'testing_hidden_facets' + schema = { + 'type': 'object', + 'properties': { + 'first_name': { + 'type': 'string' + }, + 'last_name': { + 'type': 'string' + }, + 'sid': { + 'type': 'integer' + }, + 'unfaceted_string': { + 'type': 'string' + }, + 'unfaceted_integer': { + 'type': 'integer' + }, + 'disabled_string': { + 'type': 'string', + }, + 'disabled_integer': { + 'type': 'integer', + }, + 'unfaceted_object': { + 'type': 'object', + 'properties': { + 'mother': { + 'type': 'string' + }, + 'father': { + 'type': 'string' + } + } + }, + 'unfaceted_array_of_objects': { + 'type': 'array', + 'enable_nested': True, + 'items': { + 'type': 'object', + 'properties': { + 'fruit': { + 'type': 'string' + }, + 'color': { + 'type': 'string' + }, + 'uid': { + 'type': 'integer' + } + } + } + } + }, + 'facets': { + 'first_name': { + 'title': 'First Name' + }, + 'last_name': { + 'default_hidden': True, + 'title': 'Last Name' + }, + 'sid': { + 'default_hidden': True, + 'title': 'SID', + 'aggregation_type': 'stats', + 'number_step': 1 + }, + 'disabled_string': { + 'disabled': True + }, + 'disabled_integer': { + 'disabled': True + } + } + } + + @calculated_property(schema={ + 'type': 'array', + 'items': { + 'type': 'object', + 'properties': { + 'fruit': { + 'type': 'string' + }, + 'color': { + 'type': 'string' + }, + 'uid': { + 'type': 'integer' + } + } + } + }) + def non_nested_array_of_objects(self, unfaceted_array_of_objects): + """ Non-nested view of the unfaceted_array_of_objects field """ + return unfaceted_array_of_objects + + +@collection('testing-bucket-range-facets') +class TestingBucketRangeFacets(Item): + """ Collection for testing BucketRange facets. + Also tests 'add_no_value' schema param behavior. + """ + item_type = 'testing_bucket_range_facets' + schema = { + 'type': 'object', + 'properties': { + 'no_value_integer': { + 'type': 'integer', + 'add_no_value': True # if a range query is specified on this field, include documents that + # have 'No value' for the field + }, + 'no_value_integer_array': { + 'type': 'array', + 'items': { + 'type': 'integer', + 'add_no_value': True + } + }, + 'special_integer': { + 'type': 'integer' + }, + 'special_object_that_holds_integer': { + 'type': 'object', + 'properties': { + 'embedded_integer': { + 'type': 'integer' + } + } + }, + 'array_of_objects_that_holds_integer': { + 'type': 'array', + 'items': { + 'type': 'object', + 'enable_nested': False, + 'properties': { + 'embedded_identifier': { + 'type': 'string' + }, + 'embedded_integer': { + 'type': 'integer' + } + } + } + } + }, + 'facets': { + 'no_value_integer': { + 'title': 'No value integer', + 'aggregation_type': 'range', + 'ranges': [ + {'from': 0, 'to': 5}, + {'from': 5, 'to': 10} + ] + }, + 'no_value_integer_array': { + 'title': 'No value integer array', + 'aggregation_type': 'range', + 'ranges': [ + {'from': 0, 'to': 0}, # test zero range faceting behavior + {'from': 0, 'to': 5}, + {'from': 5, 'to': 10} + ] + }, + 'special_integer': { + 'title': 'Special Integer', + 'aggregation_type': 'range', + 'ranges': [ + {'from': 0, 'to': 5}, + {'from': 5, 'to': 10} + ] + }, + 'special_object_that_holds_integer.embedded_integer': { + 'title': 'Single Object Embedded Integer', + 'aggregation_type': 'range', + 'ranges': [ + {'from': 0, 'to': 5}, + {'from': 5, 'to': 10} + ] + }, + 'array_of_objects_that_holds_integer.embedded_integer': { + 'title': 'Array of Objects Embedded Integer', + 'aggregation_type': 'range', + 'ranges': [ + {'from': 0, 'to': 5, 'label': 'Low'}, + {'from': 5, 'to': 10, 'label': 
'High'} + ] + } + } + } diff --git a/snovault/typedsheets.py b/snovault/typedsheets.py new file mode 100644 index 000000000..82d806879 --- /dev/null +++ b/snovault/typedsheets.py @@ -0,0 +1,82 @@ +from pyramid.settings import asbool + + +def parse_array(types, value): + return [cast(types, v) for v in value.split(';') if v.strip()] + + +def parse_object(types, value): + items = (part.split(':', 1) for part in value.split(',') if value.strip()) + return {k.strip(): cast(types, v) for k, v in items} + + +def parse_string(types, value): + assert not types + return value + + +def parse_ignore(types, value): + return None + + +def parse_number(types, value): + assert not types + try: + return int(value) + except ValueError: + return float(value) + + +def parse_integer(types, value): + assert not types + return int(value) + + +def parse_boolean(types, value): + assert not types + return asbool(value) + + +TYPE_BY_NAME = { + 'string': parse_string, + 'number': parse_number, + 'boolean': parse_boolean, + 'integer': parse_integer, + 'ignore': parse_ignore, + 'array': parse_array, + 'object': parse_object, +} + + +def cast(types, value): + types = list(types) or ['string'] + type_name = types.pop() + value = value.strip() + if value.lower() == 'null': + return None + if value == '' and type_name != 'string': + return None + parse = TYPE_BY_NAME[type_name] + return parse(types, value) + + +def convert(name, value): + """ fieldname:[:...], value -> fieldname, cast(value) + """ + parts = name.split(':') + return parts[0], cast(parts[1:], value) + + +def cast_row_values(dictrows): + """ Wrapper generator for typing csv.DictReader rows + """ + for row in dictrows: + yield dict(convert(name, value or '') for name, value in row.items()) + + +def remove_nulls(dictrows): + for row in dictrows: + yield { + name: value for name, value in row.items() + if value is not None and name + } diff --git a/snovault/types/__init__.py b/snovault/types/__init__.py new file mode 100644 index 000000000..7ef848b7e --- /dev/null +++ b/snovault/types/__init__.py @@ -0,0 +1,3 @@ +def includeme(config): + """include me method.""" + config.scan() diff --git a/snovault/types/access_key.py b/snovault/types/access_key.py new file mode 100644 index 000000000..dac52d7e3 --- /dev/null +++ b/snovault/types/access_key.py @@ -0,0 +1,162 @@ +"""Access_key types file.""" + +import datetime +from pyramid.security import ( + Allow, + Deny, + Authenticated, + Everyone, +) +from pyramid.settings import asbool +from pyramid.view import view_config +from .. 
import ( + collection, + load_schema, +) +from ..authentication import ( + generate_password, + generate_user, + CRYPT_CONTEXT, +) +from ..crud_views import ( + collection_add, + item_edit, +) +from ..project_app import app_project +from ..validators import ( + validate_item_content_post, +) +from ..util import debug_log +from .base import ( + Item, + DELETED_ACL, + ONLY_ADMIN_VIEW_ACL, +) + + +@collection( + name='access-keys', + properties={ + 'title': 'Access keys', + 'description': 'Programmatic access keys', + }, + acl=[ + (Allow, Authenticated, 'add'), + (Allow, 'group.admin', 'list'), + (Allow, 'group.read-only-admin', 'list'), + (Allow, 'remoteuser.INDEXER', 'list'), + (Allow, 'remoteuser.EMBED', 'list'), + (Deny, Everyone, 'list'), + ]) +class AccessKey(Item): + """AccessKey class.""" + ACCESS_KEY_EXPIRATION_TIME = 90 # days + item_type = 'access_key' + schema = load_schema('snovault:schemas/access_key.json') + embedded_list = [] + + STATUS_ACL = { + 'current': [(Allow, 'role.owner', ['view', 'edit'])] + ONLY_ADMIN_VIEW_ACL, + 'deleted': DELETED_ACL, + } + + @classmethod + def create(cls, registry, uuid, properties, sheets=None): + """ Sets the access key timeout 90 days from creation. """ + if app_project().access_key_has_expiration_date(): + properties['expiration_date'] = (datetime.datetime.utcnow() + datetime.timedelta( + days=cls.ACCESS_KEY_EXPIRATION_TIME)).isoformat() + return super().create(registry, uuid, properties, sheets) + + def __ac_local_roles__(self): + """grab and return user as owner.""" + owner = 'userid.%s' % self.properties['user'] + return {owner: 'role.owner'} + + def __json__(self, request): + """delete the secret access key has from the object when used.""" + properties = super(AccessKey, self).__json__(request) + del properties['secret_access_key_hash'] + return properties + + def update(self, properties, sheets=None): + """smth.""" + # make sure PUTs preserve the secret access key hash + if 'secret_access_key_hash' not in properties: + new_properties = self.properties.copy() + new_properties.update(properties) + properties = new_properties + # set new expiration + properties['expiration_date'] = (datetime.datetime.utcnow() + datetime.timedelta( + days=self.ACCESS_KEY_EXPIRATION_TIME)).isoformat() + self._update(properties, sheets) + + class Collection(Item.Collection): + pass + + +# access keys have view permissions for update so readonly admin and the like +# can create access keys to download files. 
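# --- Illustrative sketch (editor's example, not part of the patch) -----------
# Once the POST handler below hands back an access_key_id / secret_access_key
# pair, a client is expected to present them as HTTP Basic credentials, which
# the 'accesskey' multiauth policy (BasicAuthAuthenticationPolicy plus
# basic_auth_check) verifies against the stored hash. The base URL, item path,
# and use of the 'requests' library are assumptions made only for this sketch.
import requests


def fetch_item_with_access_key(base_url, item_path, access_key_id, secret_access_key):
    """Fetch one portal item as JSON, authenticating with an access key."""
    response = requests.get(
        f"{base_url}{item_path}",
        auth=(access_key_id, secret_access_key),  # key id / secret act as username / password
        headers={"Accept": "application/json"},
    )
    response.raise_for_status()
    return response.json()
# -----------------------------------------------------------------------------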
+@view_config(context=AccessKey.Collection, request_method='POST', + permission='add', + validators=[validate_item_content_post]) +@debug_log +def access_key_add(context, request): + """smth.""" + crypt_context = request.registry[CRYPT_CONTEXT] + + if 'access_key_id' not in request.validated: + request.validated['access_key_id'] = generate_user() + + if 'user' not in request.validated: + request.validated['user'], = [ + principal.split('.', 1)[1] + for principal in request.effective_principals + if principal.startswith('userid.') + ] + + password = None + if 'secret_access_key_hash' not in request.validated: + password = generate_password() + request.validated['secret_access_key_hash'] = crypt_context.hash(password) + + result = collection_add(context, request) + + if password is None: + result['secret_access_key'] = None + else: + result['secret_access_key'] = password + + result['access_key_id'] = request.validated['access_key_id'] + result['description'] = request.validated.get('description', "") + return result + + +@view_config(name='reset-secret', context=AccessKey, + permission='add', + request_method='POST', subpath_segments=0) +@debug_log +def access_key_reset_secret(context, request): + """smth.""" + request.validated = context.properties.copy() + crypt_context = request.registry[CRYPT_CONTEXT] + password = generate_password() + new_hash = crypt_context.hash(password) + request.validated['secret_access_key_hash'] = new_hash + result = item_edit(context, request, render=False) + result['access_key_id'] = request.validated['access_key_id'] + result['secret_access_key'] = password + return result + + +@view_config(context=AccessKey, permission='view_raw', request_method='GET', + name='raw') +@debug_log +def access_key_view_raw(context, request): + """smth.""" + if asbool(request.params.get('upgrade', True)): + properties = context.upgrade_properties() + else: + properties = context.properties.copy() + del properties['secret_access_key_hash'] + return properties diff --git a/snovault/types/acl.py b/snovault/types/acl.py new file mode 100644 index 000000000..61288795b --- /dev/null +++ b/snovault/types/acl.py @@ -0,0 +1,42 @@ +# Considation of ACL related definitions. 
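# --- Minimal usage sketch (editor's example, not part of the patch) ----------
# The add and reset-secret views above return the plaintext secret only in the
# response that generated it; afterwards the item keeps just
# secret_access_key_hash. Here 'testapp' is assumed to be a webtest.TestApp
# with a JSON Accept header (as in testappfixtures.py), and the item path used
# for reset-secret is an assumption for this sketch.
def demo_access_key_rotation(testapp):
    # Create a key; the one-time plaintext secret rides along in the response.
    created = testapp.post_json('/access-keys', {'description': 'demo key'}).json
    key_id = created['access_key_id']
    first_secret = created['secret_access_key']

    # Later, rotate it through the named 'reset-secret' view on the item;
    # a fresh plaintext secret comes back exactly once.
    item_uuid = created['@graph'][0]['uuid']
    reset = testapp.post_json(f'/access-keys/{item_uuid}/@@reset-secret', {}).json
    return key_id, first_secret, reset['secret_access_key']
# -----------------------------------------------------------------------------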
+ +from pyramid.security import Allow, Deny, Everyone +from typing import Any, List, Tuple, Union + +Acl = List[Tuple[Any, Any, Union[str, List[str]]]] + +ONLY_ADMIN_VIEW_ACL: Acl = [ + (Allow, 'group.admin', ['view', 'edit']), + (Allow, 'group.read-only-admin', ['view']), + (Allow, 'remoteuser.INDEXER', ['view']), + (Allow, 'remoteuser.EMBED', ['view']), + (Deny, Everyone, ['view', 'edit']) +] + +PUBLIC_ACL: Acl = [ + (Allow, Everyone, ['view']), +] + ONLY_ADMIN_VIEW_ACL + +DELETED_ACL: Acl = [ + (Deny, Everyone, 'visible_for_edit') +] + ONLY_ADMIN_VIEW_ACL + +# Originally from user.py: + +ONLY_ADMIN_VIEW_USER_DETAILS_ACL = [ + (Allow, 'group.admin', ['view', 'view_details', 'edit']), + (Allow, 'remoteuser.INDEXER', ['view']), + (Allow, 'remoteuser.EMBED', ['view']), + (Deny, Everyone, ['view', 'view_details', 'edit']), +] + +ONLY_OWNER_VIEW_PROFILE_ACL = [ + (Allow, 'role.owner', 'view'), + # (Allow, 'role.owner', 'edit'), + # (Allow, 'role.owner', 'view_details'), +] + ONLY_ADMIN_VIEW_USER_DETAILS_ACL + +DELETED_USER_ACL = [ + (Deny, Everyone, 'visible_for_edit') +] + ONLY_ADMIN_VIEW_USER_DETAILS_ACL + diff --git a/snovault/types/base.py b/snovault/types/base.py new file mode 100644 index 000000000..034892285 --- /dev/null +++ b/snovault/types/base.py @@ -0,0 +1,255 @@ +"""base class creation for all the schemas that exist.""" +from dcicutils.misc_utils import exported +from pyramid.security import ( + # ALL_PERMISSIONS, + Allow, + Deny, + # DENY_ALL, + Everyone, +) +from pyramid.view import ( + view_config, +) +import re +import string +from .. import Item, Collection, AbstractCollection, abstract_collection, calculated_property +from ..util import debug_log +from ..validators import ( + validate_item_content_post, + validate_item_content_put, + validate_item_content_patch, + validate_item_content_in_place, + no_validate_item_content_post, + no_validate_item_content_put, + no_validate_item_content_patch +) +from ..crud_views import ( + collection_add as sno_collection_add, + item_edit +) +from ..interfaces import CONNECTION +from ..server_defaults import get_userid, add_last_modified +from .acl import ( + ONLY_ADMIN_VIEW_ACL, + PUBLIC_ACL, + DELETED_ACL +) +exported( + Allow, Deny, Everyone, + abstract_collection, + validate_item_content_put, + validate_item_content_patch, + validate_item_content_in_place, + no_validate_item_content_post, + no_validate_item_content_put, + no_validate_item_content_patch, + item_edit, + CONNECTION, + get_userid, + add_last_modified, + ONLY_ADMIN_VIEW_ACL, + PUBLIC_ACL, + DELETED_ACL +) + + +def get_item_or_none(request, value, itype=None, frame='object'): + """ + Return the view of an item with given frame. Can specify different types + of `value` for item lookup + + Args: + request: the current Request + value (str): String item identifier or a dict containing @id/uuid + itype (str): Optional string collection name for the item (e.g. /file-formats/) + frame (str): Optional frame to return. 
Defaults to 'object' + + Returns: + dict: given view of the item or None on failure + """ + item = None + + if isinstance(value, dict): + if 'uuid' in value: + value = value['uuid'] + elif '@id' in value: + value = value['@id'] + + svalue = str(value) + + # Below case is for UUIDs & unique_keys such as accessions, but not @ids + if not svalue.startswith('/') and not svalue.endswith('/'): + svalue = '/' + svalue + '/' + if itype is not None: + svalue = '/' + itype + svalue + + # Request.embed will attempt to get from ES for frame=object/embedded + # If that fails, get from DB. Use '@@' syntax instead of 'frame=' because + # these paths are cached in indexing + try: + item = request.embed(svalue, '@@' + frame) + except Exception: + pass + + # could lead to unexpected errors if == None + return item + + +def set_namekey_from_title(properties): + name = None + if properties.get('title'): + exclude = set(string.punctuation.replace('-', '')) + name = properties['title'].replace('&', ' n ') + name = ''.join(ch if ch not in exclude and ch != ' ' else '-' for ch in name) + name = re.sub(r"[-]+", '-', name).strip('-').lower() + return name + + +def validate_item_type_of_linkto_field(context, request): + """We are doing this case by case on item specific types files, + but might want to carry it here if filter is used more often. + If any of the submitted fields contain an ff_flag property starting with "filter", + the field in the filter is used for validating the type of the linked item. + Example: file has field file_format which is a linkTo FileFormat. + FileFormat items contain a field called "valid_item_types". + We have the ff_flag on file_format field called "filter:valid_item_types".""" + pass + + +class AbstractCollection(AbstractCollection): + """smth.""" + + def __init__(self, *args, **kw): + try: + self.lookup_key = kw.pop('lookup_key') + except KeyError: + pass + super(AbstractCollection, self).__init__(*args, **kw) + + def get(self, name, default=None): + """ + heres' and example of why this is the way it is: + ontology terms have uuid or term_id as unique ID keys + and if neither of those are included in post, try to + use term_name such that: + No - fail load with non-existing term message + Multiple - fail load with ‘ambiguous name - more than 1 term with that name exist use ID’ + Single result - get uuid and use that for post/patch + """ + resource = super(AbstractCollection, self).get(name, None) + if resource is not None: + return resource + if ':' in name: + resource = self.connection.get_by_unique_key('alias', name) + if resource is not None: + if not self._allow_contained(resource): + return default + return resource + if getattr(self, 'lookup_key', None) is not None: + # lookup key translates to query json by key / value and return if only one of the + # item type was found... so for keys that are mostly unique, but do to whatever + # reason (bad data mainly..) 
can be defined as unique keys + item_type = self.type_info.item_type + resource = self.connection.get_by_json(self.lookup_key, name, item_type) + if resource is not None: + if not self._allow_contained(resource): + return default + return resource + return default + + +class Collection(Collection, AbstractCollection): + """smth.""" + + def __init__(self, *args, **kw): + """smth.""" + super(Collection, self).__init__(*args, **kw) + if hasattr(self, '__acl__'): + return + + +@calculated_property(context=Item.AbstractCollection, category='action') +def add(context, request): + """smth.""" + if request.has_permission('add', context): + type_name = context.type_info.name + return { + 'name': 'add', + 'title': 'Add', + 'profile': '/profiles/{name}.json'.format(name=type_name), + 'href': '/search/?type={name}¤tAction=add'.format(name=type_name), + } + + +@calculated_property(context=Item, category='action') +def edit(context, request): + """smth.""" + if request.has_permission('edit'): + return { + 'name': 'edit', + 'title': 'Edit', + 'profile': '/profiles/{ti.name}.json'.format(ti=context.type_info), + 'href': '{item_uri}?currentAction=edit'.format(item_uri=request.resource_path(context)), + } + + +@calculated_property(context=Item, category='action') +def create(context, request): + if request.has_permission('create'): + return { + 'name': 'create', + 'title': 'Create', + 'profile': '/profiles/{ti.name}.json'.format(ti=context.type_info), + 'href': '{item_uri}?currentAction=create'.format(item_uri=request.resource_path(context)), + } + + +@view_config( + context=Collection, + permission='add', + request_method='POST', + # validators=[] # TURNS OFF VALIDATION HERE ([validate_item_content_post] previously) + validators=[validate_item_content_post] +) +@view_config( + context=Collection, + permission='add_unvalidated', + request_method='POST', + validators=[no_validate_item_content_post], + request_param=['validate=false'] +) +@debug_log +def collection_add(context, request, render=None): + + # institution_needed = False + # project_needed = False + # data = request.json + # schema = context.type_info.schema + # + # required_properties = schema.get("required", []) + # if "institution" in required_properties and "institution" not in data: + # institution_needed = True + # + # if "project" in required_properties and "project" not in data: + # project_needed = True + # + # if request.authenticated_userid and (institution_needed or project_needed): + # namespace, userid = request.authenticated_userid.split(".", 1) + # user_item = get_item_or_none(request, userid, itype="/users/", frame="object") + # new_data = data.copy() + # if institution_needed and "institution" in user_item: + # new_data["institution"] = user_item["institution"] + # if project_needed and "project" in user_item: + # new_data["project"] = user_item["project"] + # + # # Override initial JSON body of request (hacky? better way?) 
+ # setattr(request, "json", new_data) + # + # # Perform validation that would occur otherwise + # validate_item_content_post(context, request) + # if request.errors: + # return HTTPUnprocessableEntity( + # json={'errors': request.errors}, + # content_type='application/json' + # ) + return sno_collection_add(context, request, render) diff --git a/snovault/types/filter_set.py b/snovault/types/filter_set.py new file mode 100644 index 000000000..dd626d7f9 --- /dev/null +++ b/snovault/types/filter_set.py @@ -0,0 +1,26 @@ +from snovault import collection, load_schema # , calculated_property +from .base import Item + + +# XXX: These constants could be defined on CompoundSearchBuilder, but thought they may be +# more useful here/more logically belong. - Will +FLAGS = 'flags' +FILTER_BLOCKS = 'filter_blocks' + + +@collection( + name='filter-sets', + unique_key='filter_set:title', + properties={ + 'title': 'Filter Sets', + 'description': 'Filter Set for combining multiple queries' + } +) +class FilterSet(Item): + """The class to store information about 4DN file formats""" + item_type = 'filter_set' + schema = load_schema('snovault:schemas/filter_set.json') + embedded_list = [] + + class Collection(Item.Collection): + pass diff --git a/snovault/types/ingestion.py b/snovault/types/ingestion.py new file mode 100644 index 000000000..2308201d8 --- /dev/null +++ b/snovault/types/ingestion.py @@ -0,0 +1,276 @@ +""" +Collection for objects related to ingestion submissions. +""" + +import boto3 +import contextlib +import io +import json +import logging +from typing import Optional +import traceback + +from dcicutils.misc_utils import PRINT # , ignored, check_true, VirtualApp +from snovault import collection, load_schema +# from pyramid.request import Request +# from pyramid.security import Allow, Deny, Everyone +from .base import ( + Item, + # TODO: Maybe collect all these permission styles into a single file, give them symbolic names, + # and permit only the symbolic names to be used in each situation so we can curate a full inventory of modes. + # -kmp 26-Jul-2020 + # Ticket C4-332 + # ALLOW_PROJECT_MEMBER_ADD_ACL, + # ONLY_ADMIN_VIEW_ACL, +) +from ..util import ( + debuglog, beanstalk_env_from_registry, create_empty_s3_file, s3_local_file, s3_output_stream, # subrequest_item_creation, + make_vapp_for_ingestion, # vapp_for_email, +) +from ..ingestion.common import metadata_bundles_bucket # , get_parameter + + +# ALLOW_SUBMITTER_VIEW_ACL = ( +# # TODO: There is an issue here where we want a logged in user remotely only to view this +# # but if we are proxying for them internall we want to be able to view OR edit. +# # There is never reason for a user outside the system to update this status. -kmp 26-Jul-2020 +# [] # Special additional permissions might go here. +# + ALLOW_PROJECT_MEMBER_ADD_ACL # Is this right? See note above. +# + ONLY_ADMIN_VIEW_ACL # Slightly misleading name. Allows admins to edit, too, actually. But only they can view. 
+# ) + + +class SubmissionFolio: + + INGESTION_SUBMISSION_URI = '/IngestionSubmission' + + def __init__(self, *, vapp, ingestion_type, submission_id, log=None): + self.vapp = vapp + self._admin_vapp = make_vapp_for_ingestion(app=vapp.app) + self.ingestion_type = ingestion_type + self.log = log or logging + self.bs_env = beanstalk_env_from_registry(vapp.app.registry) + self.bucket = metadata_bundles_bucket(vapp.app.registry) + self.s3_client = boto3.client('s3') + self.other_details = {} + self.outcome = 'unknown' + self.submission_id = submission_id + # These next two are initialized later by s3 lookup, and the result is cached here. + # In particular, the values will be made available in time for the body of 'with folio.processing_context(...)' + # Setting them to None here makes PyCharm and other code analysis tools happier in knowing + # that accesses to these instance variables are legit. -kmp 27-Aug-2020 + self.object_name = None + self.parameters = None + self.resolution = None + self.s3_encrypt_key_id = None # This is overridden based on manifest later + + def __str__(self): + return "" % (self.ingestion_type, self.submission_id) + + @classmethod + def make_submission_uri(cls, submission_id): + return "/ingestion-submissions/" + submission_id + + @property + def submission_uri(self): + return self.make_submission_uri(self.submission_id) + + def patch_item(self, **kwargs): + res = self._admin_vapp.patch_json(self.submission_uri, kwargs) + [item] = res.json['@graph'] + debuglog(json.dumps(item)) + return item + + def get_item(self): + res = self._admin_vapp.get(self.submission_uri) + [item] = res.json['@graph'] + return item + + def note_additional_datum(self, key, from_dict, from_key=None, default=None): + self.other_details['additional_data'] = additional_data = ( + self.other_details.get('additional_data', {}) + ) + additional_data[key] = from_dict.get(from_key or key, default) + + @contextlib.contextmanager + def s3_output(self, key_name, key_type='txt'): + key = "%s/%s%s" % (self.submission_id, key_name, "" if key_name.endswith(f".{key_type}") else f".{key_type}") + self.resolution[key_name] = key + with s3_output_stream(self.s3_client, bucket=self.bucket, key=key, + s3_encrypt_key_id=self.s3_encrypt_key_id) as fp: + yield fp + + @contextlib.contextmanager + def s3_input(self, bucket: Optional[str] = None, key: Optional[str] = None): + if not bucket: + bucket = self.bucket + if not key: + key = self.object_name + with s3_local_file(s3_client=self.s3_client, bucket=bucket, key=key) as filename: + with io.open(filename, "r") as fp: + yield fp + + def get_s3_input_json(self, bucket: Optional[str] = None, key: Optional[str] = None): + with self.s3_input(bucket=bucket, key=key) as fp: + return json.load(fp) + + def fail(self): + self.outcome = 'failure' + + def succeed(self): + self.outcome = 'success' + + def is_done(self): + return self.outcome != 'unknown' + + @contextlib.contextmanager + def processing_context(self): + + self.log.info("Processing {submission_id} as {ingestion_type}." 
+ .format(submission_id=self.submission_id, ingestion_type=self.ingestion_type)) + + submission_id = self.submission_id + manifest_key = "%s/manifest.json" % submission_id + response = self.s3_client.get_object(Bucket=self.bucket, Key=manifest_key) + manifest = json.load(response['Body']) + + s3_encrypt_key_id = manifest.get("s3_encrypt_key_id") + + self.object_name = object_name = manifest['object_name'] + self.parameters = parameters = manifest['parameters'] + self.s3_encrypt_key_id = manifest['s3_encrypt_key_id'] + email = manifest['email'] + + debuglog(submission_id, "object_name:", object_name) + debuglog(submission_id, "parameters:", parameters) + debuglog(submission_id, "s3_encrypt_key_id:", s3_encrypt_key_id) + + started_key = "%s/started.txt" % submission_id + create_empty_s3_file(self.s3_client, bucket=self.bucket, key=started_key, s3_encrypt_key_id=s3_encrypt_key_id) + + # PyCharm thinks this is unused. -kmp 26-Jul-2020 + # data_stream = submission.s3_client.get_object(Bucket=submission.bucket, + # Key="%s/manifest.json" % submission_id)['Body'] + + resolution = { + "data_key": object_name, + "manifest_key": manifest_key, + "started_key": started_key, + } + + try: + + other_keys = {} + if email: + other_keys['submitted_by'] = email + + self.patch_item(submission_id=submission_id, + object_name=object_name, + parameters=parameters, + processing_status={"state": "processing"}, + **other_keys) + + self.resolution = resolution + + yield resolution + + if not self.is_done(): + self.succeed() + + self.patch_item(processing_status={"state": "done", "outcome": self.outcome, "progress": "complete"}, + **self.other_details) + + except Exception as e: + + resolution["traceback_key"] = traceback_key = "%s/traceback.txt" % submission_id + with s3_output_stream(self.s3_client, bucket=self.bucket, key=traceback_key, + s3_encrypt_key_id=s3_encrypt_key_id) as fp: + traceback.print_exc(file=fp) + + resolution["error_type"] = e.__class__.__name__ + resolution["error_message"] = str(e) + + self.patch_item( + errors=["%s: %s" % (e.__class__.__name__, e)], + processing_status={ + "state": "done", + "outcome": "error", + "progress": "incomplete" + } + ) + + with s3_output_stream(self.s3_client, + bucket=self.bucket, + key="%s/resolution.json" % submission_id, + s3_encrypt_key_id=s3_encrypt_key_id) as fp: + PRINT(json.dumps(resolution, indent=2), file=fp) + + def process_standard_bundle_results(self, bundle_result: dict, s3_only: bool = False) -> None: + """ + If the given bundle_result contains either a result, post_output, or upload_info property, + then writes the contents of that property to an S3 key with either the name submission.json, + post_output.txt, or upload_info.txt, respectively; and in the bucket name self.bucket. + + Additionally, for these three properties (result, post_output, upload_info), add them to the + self.other_details["additional_data"] property of this SubmissionFolio object, so that this + data will ultimately be written to the database (for the IngestionSubmission object). + + HOWEVER, if the s3_only (False by default) argument is True then the given result will + NOT be added to the other_details["additional_data"] of this SubmissionFolio object, + rather the given result will ONLY be written to S3. This is to prevent potentially + larger amounts of data from ultimately being written to the database (for the + IngestionSubmission object), but rather have them stored ONLY in S3. + """ + + # Next several files are created only if relevant. 
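# --- Rough handler sketch (editor's example, not part of the patch) ----------
# An ingestion handler typically drives a SubmissionFolio roughly as below,
# using processing_context(), get_s3_input_json(),
# process_standard_bundle_results() and succeed() from this class. The handler
# name and the shape of the bundle_result dict are assumptions for the example.
def handle_demo_submission(submission):
    with submission.processing_context():
        # Read the staged submission document from S3.
        payload = submission.get_s3_input_json()

        # ... ingestion-type-specific work happens here ...
        bundle_result = {
            'result': {'items_seen': len(payload)},
            'post_output': ['demo submission processed'],
            'upload_info': [],
        }

        # Persist the interesting outputs to S3 and onto the item, then mark success.
        submission.process_standard_bundle_results(bundle_result)
        submission.succeed()
# -----------------------------------------------------------------------------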
+ + if bundle_result.get('result'): + with self.s3_output(key_name='submission.json', key_type='json') as fp: + print(json.dumps(bundle_result['result'], indent=2), file=fp) + if not s3_only: + self.note_additional_datum('result', from_dict=bundle_result, default={}) + + if bundle_result.get('post_output'): + with self.s3_output(key_name='submission_response') as fp: + self.show_report_lines(bundle_result['post_output'], fp) + if not s3_only: + self.note_additional_datum('post_output', from_dict=bundle_result, default=[]) + + if bundle_result.get('upload_info'): + with self.s3_output(key_name='upload_info') as fp: + print(json.dumps(bundle_result['upload_info'], indent=2), file=fp) + if not s3_only: + self.note_additional_datum('upload_info', from_dict=bundle_result, default=[]) + + @staticmethod + def show_report_lines(lines, fp, default="Nothing to report."): + for line in lines or ([default] if default else []): + try: + print(line, file=fp) + except UnicodeEncodeError: + ascii_line = line.encode( + encoding="ascii", errors="backslashreplace" + ).decode(encoding="ascii") + print(ascii_line, file=fp) + + +@collection( + name='ingestion-submissions', + # acl=ALLOW_SUBMITTER_VIEW_ACL, + unique_key='object_name', + properties={ + 'title': 'Ingestion Submissions', + 'description': 'List of Ingestion Submissions', + }) +class IngestionSubmission(Item): + """The IngestionSubmission class that holds info on requests to ingest data.""" + + item_type = 'ingestion_submission' + schema = load_schema('snovault:schemas/ingestion_submission.json') + # embedded_list = [...] + Item.embedded_list + + schema_json = json.loads(json.dumps(schema)) + + @classmethod + def supports_type(cls, ingestion_type: str) -> bool: + return ingestion_type in cls.schema_json["properties"]["ingestion_type"]["enum"] diff --git a/snovault/types/user.py b/snovault/types/user.py new file mode 100644 index 000000000..f487ab5b2 --- /dev/null +++ b/snovault/types/user.py @@ -0,0 +1,192 @@ +"""The user collection.""" +# -*- coding: utf-8 -*- + +import logging +import transaction + +from pyramid.httpexceptions import HTTPUnprocessableEntity +from pyramid.security import Allow, Deny, Everyone +from pyramid.view import view_config +from snovault import ( + # CONNECTION, + calculated_property, + collection, + load_schema, + # display_title_schema, +) +from ..crud_views import collection_add +from ..resource_views import item_view_page +from ..schema_utils import validate_request +from ..storage import User as AuthUser +from ..util import debug_log +from .base import Item + + +logging.getLogger('boto3').setLevel(logging.INFO) +log = logging.getLogger(__name__) + +""" In order to allow a user to add an access key they need to at + least see their basic profile info and the access_key table +""" + +ONLY_ADMIN_VIEW_USER_DETAILS_ACL = [ + (Allow, 'group.admin', ['view', 'view_details', 'edit']), + (Allow, 'remoteuser.INDEXER', ['view']), + (Allow, 'remoteuser.EMBED', ['view']), + (Deny, Everyone, ['view', 'view_details', 'edit']), +] + +ONLY_OWNER_VIEW_PROFILE_ACL = [ + (Allow, 'role.owner', 'view'), + # (Allow, 'role.owner', 'edit'), + # (Allow, 'role.owner', 'view_details'), +] + ONLY_ADMIN_VIEW_USER_DETAILS_ACL + +DELETED_USER_ACL = [ + (Deny, Everyone, 'visible_for_edit') +] + ONLY_ADMIN_VIEW_USER_DETAILS_ACL + + +@collection( + name='users', + unique_key='user:email', + properties={ + 'title': 'Users', + 'description': f'Listing of current users', + }, +) +class User(Item): + """The user class.""" + + item_type = 'user' + schema = 
load_schema('snovault:schemas/user.json') + + STATUS_ACL = { + 'current': ONLY_OWNER_VIEW_PROFILE_ACL, + 'deleted': DELETED_USER_ACL, + 'revoked': DELETED_USER_ACL, + 'inactive': ONLY_OWNER_VIEW_PROFILE_ACL, + } + + @calculated_property(schema={ + "title": "Title", + "type": "string", + }) + def title(self, first_name, last_name): + """return first and last name.""" + title = u'{} {}'.format(first_name, last_name) + return title + + @calculated_property(schema={ + "title": "Display Title", + "description": "A calculated title for every object in 4DN", + "type": "string" + }) + def display_title(self, first_name, last_name): + return self.title(first_name, last_name) + + @calculated_property(schema={ + "title": "Contact Email", + "description": "E-Mail address by which this person should be contacted.", + "type": "string", + "format": "email" + }) + def contact_email(self, email, preferred_email=None): + """Returns `email` if `preferred_email` is not defined.""" + if preferred_email: + return preferred_email + else: + return email + + def __ac_local_roles__(self): + """return the owner user.""" + owner = 'userid.%s' % self.uuid + return {owner: 'role.owner'} + + +USER_PAGE_VIEW_ATTRIBUTES = ['@id', '@type', 'uuid', 'title', 'display_title'] + + +@view_config(context=User, permission='view', request_method='GET', name='page') +@debug_log +def user_page_view(context, request, user_page_view_attributes = USER_PAGE_VIEW_ATTRIBUTES): + """smth.""" + properties = item_view_page(context, request) + if not request.has_permission('view_details'): + filtered = {} + for key in user_page_view_attributes: + try: + filtered[key] = properties[key] + except KeyError: + pass + return filtered + return properties + + +@view_config(context=User.Collection, permission='add', request_method='POST', + physical_path="/users") +@debug_log +def user_add(context, request): + ''' + if we have a password in our request, create and auth entry + for the user as well + ''' + # do we have valid data + pwd = request.json.get('password', None) + pwd_less_data = request.json.copy() + + if pwd is not None: + del pwd_less_data['password'] + + validate_request(context.type_info.schema, request, pwd_less_data) + + if request.errors: + return HTTPUnprocessableEntity(json={'errors': request.errors}, + content_type='application/json') + + result = collection_add(context, request) + if result: + email = request.json.get('email') + pwd = request.json.get('password', None) + name = request.json.get('first_name') + if pwd is not None: + auth_user = AuthUser(email, pwd, name) + db = request.registry['dbsession'] + db.add(auth_user) + + transaction.commit() + return result + + +@calculated_property(context=User, category='user_action') +def impersonate(context, request): + """smth.""" + # This is assuming the user_action calculated properties + # will only be fetched from the current_user view, + # which ensures that the user represented by 'context' is also an effective principal + if request.has_permission('impersonate'): + return { + 'id': 'impersonate', + 'title': 'Impersonate User…', + 'href': request.resource_path(context) + '?currentAction=impersonate-user', + } + + +@calculated_property(context=User, category='user_action') +def profile(context, request): + """smth.""" + return { + 'id': 'profile', + 'title': 'Profile', + 'href': request.resource_path(context), + } + + +@calculated_property(context=User, category='user_action') +def submissions(request): + """smth.""" + return { + 'id': 'submissions', + 'title': 'Submissions', + 
'href': '/submissions', + } diff --git a/snovault/util.py b/snovault/util.py index 12a20efc8..a52d0ffef 100644 --- a/snovault/util.py +++ b/snovault/util.py @@ -1,15 +1,27 @@ +import boto3 import contextlib import datetime as datetime_module import functools +import gzip +import io import json import os +import re import structlog import sys +import tempfile +import time +from typing import Optional +from botocore.client import Config from copy import copy from datetime import datetime, timedelta -from pyramid.httpexceptions import HTTPForbidden +from io import BytesIO +from pyramid.httpexceptions import HTTPUnprocessableEntity, HTTPForbidden from pyramid.threadlocal import manager as threadlocal_manager +from dcicutils.ecs_utils import ECSUtils +from dcicutils.misc_utils import ignored, PRINT, VirtualApp, count_if, identity +from dcicutils.secrets_utils import assume_identity from .interfaces import CONNECTION, STORAGE, TYPES from .settings import Settings @@ -98,6 +110,74 @@ def __init__(self, *, shard_count=NUM_SHARDS, replica_count=NUM_REPLICAS, } +def create_empty_s3_file(s3_client, bucket: str, key: str, s3_encrypt_key_id: Optional[str] = None): + """ + Args: + s3_client: a client object that results from a boto3.client('s3', ...) call. + bucket: an S3 bucket name + key: the name of a key within the given S3 bucket + s3_encrypt_key_id: the name of a KMS encrypt key id, or None + """ + empty_file = "/dev/null" + + extra_kwargs = extra_kwargs_for_s3_encrypt_key_id(s3_encrypt_key_id=s3_encrypt_key_id, + client_name='create_empty_s3_file') + + s3_client.upload_file(empty_file, Bucket=bucket, Key=key, **extra_kwargs) + + +def get_trusted_email(request, context=None, raise_errors=True): + """ + Get an email address on behalf of which we can issue other requests. + + If auth0 has authenticated user info to offer, return that. + Otherwise, look for a userid.xxx among request.effective_principals and get the email from that. + + This will raise HTTPUnprocessableEntity if there's a problem obtaining the mail. + """ + try: + context = context or "Requirement" + email = getattr(request, '_auth0_authenticated', None) + if not email: + user_uuid = None + for principal in request.effective_principals: + if principal.startswith('userid.'): + user_uuid = principal[7:] + break + if not user_uuid: + raise HTTPUnprocessableEntity('%s: Must provide authentication' % context) + user_props = get_item_or_none(request, user_uuid) + if not user_props: + raise HTTPUnprocessableEntity('%s: User profile missing' % context) + if 'email' not in user_props: + raise HTTPUnprocessableEntity('%s: Entry for "email" missing in user profile.' 
% context) + email = user_props['email'] + return email + except Exception: + if raise_errors: + raise + return None + + +def beanstalk_env_from_request(request): + return beanstalk_env_from_registry(request.registry) + + +def beanstalk_env_from_registry(registry): + return registry.settings.get('env.name') + + +def customized_delay_rerun(sleep_seconds=1): + def parameterized_delay_rerun(*args): + """ Rerun function for flaky """ + ignored(args) + time.sleep(sleep_seconds) + return True + return parameterized_delay_rerun + + +delay_rerun = customized_delay_rerun(sleep_seconds=1) + @contextlib.contextmanager def mappings_use_nested(value=True): """ Context manager that sets the MAPPINGS_USE_NESTED setting with the given value, default True """ @@ -135,6 +215,55 @@ def dictionary_lookup(dictionary, key): return dictionary[key] +def deduplicate_list(lst): + """ De-duplicates the given list by converting it to a set then back to a list. + + NOTES: + * The list must contain 'hashable' type elements that can be used in sets. + * The result list might not be ordered the same as the input list. + * This will also take tuples as input, though the result will be a list. + + :param lst: list to de-duplicate + :return: de-duplicated list + """ + return list(set(lst)) + + +def gunzip_content(content): + """ Helper that will gunzip content (into memory) """ + f_in = BytesIO() + f_in.write(content) + f_in.seek(0) + with gzip.GzipFile(fileobj=f_in, mode='rb') as f: + gunzipped_content = f.read() + return gunzipped_content.decode('utf-8') + + +DEBUGLOG = os.environ.get('DEBUGLOG', "") + + +def debuglog(*args): + """ + As the name implies, this is a low-tech logging facility for temporary debugging info. + Prints info to a file in user's home directory. + + The debuglog facility allows simple debugging for temporary debugging of disparate parts of the system. + It takes arguments like print or one of the logging operations and outputs to ~/DEBUGLOG-yyyymmdd.txt. + Each line in the log is timestamped. + """ + if DEBUGLOG: + try: + nowstr = str(datetime.datetime.now()) + dateid = nowstr[:10].replace('-', '') + with io.open(os.path.expanduser(os.path.join(DEBUGLOG, "DEBUGLOG-%s.txt" % dateid)), "a+") as fp: + PRINT(nowstr, *args, file=fp) + except Exception: + # There are many things that could go wrong, but none of them are important enough to fuss over. + # Maybe it was a bad pathname? Out of disk space? Network error? + # It doesn't really matter. Just continue... 
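# --- Hypothetical test sketch (editor's example, not part of the patch) ------
# customized_delay_rerun / delay_rerun above are meant to be handed to a rerun
# filter so flaky tests pause between attempts. The decorator shown is the
# third-party 'flaky' package's API, and the test body and fixture are
# assumptions made only for this sketch.
from flaky import flaky


@flaky(max_runs=3, rerun_filter=customized_delay_rerun(sleep_seconds=5))
def test_search_is_eventually_indexed(testapp):
    # Give the indexer a moment between retries instead of hammering it.
    assert testapp.get('/search/?type=TestingDownload').json['@graph']
# -----------------------------------------------------------------------------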
+ pass + + _skip_fields = ['@type', 'principals_allowed'] # globally accessible if need be in the future @@ -415,7 +544,7 @@ def secure_embed(request, item_path, addition='@@object'): res = '' return res except HTTPForbidden: - print("you don't have access to this object") + PRINT("you don't have access to this object") return res @@ -642,7 +771,7 @@ def expand_embedded_list(item_type, types, embeds, schema, processed_embeds): # be cases of fields that are not valid for default embeds # but are still themselves valid fields processed_embeds.remove(embed_path) - print(error_message, file=sys.stderr) + PRINT(error_message, file=sys.stderr) else: embeds_to_add.extend(path_embeds_to_add) return embeds_to_add, processed_embeds @@ -1064,3 +1193,364 @@ def generate_indexer_namespace_for_testing(prefix='sno'): INDEXER_NAMESPACE_FOR_TESTING = generate_indexer_namespace_for_testing() + + +def is_admin_request(request): + """ Checks for 'group.admin' in effective_principals on request - if present we know this + request was submitted by an admin + """ + return 'group.admin' in request.effective_principals + + +def get_item_or_none(request, value, itype=None, frame='object'): + """ + Return the view of an item with given frame. Can specify different types + of `value` for item lookup + + Args: + request: the current Request + value (str): String item identifier or a dict containing @id/uuid + itype (str): Optional string collection name for the item (e.g. /file-formats/) + frame (str): Optional frame to return. Defaults to 'object' + + Returns: + dict: given view of the item or None on failure + """ + item = None + + if isinstance(value, dict): + if 'uuid' in value: + value = value['uuid'] + elif '@id' in value: + value = value['@id'] + + svalue = str(value) + + # Below case is for UUIDs & unique_keys such as accessions, but not @ids + if not svalue.startswith('/') and not svalue.endswith('/'): + svalue = '/' + svalue + '/' + if itype is not None: + svalue = '/' + itype + svalue + + # Request.embed will attempt to get from ES for frame=object/embedded + # If that fails, get from DB. Use '@@' syntax instead of 'frame=' because + # these paths are cached in indexing + try: + item = request.embed(svalue, '@@' + frame) + except Exception: + pass + + # could lead to unexpected errors if == None + return item + + +CONTENT_TYPE_SPECIAL_CASES = { + 'application/x-www-form-urlencoded': [ + # Single legacy special case to allow us to POST to metadata TSV requests via form submission. + # All other special case values should be added using register_path_content_type. + '/metadata/', + '/variant-sample-search-spreadsheet/', + re.compile(r'/variant-sample-lists/[\da-z-]+/@@spreadsheet/'), + ] +} + + +def register_path_content_type(*, path, content_type): + """ + Registers that endpoints that begin with the specified path use the indicated content_type. + + This is part of an inelegant workaround for an issue in renderers.py that maybe we can make go away in the future. + See the 'implementation note' in ingestion/common.py for more details. + """ + exceptions = CONTENT_TYPE_SPECIAL_CASES.get(content_type, None) + if exceptions is None: + CONTENT_TYPE_SPECIAL_CASES[content_type] = exceptions = [] + if path not in exceptions: + exceptions.append(path) + + +compiled_regexp_class = type(re.compile("foo.bar")) # Hides that it's _sre.SRE_Pattern in 3.6, but re.Pattern in 3.7 + + +def content_type_allowed(request): + """ + Returns True if the current request allows the requested content type. 
+ + This is part of an inelegant workaround for an issue in renderers.py that maybe we can make go away in the future. + See the 'implementation note' in ingestion/common.py for more details. + """ + if request.content_type == "application/json": + # For better or worse, we always allow this. + return True + + exceptions = CONTENT_TYPE_SPECIAL_CASES.get(request.content_type) + + if exceptions: + for path_condition in exceptions: + if isinstance(path_condition, str): + if path_condition in request.path: + return True + elif isinstance(path_condition, compiled_regexp_class): + if path_condition.match(request.path): + return True + else: + raise NotImplementedError(f"Unrecognized path_condition: {path_condition}") + + return False + + +def check_user_is_logged_in(request): + """ Raises HTTPForbidden if the request did not come from a logged in user. """ + for principal in request.effective_principals: + if principal.startswith('userid.') or principal == 'group.admin': # allow if logged in OR has admin + break + else: + raise HTTPForbidden(title="Not logged in.") + + + +EMAIL_PATTERN = re.compile(r'[^@]+[@][^@]+') + + +def make_vapp_for_email(*, email, app=None, registry=None, context=None): + app = _app_from_clues(app=app, registry=registry, context=context) + if not isinstance(email, str) or not EMAIL_PATTERN.match(email): + # It's critical to check that the pattern has an '@' so we know it's not a system account (injection). + raise RuntimeError("Expected email to be a string of the form 'user@host'.") + user_environ = { + 'HTTP_ACCEPT': 'application/json', + 'REMOTE_USER': email, + } + vapp = VirtualApp(app, user_environ) + return vapp + + +@contextlib.contextmanager +def vapp_for_email(email, app=None, registry=None, context=None): + yield make_vapp_for_email(email=email, app=app, registry=registry, context=context) + + +def make_vapp_for_ingestion(*, app=None, registry=None, context=None): + app = _app_from_clues(app=app, registry=registry, context=context) + user_environ = { + 'HTTP_ACCEPT': 'application/json', + 'REMOTE_USER': 'INGESTION', + } + vapp = VirtualApp(app, user_environ) + return vapp + + +@contextlib.contextmanager +def vapp_for_ingestion(app=None, registry=None, context=None): + yield make_vapp_for_ingestion(app=app, registry=registry, context=context) + + +def _app_from_clues(app=None, registry=None, context=None): + if count_if(identity, [app, registry, context]) != 1: + raise RuntimeError("Expected exactly one of app, registry, or context.") + if not app: + app = (registry or context).app + return app + + +def make_s3_client(): + s3_client_extra_args = {} + if 'IDENTITY' in os.environ: + identity = assume_identity() + s3_client_extra_args['aws_access_key_id'] = key_id = identity.get('S3_AWS_ACCESS_KEY_ID') + s3_client_extra_args['aws_secret_access_key'] = identity.get('S3_AWS_SECRET_ACCESS_KEY') + s3_client_extra_args['region_name'] = ECSUtils.REGION + log.warning(f"make_s3_client using S3 entity ID {key_id[:10]} arguments in `boto3 client creation call.") + if 'ENCODED_S3_ENCRYPT_KEY_ID' in identity: + # This setting is required when testing locally and encrypted buckets need to be accessed. + s3_client_extra_args['config'] = Config(signature_version='s3v4') + else: + log.warning(f'make_s3_client called with no identity') + + s3_client = boto3.client('s3', **s3_client_extra_args) + return s3_client + + +def build_s3_presigned_get_url(*, params): + """ Helper function that builds a presigned URL. 
""" + s3_client = make_s3_client() + return s3_client.generate_presigned_url( + ClientMethod='get_object', + Params=params, + ExpiresIn=36 * 60 * 60 + ) + + +def convert_integer_to_comma_string(value): + """Convert integer to comma-formatted string for displaying SV + position. + + :param value: Value to format. + :type value: int + :returns: Comma-formatted integer or None + :rtype: str or None + """ + result = None + if isinstance(value, int): + result = format(value, ",d") + return result + + +ENCODED_ROOT_DIR = os.path.dirname(__file__) + + +def resolve_file_path(path, file_loc=None, root_dir=ENCODED_ROOT_DIR): + """ Takes a relative path from this file location and returns an absolute path to + the desired file, needed for WSGI to resolve embed files. + + :param path: relative path to be converted + :param file_loc: absolute path to location path is relative to, by default path/to/encoded/src/ + :return: absolute path to location specified by path + """ + if path.startswith("~"): + # Really this shouldn't happen, so we could instead raise an error, but at least this is semantically correct. + path = os.path.expanduser(path) + if file_loc: + if file_loc.startswith("~"): + file_loc = os.path.expanduser(file_loc) + path_to_this_file = os.path.abspath(os.path.dirname(file_loc)) + else: + path_to_this_file = os.path.abspath(root_dir) + return os.path.join(path_to_this_file, path) + + +# These next few could be in dcicutils.s3_utils as part of s3Utils, but details of interfaces would have to change. +# For now, for expedience, they can live here and we can refactor later. -kmp 25-Jul-2020 + +@contextlib.contextmanager +def s3_output_stream(s3_client, bucket: str, key: str, s3_encrypt_key_id: Optional[str] = None): + """ + This context manager allows one to write: + + with s3_output_stream(s3_client, bucket, key) as fp: + ... fp.write("foo") ... + + to do output to an s3 bucket. + + In fact, an intermediate local file is involved, so this function yields a file pointer (fp) to a + temporary local file that is open for write. That fp should be used to supply content to the file + during the dynamic scope of the context manager. Once the context manager's body executes, the + file will be closed, its contents will be copied to s3, and finally the temporary local file will + be deleted. + + Args: + s3_client: a client object that results from a boto3.client('s3', ...) call. + bucket: an S3 bucket name + key: the name of a key within the given S3 bucket + s3_encrypt_key_id: a KMS encryption key id or None + """ + + tempfile_name = tempfile.mktemp() + try: + with io.open(tempfile_name, 'w') as fp: + yield fp + extra_kwargs = extra_kwargs_for_s3_encrypt_key_id(s3_encrypt_key_id=s3_encrypt_key_id, + client_name='s3_output_stream') + s3_client.upload_file(Filename=tempfile_name, Bucket=bucket, Key=key, **extra_kwargs) + finally: + try: + os.remove(tempfile_name) + except Exception: + pass + + +@contextlib.contextmanager +def s3_local_file(s3_client, bucket: str, key: str): + """ + This context manager allows one to write: + + with s3_local_file(s3_client, bucket, key) as file: + with io.open(local_file, 'r') as fp: + dictionary = json.load(fp) + + to do input from an s3 bucket. + + Args: + s3_client: a client object that results from a boto3.client('s3', ...) call. 
+ bucket: an S3 bucket name + key: the name of a key within the given S3 bucket + """ + ext = os.path.splitext(key)[-1] + tempfile_name = tempfile.mktemp() + ext + try: + s3_client.download_file(Bucket=bucket, Key=key, Filename=tempfile_name) + yield tempfile_name + finally: + try: + os.remove(tempfile_name) + except Exception: + pass + + +@contextlib.contextmanager +def s3_input_stream(s3_client, bucket: str, key: str, mode: str = 'r'): + """ + This context manager allows one to write: + + with s3_input_stream(s3_client, bucket, key) as fp: + dictionary = json.load(fp) + + to do input from an s3 bucket. + + In fact, an intermediate local file is created, copied, and deleted. + + Args: + s3_client: a client object that results from a boto3.client('s3', ...) call. + bucket: an S3 bucket name + key: the name of a key within the given S3 bucket + mode: an input mode acceptable to io.open + """ + + with s3_local_file(s3_client, bucket, key) as file: + with io.open(file, mode=mode) as fp: + yield fp + + +class SettingsKey: + APPLICATION_BUCKET_PREFIX = 'application_bucket_prefix' + BLOB_BUCKET = 'blob_bucket' + EB_APP_VERSION = 'eb_app_version' + ELASTICSEARCH_SERVER = 'elasticsearch.server' + ENCODED_VERSION = 'encoded_version' + FILE_UPLOAD_BUCKET = 'file_upload_bucket' + FILE_WFOUT_BUCKET = 'file_wfout_bucket' + FOURSIGHT_BUCKET_PREFIX = 'foursight_bucket_prefix' + IDENTITY = 'identity' + INDEXER = 'indexer' + INDEXER_NAMESPACE = 'indexer.namespace' + INDEX_SERVER = 'index_server' + LOAD_TEST_DATA = 'load_test_data' + METADATA_BUNDLES_BUCKET = 'metadata_bundles_bucket' + S3_ENCRYPT_KEY_ID = 's3_encrypt_key_id' + SNOVAULT_VERSION = 'snovault_version' + SQLALCHEMY_URL = 'sqlalchemy.url' + SYSTEM_BUCKET = 'system_bucket' + TIBANNA_CWLS_BUCKET = 'tibanna_cwls_bucket' + TIBANNA_OUTPUT_BUCKET = 'tibanna_output_bucket' + UTILS_VERSION = 'utils_version' + + +class ExtraArgs: + SERVER_SIDE_ENCRYPTION = "ServerSideEncryption" + SSE_KMS_KEY_ID = "SSEKMSKeyId" + + +def extra_kwargs_for_s3_encrypt_key_id(s3_encrypt_key_id, client_name): + + extra_kwargs = {} + if s3_encrypt_key_id: + log.error(f"{client_name} adding SSEKMSKeyId ({s3_encrypt_key_id}) arguments in upload_fileobj call.") + extra_kwargs["ExtraArgs"] = { + ExtraArgs.SERVER_SIDE_ENCRYPTION: "aws:kms", + ExtraArgs.SSE_KMS_KEY_ID: s3_encrypt_key_id, + } + else: + log.error(f"{client_name} found no s3 encrypt key id ({SettingsKey.S3_ENCRYPT_KEY_ID})" + f" in request.registry.settings.") + + return extra_kwargs diff --git a/test.ini.template b/test.ini.template new file mode 100644 index 000000000..0f727bbe8 --- /dev/null +++ b/test.ini.template @@ -0,0 +1,75 @@ +[app:app] +use = config:base.ini#app +session.secret = %(here)s/session-secret.b64 +file_upload_bucket = encoded-4dn-files +blob_bucket = encoded-4dn-blobs +# metadata_bundles_bucket = ... 
not needed for snovault +# blob_store_profile_name = encoded-4dn-files +accession_factory = snovault.server_defaults.test_accession +elasticsearch.server = 172.31.49.128:9872 +snovault.app_version = 1.3.0 +ga_config_location = ./src/encoded/static/ga_config.json +encoded_version = 111.222.333 +snovault_version = 222.333.444 +utils_version = 333.444.555 +eb_app_version = app-v-test-simulation +create_tables = true +load_test_data = snovault.loadxl:load_test_data +env.name = snovault-test-${USER} +testing = true + +[composite:indexer] +use = config:base.ini#indexer + +[pipeline:main] +pipeline = + config:base.ini#memlimit + egg:PasteDeploy#prefix + app + +[pipeline:debug] +pipeline = + egg:repoze.debug#pdbpm + app +set pyramid.includes = + pyramid_translogger + +[server:main] +use = egg:waitress#main +host = 0.0.0.0 +port = 6543 +threads = 1 + +[loggers] +keys = root, encoded, encoded_listener + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console + +[logger_encoded] +level = WARN +handlers = console +qualname = encoded +propagate = 0 + +[logger_encoded_listener] +level = INFO +handlers = console +qualname = snovault.elasticsearch.es_index_listener +propagate = 0 + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)s [%(name)s][%(threadName)s] %(message)s
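
# --- Closing usage sketch (editor's example, not part of the patch) ----------
# The s3_output_stream and s3_local_file helpers added to snovault/util.py
# above compose as shown here; the bucket and key names are assumptions made
# only for this sketch.
import io
import json

import boto3

from snovault.util import s3_local_file, s3_output_stream


def round_trip_report(bucket, key="reports/summary.json"):
    s3_client = boto3.client('s3')

    # Write a small JSON report through a temporary local file up to S3.
    with s3_output_stream(s3_client, bucket=bucket, key=key) as fp:
        json.dump({"status": "ok"}, fp)

    # Read it back the same way, via a temporary local copy.
    with s3_local_file(s3_client, bucket=bucket, key=key) as filename:
        with io.open(filename) as fp:
            return json.load(fp)
# -----------------------------------------------------------------------------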