diff --git a/docs/search.rst b/docs/search.rst
new file mode 100644
index 000000000..b6ba79bea
--- /dev/null
+++ b/docs/search.rst
@@ -0,0 +1,146 @@
+:Authors:
+    Mike Cantelon
+
+Search API
+================================================================================
+
+In addition to the search functionality present in the web interface, the
+storage service also includes a REST search API. Searches are performed by
+sending an HTTP GET request.
+
+Search results include a count of how many items were found, plus ``next``
+and ``previous`` properties containing links to further pages of the result
+set.
+
+Location search
+--------------------------------------------------------------------------------
+
+The endpoint for searching locations is::
+
+    http://<storage-service-host>/api/v2/search/location/
+
+Locations can be searched using the following search parameters:
+
+* uuid (location UUID)
+* space (space UUID)
+* purpose (purpose code)
+* enabled (whether the location is enabled)
+
+For example, if you wanted to get details about the transfer source location
+contained in the space 7ec3d5d9-23ec-4fd5-b9fb-df82da8de630, you could HTTP
+GET::
+
+    http://<storage-service-host>/api/v2/search/location/?space=7ec3d5d9-23ec-4fd5-b9fb-df82da8de630&purpose=TS
+
+Here is an example JSON response::
+
+    {
+        "count": 1,
+        "next": null,
+        "previous": null,
+        "results": [
+            {
+                "uuid": "f74c23e1-6737-4c24-a470-a003bc573051",
+                "space": "7ec3d5d9-23ec-4fd5-b9fb-df82da8de630",
+                "pipelines": [
+                    "2a351be8-99b4-4f53-8ea5-8d6ace6e0243",
+                    "b9d676ff-7c9d-4777-9a19-1b4b76a6542f"
+                ],
+                "purpose": "TS",
+                "quota": null,
+                "used": 0,
+                "enabled": true
+            }
+        ]
+    }
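+
+The snippet below is a minimal sketch of querying this endpoint with the
+Python ``requests`` library. The host, credentials, and authentication
+scheme are illustrative placeholders; adjust them to match your deployment::
+
+    import requests
+
+    BASE_URL = "http://127.0.0.1:8000"  # hypothetical storage service host
+
+    response = requests.get(
+        BASE_URL + "/api/v2/search/location/",
+        params={"space": "7ec3d5d9-23ec-4fd5-b9fb-df82da8de630",
+                "purpose": "TS"},
+        auth=("demo", "demo"),  # assumes HTTP basic auth is accepted
+    )
+    response.raise_for_status()
+    for location in response.json()["results"]:
+        print(location["uuid"], location["purpose"])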
+
+
+Package search
+--------------------------------------------------------------------------------
+
+The endpoint for searching packages is::
+
+    http://<storage-service-host>/api/v2/search/package/
+
+Packages can be searched using the following search parameters:
+
+* uuid (package UUID)
+* pipeline (pipeline UUID)
+* location (location UUID)
+* package_type (package type code: "AIP", "AIC", "SIP", "DIP", "transfer", "file", "deposit")
+* status (package status code: "PENDING", "STAGING", "UPLOADED", "VERIFIED",
+  "DEL_REQ", "DELETED", "RECOVER_REQ", "FAIL", or "FINALIZE")
+* min_size (minimum package filesize)
+* max_size (maximum package filesize)
+
+For example, if you wanted to get details about all AIP-type packages, you
+could HTTP GET::
+
+    http://<storage-service-host>/api/v2/search/package/?package_type=AIP
+
+Here is an example JSON response::
+
+    {
+        "count": 1,
+        "next": null,
+        "previous": null,
+        "results": [
+            {
+                "uuid": "96365d3d-6656-4fdd-a247-f85c9e0ddd43",
+                "current_path": "9636/5d3d/6656/4fdd/a247/f85c/9e0d/dd43/Apples-96365d3d-6656-4fdd-a247-f85c9e0ddd43.7z",
+                "size": 7918099,
+                "origin_pipeline": "b9d676ff-7c9d-4777-9a19-1b4b76a6542f",
+                "current_location": "a3d95a1b-f8fb-4e34-9f15-60dcdf178470",
+                "package_type": "AIP",
+                "status": "UPLOADED",
+                "pointer_file_location": "c2dfb32b-77dd-4597-abff-7c52e05e6d01",
+                "pointer_file_path": "9636/5d3d/6656/4fdd/a247/f85c/9e0d/dd43/pointer.96365d3d-6656-4fdd-a247-f85c9e0ddd43.xml"
+            }
+        ]
+    }
+
+
+File search
+--------------------------------------------------------------------------------
+
+The endpoint for searching files is::
+
+    http://<storage-service-host>/api/v2/search/file/
+
+Files can be searched using the following search criteria:
+
+* uuid (file UUID)
+* package (package UUID)
+* pipeline (pipeline UUID)
+* file_type (file type code: "AIP" or "transfer")
+* name (entire or partial filename)
+* pronom_id (PRONOM PUID)
+* format_name (format name)
+* min_size (minimum filesize)
+* max_size (maximum filesize)
+* normalized (boolean: whether or not the file was normalized)
+* validated (boolean: whether or not the file passed validation)
+
+For example, if you wanted to get details about files that are 29965171 bytes
+or larger, you could HTTP GET::
+
+    http://<storage-service-host>/api/v2/search/file/?min_size=29965171
+
+Here is an example JSON response::
+
+    {
+        "count": 1,
+        "next": null,
+        "previous": null,
+        "results": [
+            {
+                "uuid": "bd2074bb-2086-40b5-9c3f-3657cb900681",
+                "name": "Bodring-5f0fa831-a74b-4bf5-8598-779d49c3663a/objects/pictures/Landing_zone-e50c8452-0791-4fac-9f45-15b088a39b10.tif",
+                "file_type": "AIP",
+                "size": 29965171,
+                "format_name": "TIFF",
+                "pronom_id": "",
+                "source_package": "",
+                "normalized": null,
+                "validated": null,
+                "ingestion_time": "2015-10-30T04:16:39Z"
+            }
+        ]
+    }
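+
+
+Paging through results
+--------------------------------------------------------------------------------
+
+Results are paginated (10 items per page by default), so a client that wants
+every match should follow the ``next`` links until they run out. A minimal
+sketch, reusing the hypothetical ``BASE_URL`` and credentials from the
+location example above::
+
+    import requests
+
+    def iter_results(url, params=None):
+        # Follow the DRF-style pagination links until the last page
+        while url:
+            data = requests.get(url, params=params, auth=("demo", "demo")).json()
+            for item in data["results"]:
+                yield item
+            url = data["next"]  # absolute URL of the next page, or None
+            params = None  # the "next" URL already embeds the query string
+
+    for f in iter_results(BASE_URL + "/api/v2/search/file/",
+                          params={"min_size": 29965171}):
+        print(f["name"], f["size"])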
diff --git a/requirements/base.txt b/requirements/base.txt
index 121d96029..be55cffc2 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -7,10 +7,12 @@ setuptools
 bagit==1.5.4
 brotli==0.5.2 # Better compression library for WhiteNoise
 defusedxml==0.5.0
+djangorestframework==3.2.4
 Django>=1.8,<1.9
 django-annoying==0.10.3
 django-braces==1.11.0
 django-extensions==1.7.9
+django-filter==0.11.0
 django-model-utils==3.0.0
 #tastypie 0.13.3 has breaking changes
 django-tastypie==0.13.1
@@ -20,7 +22,8 @@ gunicorn==19.7.1
 jsonfield==2.0.1
 logutils==0.3.4.1
 lxml==3.7.3
-metsrw==0.2.0
+#metsrw==0.2.0
+git+https://github.com/artefactual-labs/mets-reader-writer.git@dev/issue-11581-premis-parsing#egg=metsrw
 ndg-httpsclient==0.4.2
 pyasn1==0.2.3
 python-gnupg==0.4.0
diff --git a/storage_service/locations/api/resources.py b/storage_service/locations/api/resources.py
index d9c033025..4ff011031 100644
--- a/storage_service/locations/api/resources.py
+++ b/storage_service/locations/api/resources.py
@@ -4,6 +4,7 @@
 # stdlib, alphabetical
 import json
 import logging
+from multiprocessing import Process
 import os
 import re
 import shutil
@@ -570,6 +571,9 @@ def obj_create(self, bundle, **kwargs):
             bundle.obj.store_aip(origin_location, origin_path, related_package_uuid,
                                  premis_events=events, premis_agents=agents,
                                  aip_subtype=aip_subtype)
+            # Asynchronously index AIP files
+            p = Process(target=bundle.obj.index_file_data_from_aip_mets)
+            p.start()
         elif bundle.obj.package_type in (Package.TRANSFER,) and bundle.obj.current_location.purpose in (Location.BACKLOG,):
             # Move transfer to backlog
             bundle.obj.backlog_transfer(origin_location, origin_path)
diff --git a/storage_service/locations/api/search/__init__.py b/storage_service/locations/api/search/__init__.py
new file mode 100644
index 000000000..648668a70
--- /dev/null
+++ b/storage_service/locations/api/search/__init__.py
@@ -0,0 +1,5 @@
+# Expose the search router at the package level so URL configuration can do
+# "from locations.api.search import router".
+from .router import router
+
+__all__ = ['router']
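A caveat worth noting about the asynchronous indexing added to resources.py
above: a forked ``multiprocessing.Process`` inherits the parent's open
database connection, and two processes sharing one connection socket is a
classic source of intermittent errors. A common defensive pattern (an
assumption on my part, not something this patch does) is to close the
connection just before forking so each process reconnects independently::

    from multiprocessing import Process

    from django.db import connection

    def index_async(package):
        # Close the inherited connection; Django transparently reopens one
        # in whichever process next touches the database.
        connection.close()
        p = Process(target=package.index_file_data_from_aip_mets)
        p.start()
        return p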
diff --git a/storage_service/locations/api/search/router.py b/storage_service/locations/api/search/router.py
new file mode 100644
index 000000000..deecbb35b
--- /dev/null
+++ b/storage_service/locations/api/search/router.py
@@ -0,0 +1,164 @@
+import django_filters
+from rest_framework import routers, serializers, viewsets, filters
+from rest_framework.decorators import list_route
+from rest_framework.response import Response
+
+from django.db.models import Sum
+
+from locations import models
+
+
+class CaseInsensitiveBooleanFilter(django_filters.Filter):
+    """
+    Allows boolean fields to be queried with case-insensitive "true" and "false"
+    """
+    def filter(self, qs, value):
+        if value is not None:
+            lc_value = value.lower()
+            if lc_value == "true":
+                return qs.filter(**{self.name: True})
+            elif lc_value == "false":
+                return qs.filter(**{self.name: False})
+        # Ignore unrecognized values rather than filtering on raw strings
+        return qs
+
+
+class PipelineField(serializers.RelatedField):
+    """
+    Represents related pipelines by their UUID
+    """
+    def to_representation(self, value):
+        return value.uuid
+
+
+class LocationSerializer(serializers.HyperlinkedModelSerializer):
+    """
+    Serialize Location model data
+    """
+    space = serializers.ReadOnlyField(source='space.uuid')
+    pipelines = PipelineField(many=True, read_only=True, source='pipeline')
+
+    class Meta:
+        model = models.Location
+        fields = ('uuid', 'space', 'pipelines', 'purpose', 'quota', 'used', 'enabled')
+
+
+class LocationFilter(django_filters.FilterSet):
+    """
+    Filter for searching Location data
+    """
+    uuid = django_filters.CharFilter(name='uuid')
+    space = django_filters.CharFilter(name='space')
+    purpose = django_filters.CharFilter(name='purpose')
+    enabled = CaseInsensitiveBooleanFilter(name='enabled')
+
+    class Meta:
+        model = models.Location
+        fields = ['uuid', 'space', 'purpose', 'enabled']
+
+
+class LocationViewSet(viewsets.ReadOnlyModelViewSet):
+    """
+    Search API view for Location model data
+    """
+    queryset = models.Location.objects.all()
+    serializer_class = LocationSerializer
+    filter_backends = (filters.DjangoFilterBackend,)
+    filter_class = LocationFilter
+
+
+class PackageSerializer(serializers.HyperlinkedModelSerializer):
+    """
+    Serialize Package model data
+    """
+    origin_pipeline = serializers.ReadOnlyField(source='origin_pipeline.uuid')
+    current_location = serializers.ReadOnlyField(source='current_location.uuid')
+    pointer_file_location = serializers.ReadOnlyField(source='pointer_file_location.uuid')
+
+    class Meta:
+        model = models.Package
+        fields = ('uuid', 'current_path', 'size', 'origin_pipeline', 'current_location', 'package_type', 'status', 'pointer_file_location', 'pointer_file_path')
+
+
+class PackageFilter(django_filters.FilterSet):
+    """
+    Filter for searching Package data
+    """
+    min_size = django_filters.NumberFilter(name='size', lookup_type='gte')
+    max_size = django_filters.NumberFilter(name='size', lookup_type='lte')
+    pipeline = django_filters.CharFilter(name='origin_pipeline')
+    location = django_filters.CharFilter(name='current_location')
+    package_type = django_filters.CharFilter(name='package_type')
+
+    class Meta:
+        model = models.Package
+        fields = ['uuid', 'min_size', 'max_size', 'pipeline', 'location', 'package_type', 'status', 'pointer_file_location']
+
+
+class PackageViewSet(viewsets.ReadOnlyModelViewSet):
+    """
+    Search API view for Package model data
+    """
+    queryset = models.Package.objects.all()
+    serializer_class = PackageSerializer
+    filter_backends = (filters.DjangoFilterBackend,)
+    filter_class = PackageFilter
+
+
+class FileSerializer(serializers.HyperlinkedModelSerializer):
+    """
+    Serialize File model data
+    """
+    pipeline = serializers.ReadOnlyField(source='origin.uuid')
+
+    class Meta:
+        model = models.File
+        fields = ('uuid', 'name', 'file_type', 'size', 'format_name', 'pronom_id', 'pipeline', 'source_package', 'normalized', 'validated', 'ingestion_time')
+
+
+class FileFilter(django_filters.FilterSet):
+    """
+    Filter for searching File data
+    """
+    min_size = django_filters.NumberFilter(name='size', lookup_type='gte')
+    max_size = django_filters.NumberFilter(name='size', lookup_type='lte')
+    pipeline = django_filters.CharFilter(name='origin')
+    package = django_filters.CharFilter(name='source_package')
+    name = django_filters.CharFilter(name='name', lookup_type='icontains')
+    normalized = CaseInsensitiveBooleanFilter(name='normalized')
+    ingestion_time = django_filters.DateFilter(name='ingestion_time', lookup_type='contains')
+    #ingestion_time_before = django_filters.DateFilter(name='ingestion_time', lookup_type='lt')
+    #ingestion_time_after = django_filters.DateFilter(name='ingestion_time', lookup_type='gt')
+
+    class Meta:
+        model = models.File
+        fields = ['uuid', 'name', 'file_type', 'min_size', 'max_size',
+                  'format_name', 'pronom_id', 'pipeline', 'source_package',
+                  'normalized', 'validated', 'ingestion_time']
+                  #'ingestion_time_before', 'ingestion_time_after']
+
+
+class FileViewSet(viewsets.ReadOnlyModelViewSet):
+    """
+    Search API view for File model data
+
+    Custom endpoint "stats" provides the total size of the files searched for
+    """
+    queryset = models.File.objects.all()
+    serializer_class = FileSerializer
+    filter_backends = (filters.DjangoFilterBackend,)
+    filter_class = FileFilter
+
+    @list_route(methods=['get'])
+    def stats(self, request):
+        filtered = FileFilter(request.GET, queryset=self.get_queryset())
+        count = filtered.qs.count()
+        summary = filtered.qs.aggregate(Sum('size'))
+        # Sum() aggregates to None when no rows match; report 0 instead
+        return Response({'count': count, 'total_size': summary['size__sum'] or 0})
+
+
+# Route location, package, and file search API requests
+router = routers.DefaultRouter()
+router.register(r'location', LocationViewSet)
+router.register(r'package', PackageViewSet)
+router.register(r'file', FileViewSet)
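To sanity-check the filter classes above in isolation, a FilterSet can be
instantiated directly with a dict of query parameters. A quick Django-shell
sketch (names refer to the new module; output depends on your data)::

    from locations import models
    from locations.api.search.router import LocationFilter

    qs = models.Location.objects.all()

    # "?enabled=TRUE" and "?enabled=true" behave identically thanks to
    # CaseInsensitiveBooleanFilter
    filtered = LocationFilter({"enabled": "TRUE", "purpose": "TS"}, queryset=qs)
    print(filtered.qs.count())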
diff --git a/storage_service/locations/api/urls.py b/storage_service/locations/api/urls.py
index a33e74a5d..7515a45dd 100644
--- a/storage_service/locations/api/urls.py
+++ b/storage_service/locations/api/urls.py
@@ -1,9 +1,11 @@
 from django.conf.urls import include, url
 from tastypie.api import Api
 
 from locations.api import v1, v2
+from locations.api.search import router
 from locations.api.sword import views
 
+
 v1_api = Api(api_name='v1')
 v1_api.register(v1.SpaceResource())
 v1_api.register(v1.LocationResource())
@@ -16,9 +18,12 @@
 v2_api.register(v2.PackageResource())
 v2_api.register(v2.PipelineResource())
 
+
 urlpatterns = [
     url(r'', include(v1_api.urls)),
     url(r'v1/sword/$', views.service_document, name='sword_service_document'),
     url(r'', include(v2_api.urls)),
     url(r'v2/sword/$', views.service_document, name='sword_service_document'),
+    url(r'v1/search/', include(router.urls)),
+    url(r'v2/search/', include(router.urls)),
 ]
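With the router mounted in urls.py, DRF's ``DefaultRouter`` exposes the custom
``stats`` list route at ``/api/v2/search/file/stats/``, and it accepts the
same query parameters as the file search itself. A hypothetical request,
reusing the placeholder host and credentials from the earlier docs examples::

    import requests

    resp = requests.get(
        BASE_URL + "/api/v2/search/file/stats/",
        params={"format_name": "TIFF", "min_size": 1},
        auth=("demo", "demo"),
    )
    print(resp.json())  # e.g. {"count": 3, "total_size": 90211353}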
diff --git a/storage_service/locations/migrations/0017_search_api.py b/storage_service/locations/migrations/0017_search_api.py
new file mode 100644
index 000000000..70f7c99a3
--- /dev/null
+++ b/storage_service/locations/migrations/0017_search_api.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import models, migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('locations', '0016_mirror_location_aip_replication'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='file',
+            name='file_type',
+            field=models.CharField(max_length=8, null=True, choices=[(b'AIP', b'AIP'), (b'transfer', b'Transfer')]),
+            preserve_default=True,
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='format_name',
+            field=models.TextField(max_length=128, blank=True),
+            preserve_default=True,
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='ingestion_time',
+            field=models.DateTimeField(null=True),
+            preserve_default=True,
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='normalized',
+            field=models.NullBooleanField(blank=True, default=None, null=True, help_text=b'Whether or not file has been normalized'),
+            preserve_default=True,
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='pronom_id',
+            field=models.TextField(max_length=128, blank=True),
+            preserve_default=True,
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='size',
+            field=models.BigIntegerField(default=0, help_text=b'Size in bytes of the file'),
+            preserve_default=True,
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='validated',
+            field=models.NullBooleanField(blank=True, default=None, null=True, help_text=b'Whether or not file has been validated'),
+            preserve_default=True,
+        ),
+    ]
diff --git a/storage_service/locations/models/event.py b/storage_service/locations/models/event.py
index 1f105cad1..6f64e6231 100644
--- a/storage_service/locations/models/event.py
+++ b/storage_service/locations/models/event.py
@@ -135,9 +135,22 @@ class File(models.Model):
         help_text=_l("Unique identifier"))
     package = models.ForeignKey('Package', null=True)
     name = models.TextField(max_length=1000)
+    ingestion_time = models.DateTimeField(null=True)
+
+    AIP = "AIP"
+    TRANSFER = "transfer"
+    FILE_TYPE_CHOICES = (
+        (AIP, 'AIP'),
+        (TRANSFER, 'Transfer')
+    )
+    file_type = models.CharField(max_length=8, choices=FILE_TYPE_CHOICES, null=True)
+
     source_id = models.TextField(max_length=128)
     source_package = models.TextField(blank=True,
         help_text=_l("Unique identifier of originating unit"))
+    size = models.BigIntegerField(default=0, help_text='Size in bytes of the file')
+    format_name = models.TextField(blank=True, max_length=128)
+    pronom_id = models.TextField(blank=True, max_length=128)
     # Sized to fit sha512
     checksum = models.TextField(max_length=128)
     stored = models.BooleanField(default=False)
@@ -145,6 +158,10 @@ class File(models.Model):
         help_text=_l("Accession ID of originating transfer"))
     origin = UUIDField(editable=False, unique=False, version=4, blank=True,
         help_text=_l("Unique identifier of originating Archivematica dashboard"))
+    normalized = models.NullBooleanField(blank=True, default=None, null=True,
+        help_text="Whether or not file has been normalized")
+    validated = models.NullBooleanField(blank=True, default=None, null=True,
+        help_text="Whether or not file has been validated")
 
     class Meta:
         verbose_name = _l("File")
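The new ``normalized`` and ``validated`` columns are deliberately three-state:
``None`` means "unknown" (no PREMIS evidence either way), which is distinct
from ``False``. A short sketch of the queries this enables::

    from locations import models

    normalized_files = models.File.objects.filter(normalized=True)
    unverified_files = models.File.objects.filter(validated__isnull=True)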
+ """ + metadata = None + + if fsentry.path != 'None': + metadata = {} + + # Get technical metadata + if len(fsentry.techmds): + techmd = fsentry.techmds[0] + premis_object = metsrw.premis.Object.parse(techmd.contents.document, False) + + # Don't provide metadata for METS files + if premis_object.characteristics[0]['is_mets']: + return + + metadata['filename'] = premis_object.original_name + + if len(premis_object.object_identifiers[0]): + if premis_object.object_identifiers[0]['type'] == 'UUID': + metadata['uuid'] = premis_object.object_identifiers[0]['value'] + + if premis_object.characteristics[0]['size'] is not None: + metadata['size'] = premis_object.characteristics[0]['size'] + + # Add file format to metadata + if len(premis_object.characteristics[0]['formats']): + first_format = premis_object.characteristics[0]['formats'][0] + if first_format['name'] is not None: + metadata['format_name'] = first_format['name'] + if first_format['version'] is not None: + metadata['format_version'] = first_format['version'] + if first_format['registry_name'] == 'PRONOM': + metadata['pronom_id'] = first_format['registry_key'] + + # Add normalization status to metadata + if len(premis_object.relationships) and premis_object.relationships[0]['type'] == 'derivation': + if premis_object.relationships[0]['subtype'] == 'has source': + metadata['derivative'] = True + + if premis_object.relationships[0]['subtype'] == 'is source of': + metadata['normalized'] = True + + # Cycle through event data to see if file has been validated and if it passed + for digiprovmd in fsentry.digiprovmds: + if digiprovmd.contents.mdtype == 'PREMIS:EVENT': + # Parse PREMIS event + premis_event = metsrw.premis.Event.parse(digiprovmd.contents.document) + + # Indicate whether or not a file has been validated in metadata and if it passed + if premis_event.event_type == 'validation': + metadata['validated'] = premis_event.outcomes[0]['outcome'] == "pass" + + return metadata + def check_fixity(self, force_local=False, delete_after=True): """ Scans the package to verify its checksums. diff --git a/storage_service/storage_service/settings/base.py b/storage_service/storage_service/settings/base.py index 2b9841f68..8da059e2a 100644 --- a/storage_service/storage_service/settings/base.py +++ b/storage_service/storage_service/settings/base.py @@ -230,6 +230,7 @@ def get_env_variable(var_name): ] THIRD_PARTY_APPS = [ + 'rest_framework', # REST framework 'tastypie', # REST framework 'longerusername', # Longer (> 30 characters) username ] @@ -245,6 +246,13 @@ def get_env_variable(var_name): INSTALLED_APPS = DJANGO_APPS + THIRD_PARTY_APPS + LOCAL_APPS # ######## END APP CONFIGURATION +REST_FRAMEWORK = { + 'DEFAULT_RENDERER_CLASSES': ( + 'rest_framework.renderers.JSONRenderer', + ), + 'PAGE_SIZE': 10 +} + # ######## LOGIN REQUIRED MIDDLEWARE CONFIGURATION LOGIN_URL = '/login/'