diff --git a/requirements/base.txt b/requirements/base.txt
index 22e4ec2bd..daf8c1938 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -20,3 +20,4 @@ ndg-httpsclient
 pyasn1
 djangorestframework==3.2.4
 django-filter==0.11.0
+-e git://github.com/mcantelon/mets-reader-writer.git@dev/issue-8894-premis-parsing#egg=metsrw
diff --git a/storage_service/locations/api/resources.py b/storage_service/locations/api/resources.py
index 16093ddc9..1a5013a5d 100644
--- a/storage_service/locations/api/resources.py
+++ b/storage_service/locations/api/resources.py
@@ -4,6 +4,7 @@
 # stdlib, alphabetical
 import json
 import logging
+from multiprocessing import Process
 import os
 import shutil
 import urllib
@@ -475,6 +476,10 @@ def obj_create(self, bundle, **kwargs):
         if bundle.obj.package_type in (Package.AIP, Package.AIC, Package.DIP) and bundle.obj.current_location.purpose in (Location.AIP_STORAGE, Location.DIP_STORAGE):
             # Store AIP/AIC
             bundle.obj.store_aip(origin_location, origin_path)
+
+            # Asynchronously index AIP files
+            p = Process(target=bundle.obj.index_file_data_from_aip_mets)
+            p.start()
         elif bundle.obj.package_type in (Package.TRANSFER,) and bundle.obj.current_location.purpose in (Location.BACKLOG,):
             # Move transfer to backlog
             bundle.obj.backlog_transfer(origin_location, origin_path)
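
A note on the indexing hook above: a process forked with multiprocessing inside a request handler inherits the parent's open database connection, which Django does not expect to be shared across processes. A minimal sketch of a safer spawn, assuming only the standard django.db.connections API; spawn_indexer and index_callable are illustrative names, not part of this change:

    # Illustrative helper (not in this diff): close inherited connections
    # before forking so parent and child each reconnect independently.
    from multiprocessing import Process

    from django.db import connections


    def spawn_indexer(index_callable):
        for conn in connections.all():
            conn.close()  # Django reopens the connection on the next query
        p = Process(target=index_callable)
        p.start()
        return p

Usage would be spawn_indexer(bundle.obj.index_file_data_from_aip_mets) in place of the two Process lines above.
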
diff --git a/storage_service/locations/api/search/__init__.py b/storage_service/locations/api/search/__init__.py
new file mode 100644
index 000000000..d2e8510b2
--- /dev/null
+++ b/storage_service/locations/api/search/__init__.py
@@ -0,0 +1,3 @@
+# Common
+# May have multiple models, so import * and use __all__ in file.
+from router import *
diff --git a/storage_service/locations/api/search/router.py b/storage_service/locations/api/search/router.py
new file mode 100644
index 000000000..c551f448e
--- /dev/null
+++ b/storage_service/locations/api/search/router.py
@@ -0,0 +1,159 @@
+import django_filters
+from rest_framework import routers, serializers, viewsets, filters
+from rest_framework.decorators import list_route
+from rest_framework.response import Response
+
+from django.db.models import Sum
+
+from locations import models
+
+
+class CaseInsensitiveBooleanFilter(django_filters.Filter):
+    """
+    Allows filtering boolean fields with case-insensitive "true"/"false" values
+    """
+    def filter(self, qs, value):
+        if value is not None:
+            lc_value = value.lower()
+            if lc_value == "true":
+                return qs.filter(**{self.name: True})
+            elif lc_value == "false":
+                return qs.filter(**{self.name: False})
+        # Ignore unrecognized values rather than filtering on the raw string
+        return qs
+
+
+class PipelineField(serializers.RelatedField):
+    """
+    Used to show UUID of related pipelines
+    """
+    def to_representation(self, value):
+        return value.uuid
+
+
+class LocationSerializer(serializers.HyperlinkedModelSerializer):
+    """
+    Serialize Location model data
+    """
+    space = serializers.ReadOnlyField(source='space.uuid')
+    pipelines = PipelineField(many=True, read_only=True, source='pipeline')
+
+    class Meta:
+        model = models.Location
+        fields = ('uuid', 'space', 'pipelines', 'purpose', 'quota', 'used', 'enabled')
+
+
+class LocationFilter(django_filters.FilterSet):
+    """
+    Filter for searching Location data
+    """
+    uuid = django_filters.CharFilter(name='uuid')
+    space = django_filters.CharFilter(name='space')
+    purpose = django_filters.CharFilter(name='purpose')
+    enabled = CaseInsensitiveBooleanFilter(name='enabled')
+
+    class Meta:
+        model = models.Location
+        fields = ['uuid', 'space', 'purpose', 'enabled']
+
+
+class LocationViewSet(viewsets.ReadOnlyModelViewSet):
+    """
+    Search API view for Location model data
+    """
+    queryset = models.Location.objects.all()
+    serializer_class = LocationSerializer
+    filter_backends = (filters.DjangoFilterBackend,)
+    filter_class = LocationFilter
+
+
+class PackageSerializer(serializers.HyperlinkedModelSerializer):
+    """
+    Serialize Package model data
+    """
+    origin_pipeline = serializers.ReadOnlyField(source='origin_pipeline.uuid')
+    current_location = serializers.ReadOnlyField(source='current_location.uuid')
+    pointer_file_location = serializers.ReadOnlyField(source='pointer_file_location.uuid')
+
+    class Meta:
+        model = models.Package
+        fields = ('uuid', 'current_path', 'size', 'origin_pipeline', 'current_location', 'package_type', 'status', 'pointer_file_location', 'pointer_file_path')
+
+
+class PackageFilter(django_filters.FilterSet):
+    """
+    Filter for searching Package data
+    """
+    min_size = django_filters.NumberFilter(name='size', lookup_type='gte')
+    max_size = django_filters.NumberFilter(name='size', lookup_type='lte')
+    pipeline = django_filters.CharFilter(name='origin_pipeline')
+    location = django_filters.CharFilter(name='current_location')
+    package_type = django_filters.CharFilter(name='package_type')
+
+    class Meta:
+        model = models.Package
+        fields = ['uuid', 'min_size', 'max_size', 'pipeline', 'location', 'package_type', 'status', 'pointer_file_location']
+
+
+class PackageViewSet(viewsets.ReadOnlyModelViewSet):
+    """
+    Search API view for Package model data
+    """
+    queryset = models.Package.objects.all()
+    serializer_class = PackageSerializer
+    filter_backends = (filters.DjangoFilterBackend,)
+    filter_class = PackageFilter
+
+
+class FileSerializer(serializers.HyperlinkedModelSerializer):
+    """
+    Serialize File model data
+    """
+    pipeline = serializers.ReadOnlyField(source='origin.uuid')
+
+    class Meta:
+        model = models.File
+        fields = ('uuid', 'name', 'file_type', 'size', 'format_name', 'pronom_id', 'pipeline', 'source_package', 'normalized', 'validated', 'ingestion_time')
+
+
+class FileFilter(django_filters.FilterSet):
+    """
+    Filter for searching File data
+    """
+    min_size = django_filters.NumberFilter(name='size', lookup_type='gte')
+    max_size = django_filters.NumberFilter(name='size', lookup_type='lte')
+    pipeline = django_filters.CharFilter(name='origin')
+    package = django_filters.CharFilter(name='source_package')
+    name = django_filters.CharFilter(name='name', lookup_type='icontains')
+    normalized = CaseInsensitiveBooleanFilter(name='normalized')
+    ingestion_time = django_filters.DateFilter(name='ingestion_time', lookup_type='contains')
+
+    class Meta:
+        model = models.File
+        fields = ['uuid', 'name', 'file_type', 'min_size', 'max_size', 'format_name', 'pronom_id', 'pipeline', 'source_package', 'normalized', 'validated', 'ingestion_time']
+
+
+class FileViewSet(viewsets.ReadOnlyModelViewSet):
+    """
+    Search API view for File model data
+
+    Custom endpoint "stats" provides total size of files searched for
+    """
+    queryset = models.File.objects.all()
+    serializer_class = FileSerializer
+    filter_backends = (filters.DjangoFilterBackend,)
+    filter_class = FileFilter
+
+    @list_route(methods=['get'])
+    def stats(self, request):
+        filtered = FileFilter(request.GET, queryset=self.get_queryset())
+        count = filtered.qs.count()
+        summary = filtered.qs.aggregate(Sum('size'))
+        return Response({'count': count, 'total_size': summary['size__sum'] or 0})
+
+
+# Route location, package, and file search API requests
+router = routers.DefaultRouter()
+router.register(r'location', LocationViewSet)
+router.register(r'package', PackageViewSet)
+router.register(r'file', FileViewSet)
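
The router above exposes read-only list/detail endpoints plus the custom stats list route. A sketch of client usage, assuming the v2/search/ prefix added to urls.py later in this diff, an API root of /api/ as in a stock install, and a local development host; authentication is omitted:

    import requests

    BASE = 'http://localhost:8000/api/v2/search'

    # Files over 1 MiB that have been normalized; the
    # CaseInsensitiveBooleanFilter accepts "true"/"false" in any case
    files = requests.get(BASE + '/file/',
                         params={'min_size': 1048576, 'normalized': 'TRUE'}).json()

    # Same filters, aggregated by the custom "stats" route
    stats = requests.get(BASE + '/file/stats/',
                         params={'min_size': 1048576, 'normalized': 'TRUE'}).json()
    # e.g. {'count': 42, 'total_size': 123456789}
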
diff --git a/storage_service/locations/api/urls.py b/storage_service/locations/api/urls.py
index a33e74a5d..07c7d2d88 100644
--- a/storage_service/locations/api/urls.py
+++ b/storage_service/locations/api/urls.py
@@ -1,9 +1,13 @@
 from django.conf.urls import include, url
+from rest_framework import routers, serializers, viewsets, filters, generics
 from tastypie.api import Api
 
-from locations.api import v1, v2
+from locations import models
+from locations.api import v1, v2
+from locations.api.search import router
 from locations.api.sword import views
 
+
 v1_api = Api(api_name='v1')
 v1_api.register(v1.SpaceResource())
 v1_api.register(v1.LocationResource())
@@ -16,9 +20,12 @@
 v2_api.register(v2.PackageResource())
 v2_api.register(v2.PipelineResource())
 
+
 urlpatterns = [
     url(r'', include(v1_api.urls)),
     url(r'v1/sword/$', views.service_document, name='sword_service_document'),
     url(r'', include(v2_api.urls)),
     url(r'v2/sword/$', views.service_document, name='sword_service_document'),
+    url(r'v1/search/', include(router.urls)),
+    url(r'v2/search/', include(router.urls))
 ]
diff --git a/storage_service/locations/migrations/0005_search_api.py b/storage_service/locations/migrations/0005_search_api.py
new file mode 100644
index 000000000..b9eec6e2f
--- /dev/null
+++ b/storage_service/locations/migrations/0005_search_api.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import models, migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('locations', '0004_v0_7'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='file',
+            name='file_type',
+            field=models.CharField(max_length=8, null=True, choices=[(b'AIP', b'AIP'), (b'transfer', b'Transfer')]),
+            preserve_default=True,
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='format_name',
+            field=models.TextField(max_length=128, blank=True),
+            preserve_default=True,
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='ingestion_time',
+            field=models.DateTimeField(null=True),
+            preserve_default=True,
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='normalized',
+            field=models.NullBooleanField(blank=True, default=None, null=True, help_text=b'Whether or not file has been normalized'),
+            preserve_default=True,
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='pronom_id',
+            field=models.TextField(max_length=128, blank=True),
+            preserve_default=True,
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='size',
+            field=models.IntegerField(default=0, help_text=b'Size in bytes of the file'),
+            preserve_default=True,
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='validated',
+            field=models.NullBooleanField(blank=True, default=None, null=True, help_text=b'Whether or not file has been validated'),
+            preserve_default=True,
+        ),
+    ]
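
The stats endpoint registered in router.py reduces to a single ORM aggregation over the new size column; a Django-shell sketch against this project's models (filter values are illustrative):

    from django.db.models import Sum

    from locations.models import File

    qs = File.objects.filter(file_type=File.AIP, normalized=True)
    summary = qs.aggregate(Sum('size'))
    # Sum() comes back as None on an empty queryset, hence the "or 0"
    count, total = qs.count(), summary['size__sum'] or 0

One design note: size is a plain IntegerField, which caps per-file sizes at 2**31 - 1 bytes on most backends, so files over roughly 2 GiB would need a BigIntegerField.
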
diff --git a/storage_service/locations/models/event.py b/storage_service/locations/models/event.py
index 2d4eb39ee..dc18f303a 100644
--- a/storage_service/locations/models/event.py
+++ b/storage_service/locations/models/event.py
@@ -132,9 +132,22 @@ class File(models.Model):
         help_text="Unique identifier")
     package = models.ForeignKey('Package', null=True)
     name = models.TextField(max_length=1000)
+    ingestion_time = models.DateTimeField(null=True)
+
+    AIP = "AIP"
+    TRANSFER = "transfer"
+    FILE_TYPE_CHOICES = (
+        (AIP, 'AIP'),
+        (TRANSFER, 'Transfer')
+    )
+    file_type = models.CharField(max_length=8, choices=FILE_TYPE_CHOICES, null=True)
+
     source_id = models.TextField(max_length=128)
     source_package = models.TextField(blank=True,
         help_text="Unique identifier of originating unit")
+    size = models.IntegerField(default=0, help_text='Size in bytes of the file')
+    format_name = models.TextField(blank=True, max_length=128)
+    pronom_id = models.TextField(blank=True, max_length=128)
     # Sized to fit sha512
     checksum = models.TextField(max_length=128)
     stored = models.BooleanField(default=False)
@@ -142,6 +155,11 @@ class File(models.Model):
         help_text="Accession ID of originating transfer")
     origin = UUIDField(editable=False, unique=False, version=4, blank=True,
         help_text="Unique identifier of originating Archivematica dashboard")
+    normalized = models.NullBooleanField(blank=True, default=None, null=True,
+        help_text="Whether or not file has been normalized")
+    validated = models.NullBooleanField(blank=True, default=None, null=True,
+        help_text="Whether or not file has been validated")
+
 
 
     class Meta:
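
The new NullBooleanFields are tri-state: None means the file was never assessed, which is distinct from False. A sketch of how queries distinguish the states (Django shell, this project's models):

    from locations.models import File

    never_assessed = File.objects.filter(normalized__isnull=True)  # unknown
    not_normalized = File.objects.filter(normalized=False)         # known False
    has_derivative = File.objects.filter(normalized=True)          # known True
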
diff --git a/storage_service/locations/models/package.py b/storage_service/locations/models/package.py
index 2757fca0f..f91e8ca2f 100644
--- a/storage_service/locations/models/package.py
+++ b/storage_service/locations/models/package.py
@@ -17,6 +17,7 @@
 # Third party dependencies, alphabetical
 import bagit
 import jsonfield
+import metsrw
 from django_extensions.db.fields import UUIDField
 
 # This project, alphabetical
@@ -143,7 +144,7 @@ def full_pointer_file_path(self):
         Includes the space, location and package paths joined."""
         if not self.pointer_file_location:
-            return None
+            return
         else:
             return os.path.join(self.pointer_file_location.full_path,
                 self.pointer_file_path)
 
@@ -197,7 +198,6 @@ def get_local_path(self):
         # TODO use Space protocol to determine if this is possible?
         self.local_path = self.full_path
         return self.local_path
-        return None
 
     def fetch_local_path(self):
         """
@@ -372,7 +372,7 @@ def recover_aip(self, origin_location, origin_path):
         temp_aip.delete()
 
         # Do fixity check of AIP with recovered files
-        return self.check_fixity() 
+        return self.check_fixity()
 
     def store_aip(self, origin_location, origin_path):
         """ Stores an AIP in the correct Location.
@@ -698,14 +698,14 @@ def index_file_data_from_transfer_mets(self, prefix=None):
         file_data = self._parse_mets(prefix=prefix)
 
         for f in file_data['files']:
-            File.objects.create(source_id=f['file_uuid'],
+            File.objects.create(file_type=File.TRANSFER,
+                                source_id=f['file_uuid'],
                                 source_package=file_data['transfer_uuid'],
                                 accessionid=file_data['accession_id'],
                                 package=self,
                                 name=f['path'],
                                 origin=file_data['dashboard_uuid'])
 
-
     def backlog_transfer(self, origin_location, origin_path):
         """
         Stores a package in backlog.
@@ -752,6 +752,106 @@ def backlog_transfer(self, origin_location, origin_path):
         self.status = Package.UPLOADED
         self.save()
 
+    def index_file_data_from_aip_mets(self):
+        """
+        Attempts to read an Archivematica AIP METS file inside this
+        package, then uses the retrieved metadata to generate one entry in the
+        File table in the database for each file inside the package.
+
+        :raises StorageException: if the AIP METS cannot be found,
+        or if required elements are missing.
+        """
+        aip_dir_name = os.path.basename(os.path.splitext(self.full_path)[0])
+        relative_path = os.path.join(aip_dir_name, "data", "METS."
+                                     + self.uuid + ".xml")
+
+        path_to_mets, temp_dir = self.extract_file(relative_path)
+
+        mw = metsrw.mets.METSWriter()
+        mw.fromfile(path_to_mets)
+
+        for fsentry in mw.all_files():
+            metadata = self._parse_file_metadata(fsentry)
+
+            if metadata is not None:
+                aip_file = File()
+                aip_file.file_type = File.AIP
+                aip_file.package = self
+                aip_file.source_id = metadata['uuid']
+                aip_file.origin = self.origin_pipeline.uuid
+                aip_file.name = os.path.join(aip_dir_name, fsentry.path)
+                aip_file.ingestion_time = mw.createdate
+                if 'format_name' in metadata:
+                    aip_file.format_name = metadata['format_name']
+                if 'size' in metadata:
+                    aip_file.size = int(metadata['size'])
+                if 'pronom_id' in metadata:
+                    aip_file.pronom_id = metadata['pronom_id']
+                if 'normalized' in metadata:
+                    aip_file.normalized = metadata['normalized']
+                if 'validated' in metadata:
+                    aip_file.validated = metadata['validated']
+                aip_file.save()
+
+        shutil.rmtree(temp_dir)
+
+    def _parse_file_metadata(self, fsentry):
+        """
+        Cycle through an FSEntry object's AMDsec subsections and consolidate
+        PREMIS object/event metadata.
+        """
+        metadata = None
+
+        if fsentry.path != 'None':
+            metadata = {}
+
+            # Get technical metadata
+            if len(fsentry.techmds):
+                techmd = fsentry.techmds[0]
+                premis_object = metsrw.premis.Object.parse(techmd.contents.document, False)
+
+                # Don't provide metadata for METS files
+                if premis_object.characteristics[0]['is_mets']:
+                    return
+
+                metadata['filename'] = premis_object.original_name
+
+                if len(premis_object.object_identifiers):
+                    if premis_object.object_identifiers[0]['type'] == 'UUID':
+                        metadata['uuid'] = premis_object.object_identifiers[0]['value']
+
+                if premis_object.characteristics[0]['size'] is not None:
+                    metadata['size'] = premis_object.characteristics[0]['size']
+
+                # Add file format to metadata
+                if len(premis_object.characteristics[0]['formats']):
+                    first_format = premis_object.characteristics[0]['formats'][0]
+                    if first_format['name'] is not None:
+                        metadata['format_name'] = first_format['name']
+                    if first_format['version'] is not None:
+                        metadata['format_version'] = first_format['version']
+                    if first_format['registry_name'] == 'PRONOM':
+                        metadata['pronom_id'] = first_format['registry_key']
+
+                # Add normalization status to metadata
+                if len(premis_object.relationships) and premis_object.relationships[0]['type'] == 'derivation':
+                    if premis_object.relationships[0]['subtype'] == 'has source':
+                        metadata['derivative'] = True
+
+                    if premis_object.relationships[0]['subtype'] == 'is source of':
+                        metadata['normalized'] = True
+
+            # Cycle through event data to see if file has been validated and if it passed
+            for digiprovmd in fsentry.digiprovmds:
+                if digiprovmd.contents.mdtype == 'PREMIS:EVENT':
+                    # Parse PREMIS event
+                    premis_event = metsrw.premis.Event.parse(digiprovmd.contents.document)
+
+                    # Indicate whether or not a file has been validated in metadata and if it passed
+                    if premis_event.event_type == 'validation':
+                        metadata['validated'] = premis_event.outcomes[0]['outcome'] == "pass"
+
+        return metadata
+
     def check_fixity(self, delete_after=True):
         """ Scans the package to verify its checksums.
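
Since indexing is fired asynchronously at storage time, AIPs stored before this change will have no File rows. A backfill sketch using only the methods and fields introduced in this diff (run in a Django shell; reindex_aip is an illustrative helper and the UUID argument is a placeholder):

    from locations.models import File, Package


    def reindex_aip(package_uuid):
        package = Package.objects.get(uuid=package_uuid)
        # Clear rows from any earlier run, then re-parse the AIP METS
        File.objects.filter(package=package, file_type=File.AIP).delete()
        package.index_file_data_from_aip_mets()
        return File.objects.filter(package=package, file_type=File.AIP).count()

Note that index_file_data_from_aip_mets raises if the AIP METS cannot be extracted, so callers doing bulk backfills would want to catch and log per-package failures.
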