diff --git a/README.md b/README.md
index b35906e0..4a1a58a3 100644
--- a/README.md
+++ b/README.md
@@ -25,8 +25,9 @@ this setup will work on other systems. Assumptions made are that you have sudo p
 * alyx-dev is sync with the **dev** branch
 * Migrations files are provided by the repository
 * Continuous integration is setup, to run tests locally:
-  - `./manage.py test -n` test without migrations (faster)
-  - `./manage.py test` test with migrations (recommended if model changes)
+  - `./manage.py test -n` test without migrations (faster)
+  - `./manage.py test` test with migrations (recommended if model changes)
+  - NB: When running tests ensure `DEBUG = True` in the settings.py file (specifically `SECURE_SSL_REDIRECT = True` causes REST tests to fail)
 
 ```
 $ /manage.py test -n
diff --git a/alyx/alyx/__init__.py b/alyx/alyx/__init__.py
index 56684fc6..8310a90d 100644
--- a/alyx/alyx/__init__.py
+++ b/alyx/alyx/__init__.py
@@ -1 +1 @@
-VERSION = __version__ = '1.7.0'
+VERSION = __version__ = '1.8.0'
diff --git a/alyx/data/admin.py b/alyx/data/admin.py
index e214b1de..e2e9df01 100644
--- a/alyx/data/admin.py
+++ b/alyx/data/admin.py
@@ -1,5 +1,5 @@
-from django.db.models import Count
-from django.contrib import admin
+from django.db.models import Count, ProtectedError
+from django.contrib import admin, messages
 from django.utils.html import format_html
 from django_admin_listfilter_dropdown.filters import RelatedDropdownFilter
 from rangefilter.filter import DateRangeFilter
@@ -135,6 +135,21 @@ def _public(self, obj):
     _public.short_description = 'Public'
     _public.boolean = True
 
+    def delete_queryset(self, request, queryset):
+        try:
+            queryset.delete()
+        except ProtectedError as e:
+            err_msg = e.args[0] if e.args else 'One or more dataset(s) protected'
+            self.message_user(request, err_msg, level=messages.ERROR)
+
+    def delete_model(self, request, obj):
+        try:
+            obj.delete()
+        except ProtectedError as e:
+            # FIXME This still shows the successful message which is confusing for users
+            err_msg = e.args[0] if e.args else f'Dataset {obj.name} is protected'
+            self.message_user(request, err_msg, level=messages.ERROR)
+
 
 class FileRecordAdmin(BaseAdmin):
     fields = ('relative_path', 'data_repository', 'dataset', 'exists')
diff --git a/alyx/data/management/commands/files.py b/alyx/data/management/commands/files.py
index 45094ff7..fed7123c 100644
--- a/alyx/data/management/commands/files.py
+++ b/alyx/data/management/commands/files.py
@@ -1,4 +1,5 @@
 import logging
+
 from django.core.management import BaseCommand
 from django.db.models import Count, Q
 
@@ -195,7 +196,7 @@ def handle(self, *args, **options):
         dt = None
         for d in FileRecord.objects.all().select_related('dataset'):
             try:
-                dt = transfers.get_dataset_type(d.relative_path, qs=qs)
+                dt = transfers.get_dataset_type(d.relative_path, qs)
             except ValueError:
                 dt = None
                 continue
diff --git a/alyx/data/models.py b/alyx/data/models.py
index 46192bbd..31d31d65 100644
--- a/alyx/data/models.py
+++ b/alyx/data/models.py
@@ -1,10 +1,14 @@
+import structlog
+
 from django.core.validators import RegexValidator
 from django.db import models
 from django.utils import timezone
 
 from alyx.settings import TIME_ZONE, AUTH_USER_MODEL
 from actions.models import Session
-from alyx.base import BaseModel, modify_fields, BaseManager, CharNullField
+from alyx.base import BaseModel, modify_fields, BaseManager, CharNullField, BaseQuerySet
+
+logger = structlog.get_logger(__name__)
 
 
 def _related_string(field):
@@ -252,9 +256,27 @@ def __str__(self):
         return "" % self.name
 
 
+class DatasetQuerySet(BaseQuerySet):
+    """A Queryset that checks for protected datasets before deletion"""
+
+    def delete(self, force=False):
+        if (protected := self.filter(tags__protected=True)).exists():
+            if force:
+                logger.warning('The following protected datasets will be deleted:\n%s',
+                               '\n'.join(map(str, protected.values_list('name', 'session_id'))))
+            else:
+                logger.error(
+                    'The following protected datasets cannot be deleted without force=True:\n%s',
+                    '\n'.join(map(str, protected.values_list('name', 'session_id'))))
+                raise models.ProtectedError(
+                    f'Failed to delete {protected.count()} dataset(s) due to protected tags',
+                    protected)
+        super().delete()
+
+
 class DatasetManager(BaseManager):
     def get_queryset(self):
-        qs = super(DatasetManager, self).get_queryset()
+        qs = DatasetQuerySet(self.model, using=self._db)
         qs = qs.select_related('dataset_type', 'data_format')
         return qs
@@ -361,6 +383,17 @@ def save(self, *args, **kwargs):
         if len(pis):
             self.probe_insertion.set(pis.values_list('pk', flat=True))
 
+    def delete(self, *args, force=False, **kwargs):
+        # If a dataset is protected and force=False, raise an exception
+        # NB This is not called when bulk deleting or in cascading deletes
+        if self.is_protected and not force:
+            tags = self.tags.filter(protected=True).values_list('name', flat=True)
+            tags_str = '"' + '", "'.join(tags) + '"'
+            logger.error(f'Dataset {self.name} is protected by tag(s); use force=True.')
+            raise models.ProtectedError(
+                f'Failed to delete dataset {self.name} due to protected tag(s) {tags_str}', self)
+        super().delete(*args, **kwargs)
+
 
 # Files
 # ------------------------------------------------------------------------------------------------
diff --git a/alyx/data/tests.py b/alyx/data/tests.py
index 5eff5d4e..cdc33bed 100644
--- a/alyx/data/tests.py
+++ b/alyx/data/tests.py
@@ -1,16 +1,37 @@
 from django.test import TestCase
+from django.db import transaction
 from django.db.utils import IntegrityError
-from data.models import Dataset, DatasetType
+from django.db.models import ProtectedError
+
+from data.models import Dataset, DatasetType, Tag
 from data.transfers import get_dataset_type
 
 
 class TestModel(TestCase):
     def test_model_methods(self):
         (dset, _) = Dataset.objects.get_or_create(name='toto.npy')
+
         assert dset.is_online is False
         assert dset.is_public is False
         assert dset.is_protected is False
 
+    def test_delete(self):
+        (dset, _) = Dataset.objects.get_or_create(name='foo.npy')
+        (tag, _) = Tag.objects.get_or_create(name='protected_tag', protected=True)
+        dset.tags.set([tag])
+        assert dset.is_protected is True
+
+        # Individual object delete
+        with transaction.atomic():
+            self.assertRaises(ProtectedError, dset.delete)
+
+        # As queryset
+        qs = Dataset.objects.filter(tags__name='protected_tag')
+        with transaction.atomic():
+            self.assertRaises(ProtectedError, qs.delete)
+        with self.assertLogs('data.models', 'WARNING'):
+            qs.delete(force=True)
+
 
 class TestDatasetTypeModel(TestCase):
     def test_model_methods(self):
@@ -33,6 +54,8 @@ def test_model_methods(self):
             ('bar.baz.ext', 'bar.baz'),
             ('some_file.ext', 'some_file')
         )
+
+        dtypes = DatasetType.objects.all()
         for filename, dataname in filename_typename:
             with self.subTest(filename=filename):
-                self.assertEqual(get_dataset_type(filename).name, dataname)
+                self.assertEqual(get_dataset_type(filename, dtypes).name, dataname)
diff --git a/alyx/data/tests_rest.py b/alyx/data/tests_rest.py
index 24a02d55..fcce55ac 100644
--- a/alyx/data/tests_rest.py
+++ b/alyx/data/tests_rest.py
@@ -184,6 +184,19 @@ def test_dataset(self):
         self.ar(r, 201)
         self.assertEqual(r.data['default_dataset'], False)
 
+        # Create protected tag and dataset
+        r = self.ar(self.post(reverse('tag-list'), {'name': 'foo_tag', 'protected': True}), 201)
+        data = {'name': 'foo.bar', 'dataset_type': 'dst', 'created_by': 'test',
+                'data_format': 'df', 'date': '2018-01-01', 'number': 2, 'subject': self.subject,
+                'tags': [r['name']]}
+
+        r = self.ar(self.post(reverse('dataset-list'), data), 201)
+        did = r['url'].split('/')[-1]
+
+        # Now attempt to delete the protected dataset
+        r = self.client.delete(reverse('dataset-detail', args=[did]), data)
+        self.assertRegex(self.ar(r, 403), 'protected')
+
     def test_dataset_date_filter(self):
         # create 2 datasets with different dates
         data = {
diff --git a/alyx/data/transfers.py b/alyx/data/transfers.py
index 9abfcc8e..654f5a2f 100644
--- a/alyx/data/transfers.py
+++ b/alyx/data/transfers.py
@@ -5,13 +5,12 @@
 import re
 import time
 from pathlib import Path
-from fnmatch import fnmatch
 
 from django.db.models import Case, When, Count, Q, F
 import globus_sdk
 import numpy as np
-from one.alf.files import filename_parts, add_uuid_string
-from one.alf.spec import is_valid
+from one.alf.files import add_uuid_string
+from one.registration import get_dataset_type
 
 from alyx import settings
 from data.models import FileRecord, Dataset, DatasetType, DataFormat, DataRepository
@@ -170,31 +169,6 @@ def globus_file_exists(file_record):
     return False
 
 
-def get_dataset_type(filename, qs=None):
-    """Get the dataset type from a given filename"""
-    dataset_types = []
-    for dt in qs or DatasetType.objects.all():
-        if not dt.filename_pattern.strip():
-            # If the filename pattern is null, check whether the filename object.attribute matches
-            # the dataset type name.
-            if is_valid(filename):
-                obj_attr = '.'.join(filename_parts(filename)[1:3])
-            else:  # will match name against filename sans extension
-                obj_attr = op.splitext(filename)[0]
-            if dt.name == obj_attr:
-                dataset_types.append(dt)
-        # Check whether pattern matches filename
-        elif fnmatch(op.basename(filename).lower(), dt.filename_pattern.lower()):
-            dataset_types.append(dt)
-    n = len(dataset_types)
-    if n == 0:
-        raise ValueError("No dataset type found for filename `%s`" % filename)
-    elif n >= 2:
-        raise ValueError("Multiple matching dataset types found for filename `%s`: %s" % (
-            filename, ', '.join(map(str, dataset_types))))
-    return dataset_types[0]
-
-
 def get_data_format(filename):
     file_extension = op.splitext(filename)[-1]
     # This raises an error if there is 0 or 2+ matching data formats.
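The local `get_dataset_type` helper removed above is superseded by the implementation shipped with ONE (`one.registration`), which is why callers now pass the candidate dataset types explicitly and why `ONE-api` is pinned to `>=1.18.0` further down. A minimal sketch of the new call pattern, assuming a Django shell (`./manage.py shell`) with the `data` app loaded; the filename is illustrative only:

```python
# The second argument is now supplied by the caller instead of the helper
# querying DatasetType internally.
from one.registration import get_dataset_type
from data.models import DatasetType

dtypes = DatasetType.objects.all()  # evaluate once, reuse across many filenames
dt = get_dataset_type('spikes.times.npy', dtypes)  # still raises ValueError on 0 or 2+ matches
print(dt.name)
```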
@@ -275,7 +249,7 @@ def _create_dataset_file_records(
     assert session is not None
     revision_name = f'#{revision.name}#' if revision else ''
     relative_path = op.join(rel_dir_path, collection or '', revision_name, filename)
-    dataset_type = get_dataset_type(filename)
+    dataset_type = get_dataset_type(filename, DatasetType.objects.all())
     data_format = get_data_format(filename)
     assert dataset_type
     assert data_format
diff --git a/alyx/data/views.py b/alyx/data/views.py
index e1d98fe4..fd2f412b 100644
--- a/alyx/data/views.py
+++ b/alyx/data/views.py
@@ -2,6 +2,7 @@
 import re
 
 from django.contrib.auth import get_user_model
+from django.db import models
 from rest_framework import generics, viewsets, mixins, serializers
 from rest_framework.response import Response
 import django_filters
@@ -226,6 +227,14 @@ class DatasetDetail(generics.RetrieveUpdateDestroyAPIView):
     serializer_class = DatasetSerializer
     permission_classes = rest_permission_classes()
 
+    def delete(self, request, *args, **kwargs):
+        try:
+            return super().delete(request, *args, **kwargs)
+        except models.ProtectedError as e:
+            # Return Forbidden response with dataset name and list of protected tags associated
+            err_msg, _ = e.args
+            return Response(e.args[0], 403)
+
 
 # FileRecord
 # ------------------------------------------------------------------------------------------------
diff --git a/alyx/misc/management/commands/one_cache.py b/alyx/misc/management/commands/one_cache.py
index c01351bc..65da2197 100644
--- a/alyx/misc/management/commands/one_cache.py
+++ b/alyx/misc/management/commands/one_cache.py
@@ -381,9 +381,9 @@ def generate_datasets_frame(int_id=True, tags=None) -> pd.DataFrame:
     if tags:
         kw = {'tags__name__in' if not isinstance(tags, str) else 'tags__name': tags}
         ds = ds.prefetch_related('tag').filter(**kw)
-    # Filter out datasets that do not exist on either repository
+    # Filter out datasets that do not exist on either repository or have no associated session
     ds = ds.annotate(exists_flatiron=Exists(on_flatiron), exists_aws=Exists(on_aws))
-    ds = ds.filter(Q(exists_flatiron=True) | Q(exists_aws=True))
+    ds = ds.filter(Q(exists_flatiron=True) | Q(exists_aws=True), session__isnull=False)
 
     # fields to keep from Dataset table
     fields = (
diff --git a/requirements.txt b/requirements.txt
index af32feae..76243b17 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,4 +28,4 @@ python-magic
 pytz
 structlog>=21.5.0
 webdavclient3
-ONE-api>=1.13.0
+ONE-api>=1.18.0
diff --git a/requirements_frozen.txt b/requirements_frozen.txt
index aceb9e55..5c30cc69 100644
--- a/requirements_frozen.txt
+++ b/requirements_frozen.txt
@@ -1,20 +1,20 @@
-asgiref==3.5.2
+asgiref==3.6.0
 backports.zoneinfo==0.2.1
-boto3==1.26.24
-botocore==1.29.24
-certifi==2022.9.24
+boto3==1.26.51
+botocore==1.29.51
+certifi==2022.12.7
 cffi==1.15.1
-charset-normalizer==2.1.1
+charset-normalizer==3.0.1
 click==8.1.3
 colorlog==6.7.0
-contourpy==1.0.6
+contourpy==1.0.7
 coreapi==2.3.3
 coreschema==0.0.4
 coverage==6.5.0
 coveralls==3.3.1
 cryptography==36.0.2
 cycler==0.11.0
-Django==4.1.4
+Django==4.1.5
 django-admin-list-filter-dropdown==1.0.3
 django-admin-rangefilter==0.9.0
 django-autocomplete-light==3.9.4
@@ -25,7 +25,7 @@ django-js-asset==2.0.0
 django-mptt==0.14.0
 django-polymorphic==3.1.0
 django-reversion==5.0.4
-django-storages==1.13.1
+django-storages==1.13.2
 django-structlog==4.0.1
 django-test-without-migrations==0.6
 djangorestframework==3.14.0
@@ -38,23 +38,23 @@ globus-cli==3.10.1
 globus-sdk==3.15.0
 iblutil==1.4.0
 idna==3.4
-importlib-metadata==5.1.0
+importlib-metadata==6.0.0
 itypes==1.2.0
 Jinja2==3.1.2
 jmespath==1.0.1
 kiwisolver==1.4.4
 llvmlite==0.39.1
-lxml==4.9.1
+lxml==4.9.2
 Markdown==3.4.1
-MarkupSafe==2.1.1
-matplotlib==3.6.2
+MarkupSafe==2.1.2
+matplotlib==3.6.3
 mccabe==0.7.0
 numba==0.56.4
 numpy==1.23.5
-ONE-api==1.16.2
-packaging==21.3
+ONE-api==1.18.0
+packaging==23.0
 pandas==1.5.2
-Pillow==9.3.0
+Pillow==9.4.0
 psycopg2-binary==2.9.5
 pyarrow==10.0.1
 pycodestyle==2.10.0
@@ -64,9 +64,9 @@ PyJWT==2.6.0
 pyparsing==3.0.9
 python-dateutil==2.8.2
 python-magic==0.4.27
-pytz==2022.6
+pytz==2022.7.1
 PyYAML==6.0
-requests==2.28.1
+requests==2.28.2
 s3transfer==0.6.0
 six==1.16.0
 sqlparse==0.4.3
@@ -74,6 +74,6 @@ structlog==22.3.0
 tqdm==4.64.1
 typing_extensions==4.4.0
 uritemplate==4.1.1
-urllib3==1.26.13
+urllib3==1.26.14
 webdavclient3==3.14.6
 zipp==3.11.0
diff --git a/scripts/oneoff/2019-08-30-patch_register.py b/scripts/oneoff/2019-08-30-patch_register.py
index 729972a7..f39059a5 100644
--- a/scripts/oneoff/2019-08-30-patch_register.py
+++ b/scripts/oneoff/2019-08-30-patch_register.py
@@ -13,14 +13,14 @@ fr_server = FileRecord.objects.filter(dataset__session__in=sessions,
                                       data_repository__globus_is_personal=False)
-assert(fr_local.count() == fr_server.count())
+assert fr_local.count() == fr_server.count()
 fr_server.update(exists=False)
 
 files_repos_save = fr_local.values_list('id', 'data_repository')
 fr_local.update()
 
 repo = fr_local.values_list('data_repository', flat=True).distinct()
-assert(repo.count() == 1)
+assert repo.count() == 1
 repo = DataRepository.objects.get(id=repo[0])
 
 globus_id_save = repo.globus_endpoint_id
diff --git a/scripts/sync_ucl/prune_cortexlab.py b/scripts/sync_ucl/prune_cortexlab.py
index d2317483..8d4e346f 100755
--- a/scripts/sync_ucl/prune_cortexlab.py
+++ b/scripts/sync_ucl/prune_cortexlab.py
@@ -2,6 +2,8 @@
 import numpy as np
 
 from django.core.management import call_command
+from django.db.models import CharField
+from django.db.models.functions import Concat
 
 from subjects.models import Subject, Project, SubjectRequest
 from actions.models import Session, Surgery, NotificationRule, Notification
@@ -131,7 +133,8 @@ In this case we remove the offending datasets from IBL: the UCL version always has priority
 (at some point using pandas might be much easier and legible)
 """
-# Here we are looking for duplicated that DO NOT have the same primary key, but the same session, collection, name and revision
+# Here we are looking for duplicates that DO NOT have the same primary key, but the same session,
+# collection, name and revision.
 # Find the datasets that only exist in the IBL database, load session, collection and name
 pk2check = ids_pk.difference(cds_pk)
 ibl_datasets = Dataset.objects.filter(pk__in=pk2check)
@@ -159,14 +162,24 @@ set_cortex_lab_only = cds_pk.difference(ids_pk)
 set_ibl_only = ids_pk.difference(cds_pk)
 # get the interection querysets
-cqs = Dataset.objects.using('cortexlab').exclude(pk__in=set_cortex_lab_only).order_by('pk').values_list(*dfields)
-iqs = Dataset.objects.filter(session__lab__name='cortexlab').exclude(pk__in=set_ibl_only).order_by('pk').values_list(*dfields)
+cqs = (Dataset
+       .objects
+       .using('cortexlab')
+       .exclude(pk__in=set_cortex_lab_only)
+       .order_by('pk')
+       .values_list(*dfields))
+iqs = (Dataset
+       .objects
+       .filter(session__lab__name='cortexlab')
+       .exclude(pk__in=set_ibl_only)
+       .order_by('pk')
+       .values_list(*dfields))
 # manual check but this is expensive
 # assert len(set(iqs).difference(set(cqs))) == len(set(cqs).difference(set(iqs)))
-# this is the set of pks for which there is a md5 mismatch - for all the others, do not import anything by deleting
-# many datasets from the cortexlab database
+# this is the set of pks for which there is a md5 mismatch - for all the others, do not import
+# anything by deleting many datasets from the cortexlab database
 dpk = [s[0] for s in set(cds).difference(set(ids))]
 Dataset.objects.using('cortexlab').exclude(pk__in=set_cortex_lab_only.union(dpk)).delete()
@@ -177,25 +190,25 @@ ti = np.array(iqs_md5.values_list('auto_datetime', flat=True)).astype(np.datetime64)
 tc = np.array(cqs_md5.values_list('auto_datetime', flat=True)).astype(np.datetime64)
-# those are the indices where the autodatetiem from IBL is posterior to cortexlab - do not import by deleting the datasets
-# from the cortexlab database
+# those are the indices where the autodatetime from IBL is posterior to cortexlab - do not import
+# by deleting the datasets from the cortexlab database
 ind_ibl = np.where(ti >= tc)[0]
 pk2remove = list(np.array(iqs_md5.values_list('pk', flat=True))[ind_ibl])
 Dataset.objects.using('cortexlab').filter(pk__in=pk2remove).delete()
 
-# for those that will imported from UCL, set the filerecord status to exist=False fr the local server fierecords
+# for those that will be imported from UCL, set the file record status to exists=False for the
+# local server file records
 ind_ucl = np.where(tc > ti)[0]
 pk2import = list(np.array(iqs_md5.values_list('pk', flat=True))[ind_ucl])
 FileRecord.objects.filter(dataset__in=pk2import).update(exists=False, json=None)
 
 """
-Sync the tasks 1/2: For DLC tasks there might be duplicates, as we sometimes run them as batch on remote servers.
+Sync the tasks 1/2: For DLC tasks there might be duplicates, as we sometimes run them as batch on
+remote servers.
 For those import the cortexlab tasks unless there is a NEWER version in the ibl database
 """
 task_names_to_check = ['TrainingDLC', 'EphysDLC']
 dfields = ('session_id', 'name', 'arguments')
-from django.db.models import CharField, Value
-from django.db.models.functions import Concat
 
 # remove duplicates from cortexlab if any
 qs_cortex = Task.objects.using('cortexlab').filter(name__in=task_names_to_check)
 qs_cortex = qs_cortex.distinct(*dfields)
@@ -206,15 +219,18 @@ qs_ibl = Task.objects.filter(session__lab__name='cortexlab').filter(name__in=task_names_to_check)
 qs_ibl = qs_ibl.annotate(eid_name_args=Concat(*dfields, output_field=CharField()))
 qs_cortex = qs_cortex.annotate(eid_name_args=Concat(*dfields, output_field=CharField()))
-eid_name_args = set(qs_cortex.values_list('eid_name_args')).intersection(qs_cortex.values_list('eid_name_args'))
+eid_name_args = (set(qs_cortex.values_list('eid_name_args'))
+                 .intersection(qs_cortex.values_list('eid_name_args')))
 dlc_cortex = qs_cortex.filter(eid_name_args__in=eid_name_args).order_by('eid_name_args')
-dlc_ibl = qs_ibl.filter(name__in=task_names_to_check, eid_name_args__in=eid_name_args).order_by('eid_name_args')
-
+dlc_ibl = (qs_ibl
+           .filter(name__in=task_names_to_check, eid_name_args__in=eid_name_args)
+           .order_by('eid_name_args'))
 times_cortex = np.array(dlc_cortex.values_list('datetime', flat=True)).astype(np.datetime64)
 times_ibl = np.array(dlc_ibl.values_list('datetime', flat=True)).astype(np.datetime64)
 
-# Indices where datetime from IBL is newer than cortexlab -- do not import by deleting the datasets from cortexlab db
+# Indices where datetime from IBL is newer than cortexlab -- do not import by deleting the datasets
+# from cortexlab db
 # Indices where datetime from IBL is older than cortexlab -- delete from ibl db
 keep_ibl = np.where(times_ibl >= times_cortex)[0]
 keep_cortex = np.where(times_ibl < times_cortex)[0]
@@ -224,21 +240,34 @@ Task.objects.filter(pk__in=pk_del_ibl, name__in=task_names_to_check).delete()
 
 """
-Sync the tasks 2/2: For all other tasks, make sure there are no duplicate tasks with different ids that have been made
-on IBL and cortex lab database. In the case of duplicates cortex lab database are kept and IBL deleted
+Sync the tasks 2/2: For all other tasks, make sure there are no duplicate tasks with different ids
+that have been made on IBL and cortex lab database. In the case of duplicates, the cortex lab
+database tasks are kept and the IBL ones deleted.
 """
 task_names_to_exclude = ['TrainingDLC', 'EphysDLC']
-cortex_eids = Task.objects.using('cortexlab').exclude(name__in=task_names_to_exclude).values_list('session', flat=True)
+cortex_eids = (Task
+               .objects
+               .using('cortexlab')
+               .exclude(name__in=task_names_to_exclude)
+               .values_list('session', flat=True))
 ibl_eids = Task.objects.all().filter(session__lab__name='cortexlab').exclude(
     name__in=task_names_to_exclude).values_list('session', flat=True)
 
 # finds eids that have tasks on both ibl and cortex lab database
 overlap_eids = set(cortex_eids).intersection(ibl_eids)
 
 dfields = ('id', 'name', 'session')
-task_cortex = Task.objects.using('cortexlab').filter(session__in=overlap_eids).exclude(name__in=task_names_to_exclude)
+task_cortex = (Task
+               .objects
+               .using('cortexlab')
+               .filter(session__in=overlap_eids)
+               .exclude(name__in=task_names_to_exclude))
 cids = task_cortex.values_list(*dfields)
-task_ibl = Task.objects.all().filter(session__in=overlap_eids).exclude(name__in=task_names_to_exclude)
+task_ibl = (Task
+            .objects
+            .all()
+            .filter(session__in=overlap_eids)
+            .exclude(name__in=task_names_to_exclude))
 ids = task_ibl.values_list(*dfields)
 
 # find the tasks that are not common to both
@@ -253,12 +282,15 @@ ts.delete()
 
 """
-Sync the notes. When a note is updated (in the behaviour criteria tracking) it is deleted and created anew.
-The problem is this will create many duplicates on the IBL side after import.
+Sync the notes. When a note is updated (in the behaviour criteria tracking) it is deleted and
+created anew. The problem is this will create many duplicates on the IBL side after import.
 Here we look for all of the notes that are present on IBL and remove those that are not in UCL
 """
 ibl_notes = Note.objects.filter(object_id__in=Subject.objects.filter(lab=CORTEX_LAB_PK))
-ucl_notes = Note.objects.using('cortexlab').filter(object_id__in=Subject.objects.filter(lab=CORTEX_LAB_PK))
+ucl_notes = (Note
+             .objects
+             .using('cortexlab')
+             .filter(object_id__in=Subject.objects.filter(lab=CORTEX_LAB_PK)))
 ibl_notes.exclude(pk__in=list(ucl_notes.values_list('pk', flat=True))).count()
 """