Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into jaclyn-taroni/460-chan…
Browse files Browse the repository at this point in the history
…gelog-readme
  • Loading branch information
jaclyn-taroni committed Nov 9, 2023
2 parents 7f648dd + d49b2e5 commit 15171c5
Show file tree
Hide file tree
Showing 10 changed files with 167 additions and 33 deletions.
13 changes: 13 additions & 0 deletions api/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,12 @@ If you would like to purge a project and remove its files from the S3 bucket, yo
sportal manage-api purge_project --scpca-id SCPCP000001 --delete-from-s3
```

The `--cleanup-input-data` flag can help you control the project's input data size. If the flag is set, the
input data cleanup process will be run for each project right after its processing is over.
```
sportal load-data --cleanup-input-data --reload-all --update-s3
```

The `--cleanup-output-data` flag can help you control the project's output data size. If the flag is set, the
output (no longer needed) data cleanup process will be run for each project right after its processing is over.
```
Expand All @@ -160,6 +166,13 @@ This is to help prevent the S3 bucket data from accidentally becoming out of syn
To run a command in production, there is a run_command.sh script that is created on the API instance.
It passes any arguments through to the `manage.py`, so `./run_command.sh load_data --reload-all` will work nicely.

The following code can be used to process projects one by one with a minimal disk space footprint:
```
for i in $(seq -f "%02g" 1 20); do
./run_command.sh load_data --cleanup-input-data --cleanup-output-data --reload-existing --scpca-project-id SCPCP0000$i
done
```

The `purge_project` command can be run in a similar fashion: `./run_command.sh purge_project --scpca-id SCPCP000001`

## Cloud Deployments
Expand Down
14 changes: 14 additions & 0 deletions api/scpca_portal/management/commands/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ class Command(BaseCommand):
to a stack-specific S3 bucket."""

def add_arguments(self, parser):
parser.add_argument(
"--cleanup-input-data", action=BooleanOptionalAction, default=settings.PRODUCTION
)
parser.add_argument(
"--cleanup-output-data", action=BooleanOptionalAction, default=settings.PRODUCTION
)
Expand All @@ -75,6 +78,7 @@ def add_arguments(self, parser):

def handle(self, *args, **options):
load_data_from_s3(
cleanup_input_data=options["cleanup_input_data"],
cleanup_output_data=options["cleanup_output_data"],
reload_all=options["reload_all"],
reload_existing=options["reload_existing"],
Expand All @@ -99,6 +103,7 @@ def cleanup_output_data_dir():

def load_data_from_s3(
allowed_submitters: set = ALLOWED_SUBMITTERS,
cleanup_input_data: bool = False,
cleanup_output_data: bool = False,
input_bucket_name: str = "scpca-portal-inputs",
reload_all: bool = False,
Expand Down Expand Up @@ -195,6 +200,11 @@ def load_data_from_s3(
project.save()

project.add_contacts(project_data["contact_email"], project_data["contact_name"])
project.add_external_accessions(
project_data["external_accession"],
project_data["external_accession_url"],
project_data["external_accession_raw"],
)
project.add_publications(project_data["citation"], project_data["citation_doi"])

if project.scpca_id not in os.listdir(common.INPUT_DATA_DIR):
Expand All @@ -216,6 +226,10 @@ def load_data_from_s3(
computed_file.s3_key,
)

if cleanup_input_data:
logger.info(f"Cleaning up '{project}' input data")
shutil.rmtree(os.path.join(common.INPUT_DATA_DIR, project.scpca_id), ignore_errors=True)

if cleanup_output_data:
logger.info(f"Cleaning up '{project}' output data")
for computed_file in computed_files:
Expand Down
31 changes: 31 additions & 0 deletions api/scpca_portal/migrations/0030_auto_20231030_2259.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Generated by Django 3.2.22 on 2023-10-30 22:59

from django.db import migrations, models


class Migration(migrations.Migration):
    """Adds the ExternalAccession model and links it to Project via a many-to-many field."""

    dependencies = [
        ("scpca_portal", "0029_auto_20221217_0256"),
    ]

    operations = [
        # Table backing scpca_portal.models.ExternalAccession.
        migrations.CreateModel(
            name="ExternalAccession",
            fields=[
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
                # The accession string itself is the primary key — no auto "id" column.
                ("accession", models.TextField(primary_key=True, serialize=False)),
                ("has_raw", models.BooleanField(default=False)),
                ("url", models.TextField()),
            ],
            options={
                "db_table": "external_accessions",
            },
        ),
        # Join table: a project can reference many accessions, and an accession
        # can belong to many projects.
        migrations.AddField(
            model_name="project",
            name="external_accessions",
            field=models.ManyToManyField(to="scpca_portal.ExternalAccession"),
        ),
    ]
1 change: 1 addition & 0 deletions api/scpca_portal/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from scpca_portal.models.api_token import APIToken
from scpca_portal.models.computed_file import ComputedFile
from scpca_portal.models.contact import Contact
from scpca_portal.models.external_accession import ExternalAccession
from scpca_portal.models.project import Project
from scpca_portal.models.project_summary import ProjectSummary
from scpca_portal.models.publication import Publication
Expand Down
32 changes: 12 additions & 20 deletions api/scpca_portal/models/computed_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
from scpca_portal.config.logging import get_and_configure_logger
from scpca_portal.models.base import TimestampedModel

DATE_FORMAT = "%Y-%M-%d"

logger = get_and_configure_logger(__name__)
s3 = boto3.client("s3", config=Config(signature_version="s3v4"))

Expand Down Expand Up @@ -80,12 +78,6 @@ class OutputFileTypes:
def __str__(self):
return f"Computed file for '{self.project or self.sample}'"

@staticmethod
def get_readme_contents(readme_path: str) -> str:
with open(readme_path, "r") as readme_file:
date = utils.get_today_string()
return f"Generated on: {date}\n\n{readme_file.read()}"

@classmethod
def get_project_multiplexed_file(cls, project, sample_to_file_mapping, workflow_versions):
"""Prepares a ready for saving single data file of project's combined multiplexed data."""
Expand All @@ -99,9 +91,9 @@ def get_project_multiplexed_file(cls, project, sample_to_file_mapping, workflow_
)

with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.writestr(
zip_file.write(
ComputedFile.README_MULTIPLEXED_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
ComputedFile.get_readme_contents(ComputedFile.README_MULTIPLEXED_FILE_PATH),
)
zip_file.write(
project.output_multiplexed_metadata_file_path, computed_file.metadata_file_name
Expand Down Expand Up @@ -134,9 +126,9 @@ def get_project_single_cell_file(cls, project, sample_to_file_mapping, workflow_
)

with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.writestr(
zip_file.write(
ComputedFile.README_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
ComputedFile.get_readme_contents(ComputedFile.README_FILE_PATH),
)
zip_file.write(
project.output_single_cell_metadata_file_path, computed_file.metadata_file_name
Expand Down Expand Up @@ -169,9 +161,9 @@ def get_project_spatial_file(cls, project, sample_to_file_mapping, workflow_vers
)

with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.writestr(
zip_file.write(
ComputedFile.README_SPATIAL_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
ComputedFile.get_readme_contents(ComputedFile.README_SPATIAL_FILE_PATH),
)
zip_file.write(
project.output_spatial_metadata_file_path, computed_file.metadata_file_name
Expand Down Expand Up @@ -213,9 +205,9 @@ def get_sample_multiplexed_file(

if not os.path.exists(computed_file.zip_file_path):
with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.writestr(
zip_file.write(
ComputedFile.README_MULTIPLEXED_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
ComputedFile.get_readme_contents(ComputedFile.README_MULTIPLEXED_FILE_PATH),
)
zip_file.write(
sample.output_multiplexed_metadata_file_path,
Expand Down Expand Up @@ -244,9 +236,9 @@ def get_sample_single_cell_file(cls, sample, libraries, workflow_versions):

file_paths = []
with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.writestr(
zip_file.write(
ComputedFile.README_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
ComputedFile.get_readme_contents(ComputedFile.README_FILE_PATH),
)
zip_file.write(
sample.output_single_cell_metadata_file_path,
Expand Down Expand Up @@ -287,9 +279,9 @@ def get_sample_spatial_file(cls, sample, libraries, workflow_versions):

file_paths = []
with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.writestr(
zip_file.write(
ComputedFile.README_SPATIAL_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
ComputedFile.get_readme_contents(ComputedFile.README_SPATIAL_FILE_PATH),
)
zip_file.write(
sample.output_spatial_metadata_file_path,
Expand Down
17 changes: 17 additions & 0 deletions api/scpca_portal/models/external_accession.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from django.db import models

from scpca_portal.models.base import TimestampedModel


class ExternalAccession(TimestampedModel):
    """External accession associated with a project (see Project.external_accessions)."""

    class Meta:
        db_table = "external_accessions"

    # The accession identifier; doubles as the primary key (no auto "id" column).
    accession = models.TextField(primary_key=True)
    # Whether raw data is available for this accession.
    has_raw = models.BooleanField(default=False)
    # URL where the accession can be accessed.
    url = models.TextField()

    def __str__(self) -> str:
        return self.accession
56 changes: 46 additions & 10 deletions api/scpca_portal/models/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@

from django.db import models

from scpca_portal import common
from scpca_portal import common, utils
from scpca_portal.models.base import TimestampedModel
from scpca_portal.models.computed_file import ComputedFile
from scpca_portal.models.contact import Contact
from scpca_portal.models.external_accession import ExternalAccession
from scpca_portal.models.project_summary import ProjectSummary
from scpca_portal.models.publication import Publication
from scpca_portal.models.sample import Sample
Expand Down Expand Up @@ -48,6 +49,7 @@ class Meta:
unavailable_samples_count = models.PositiveIntegerField(default=0)

contacts = models.ManyToManyField(Contact)
external_accessions = models.ManyToManyField(ExternalAccession)
publications = models.ManyToManyField(Publication)

def __str__(self):
Expand Down Expand Up @@ -257,10 +259,10 @@ def combine_multiplexed_metadata(

return combined_metadata, multiplexed_sample_mapping

def add_contacts(self, contact_emails, contact_names):
def add_contacts(self, contact_email, contact_name):
"""Creates and adds project contacts."""
emails = contact_emails.split(common.CSV_MULTI_VALUE_DELIMITER)
names = contact_names.split(common.CSV_MULTI_VALUE_DELIMITER)
emails = contact_email.split(common.CSV_MULTI_VALUE_DELIMITER)
names = contact_name.split(common.CSV_MULTI_VALUE_DELIMITER)

if len(emails) != len(names):
logger.error("Unable to add ambiguous contacts.")
Expand All @@ -277,10 +279,32 @@ def add_contacts(self, contact_emails, contact_names):

self.contacts.add(contact)

def add_publications(self, citations, citation_dois):
def add_external_accessions(
    self, external_accession, external_accession_url, external_accession_raw
):
    """Creates and adds project external accessions.

    Each argument is a multi-value string delimited by
    common.CSV_MULTI_VALUE_DELIMITER; the three resulting lists must be the
    same length, pairing accession[i] with url[i] and raw-availability[i].
    Logs an error and adds nothing when the lengths are inconsistent.
    """
    accessions = external_accession.split(common.CSV_MULTI_VALUE_DELIMITER)
    urls = external_accession_url.split(common.CSV_MULTI_VALUE_DELIMITER)
    accessions_raw = external_accession_raw.split(common.CSV_MULTI_VALUE_DELIMITER)

    if not (len(accessions) == len(urls) == len(accessions_raw)):
        logger.error("Unable to add ambiguous external accessions.")
        return

    # zip the parallel lists instead of indexing, and avoid reusing the
    # parameter name `external_accession` as the loop's record variable
    # (the original shadowed it on the first iteration).
    for accession, url, raw in zip(accessions, urls, accessions_raw):
        # update_or_create persists the latest url/has_raw in a single call;
        # the original get_or_create + save() issued two writes per record.
        accession_object, _ = ExternalAccession.objects.update_or_create(
            accession=accession.strip(),
            defaults={
                "url": url.strip(),
                "has_raw": utils.boolean_from_string(raw.strip()),
            },
        )
        self.external_accessions.add(accession_object)

def add_publications(self, citation, citation_doi):
"""Creates and adds project publications."""
citations = citations.split(common.CSV_MULTI_VALUE_DELIMITER)
dois = citation_dois.split(common.CSV_MULTI_VALUE_DELIMITER)
citations = citation.split(common.CSV_MULTI_VALUE_DELIMITER)
dois = citation_doi.split(common.CSV_MULTI_VALUE_DELIMITER)

if len(citations) != len(dois):
logger.error("Unable to add ambiguous publications.")
Expand Down Expand Up @@ -451,7 +475,11 @@ def create_single_cell_readme_file(self):
readme_template = readme_template_file.read()
with open(ComputedFile.README_FILE_PATH, "w") as readme_file:
readme_file.write(
readme_template.format(project_accession=self.scpca_id, project_url=self.url)
readme_template.format(
project_accession=self.scpca_id,
project_url=self.url,
date=utils.get_today_string(),
)
)

def create_multiplexed_readme_file(self):
Expand All @@ -460,7 +488,11 @@ def create_multiplexed_readme_file(self):
readme_template = readme_template_file.read()
with open(ComputedFile.README_MULTIPLEXED_FILE_PATH, "w") as readme_file:
readme_file.write(
readme_template.format(project_accession=self.scpca_id, project_url=self.url)
readme_template.format(
project_accession=self.scpca_id,
project_url=self.url,
date=utils.get_today_string(),
)
)

def create_spatial_readme_file(self):
Expand All @@ -469,7 +501,11 @@ def create_spatial_readme_file(self):
readme_template = readme_template_file.read()
with open(ComputedFile.README_SPATIAL_FILE_PATH, "w") as readme_file:
readme_file.write(
readme_template.format(project_accession=self.scpca_id, project_url=self.url)
readme_template.format(
project_accession=self.scpca_id,
project_url=self.url,
date=utils.get_today_string(),
)
)

def get_bulk_rna_seq_sample_ids(self):
Expand Down
22 changes: 21 additions & 1 deletion api/scpca_portal/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,15 @@

from rest_framework import serializers

from scpca_portal.models import ComputedFile, Contact, Project, ProjectSummary, Publication, Sample
from scpca_portal.models import (
ComputedFile,
Contact,
ExternalAccession,
Project,
ProjectSummary,
Publication,
Sample,
)


class ComputedFileSerializer(serializers.ModelSerializer):
Expand Down Expand Up @@ -43,6 +51,16 @@ class Meta:
)


class ExternalAccessionSerializer(serializers.ModelSerializer):
    """Serializes ExternalAccession records (accession, has_raw, url) for the API."""

    class Meta:
        model = ExternalAccession
        fields = (
            "accession",
            "has_raw",
            "url",
        )


class PublicationSerializer(serializers.ModelSerializer):
class Meta:
model = Publication
Expand Down Expand Up @@ -78,6 +96,7 @@ class Meta:
"diagnoses",
"disease_timings",
"downloadable_sample_count",
"external_accessions",
"has_bulk_rna_seq",
"has_cite_seq_data",
"has_multiplexed_data",
Expand All @@ -103,6 +122,7 @@ class Meta:
# but we want these to always be included.
computed_files = ComputedFileSerializer(read_only=True, many=True)
contacts = ContactSerializer(read_only=True, many=True)
external_accessions = ExternalAccessionSerializer(read_only=True, many=True)
publications = PublicationSerializer(read_only=True, many=True)
samples = serializers.SlugRelatedField(many=True, read_only=True, slug_field="scpca_id")
summaries = ProjectSummarySerializer(many=True, read_only=True)
Expand Down
Loading

0 comments on commit 15171c5

Please sign in to comment.