Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into jaclyn-taroni/460-chan…
Browse files Browse the repository at this point in the history
…gelog-readme
  • Loading branch information
jaclyn-taroni committed Nov 9, 2023
2 parents 7f648dd + d49b2e5 commit 15171c5
Show file tree
Hide file tree
Showing 10 changed files with 167 additions and 33 deletions.
13 changes: 13 additions & 0 deletions api/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,12 @@ If you would like to purge a project and remove its files from the S3 bucket, yo
sportal manage-api purge_project --scpca-id SCPCP000001 --delete-from-s3
```

The `--cleanup-input-data` flag can help you control the project's input data size. If the flag is set, the
input data cleanup process will be run for each project right after its processing is over.
```
sportal load-data --cleanup-input-data --reload-all --update-s3
```

The `--cleanup-output-data` flag can help you control the project's output data size. If the flag is set, the
output (no longer needed) data cleanup process will be run for each project right after its processing is over.
```
Expand All @@ -160,6 +166,13 @@ This is to help prevent the S3 bucket data from accidentally becoming out of syn
To run a command in production, there is a run_command.sh script that is created on the API instance.
It passes any arguments through to the `manage.py`, so `./run_command.sh load_data --reload-all` will work nicely.

The following code can be used to process projects one by one with a minimal disk space footprint:
```
for i in $(seq -f "%02g" 1 20); do
./run_command.sh load_data --cleanup-input-data --cleanup-output-data --reload-existing --scpca-project-id SCPCP0000$i
done
```

The `purge_project` command can be run in a similar fashion: `./run_command.sh purge_project --scpca-id SCPCP000001`

## Cloud Deployments
Expand Down
14 changes: 14 additions & 0 deletions api/scpca_portal/management/commands/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ class Command(BaseCommand):
to a stack-specific S3 bucket."""

def add_arguments(self, parser):
parser.add_argument(
"--cleanup-input-data", action=BooleanOptionalAction, default=settings.PRODUCTION
)
parser.add_argument(
"--cleanup-output-data", action=BooleanOptionalAction, default=settings.PRODUCTION
)
Expand All @@ -75,6 +78,7 @@ def add_arguments(self, parser):

def handle(self, *args, **options):
load_data_from_s3(
cleanup_input_data=options["cleanup_input_data"],
cleanup_output_data=options["cleanup_output_data"],
reload_all=options["reload_all"],
reload_existing=options["reload_existing"],
Expand All @@ -99,6 +103,7 @@ def cleanup_output_data_dir():

def load_data_from_s3(
allowed_submitters: set = ALLOWED_SUBMITTERS,
cleanup_input_data: bool = False,
cleanup_output_data: bool = False,
input_bucket_name: str = "scpca-portal-inputs",
reload_all: bool = False,
Expand Down Expand Up @@ -195,6 +200,11 @@ def load_data_from_s3(
project.save()

project.add_contacts(project_data["contact_email"], project_data["contact_name"])
project.add_external_accessions(
project_data["external_accession"],
project_data["external_accession_url"],
project_data["external_accession_raw"],
)
project.add_publications(project_data["citation"], project_data["citation_doi"])

if project.scpca_id not in os.listdir(common.INPUT_DATA_DIR):
Expand All @@ -216,6 +226,10 @@ def load_data_from_s3(
computed_file.s3_key,
)

if cleanup_input_data:
logger.info(f"Cleaning up '{project}' input data")
shutil.rmtree(os.path.join(common.INPUT_DATA_DIR, project.scpca_id), ignore_errors=True)

if cleanup_output_data:
logger.info(f"Cleaning up '{project}' output data")
for computed_file in computed_files:
Expand Down
31 changes: 31 additions & 0 deletions api/scpca_portal/migrations/0030_auto_20231030_2259.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Generated by Django 3.2.22 on 2023-10-30 22:59

from django.db import migrations, models


class Migration(migrations.Migration):
    """Adds the ExternalAccession model and links it to Project via a many-to-many field."""

    dependencies = [
        ("scpca_portal", "0029_auto_20221217_0256"),
    ]

    operations = [
        # Table backing scpca_portal.models.ExternalAccession.
        migrations.CreateModel(
            name="ExternalAccession",
            fields=[
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
                # The accession string itself is the primary key — no auto "id" column.
                ("accession", models.TextField(primary_key=True, serialize=False)),
                ("has_raw", models.BooleanField(default=False)),
                ("url", models.TextField()),
            ],
            options={
                "db_table": "external_accessions",
            },
        ),
        # Join table: a project can reference many accessions, and an accession
        # can belong to many projects.
        migrations.AddField(
            model_name="project",
            name="external_accessions",
            field=models.ManyToManyField(to="scpca_portal.ExternalAccession"),
        ),
    ]
1 change: 1 addition & 0 deletions api/scpca_portal/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from scpca_portal.models.api_token import APIToken
from scpca_portal.models.computed_file import ComputedFile
from scpca_portal.models.contact import Contact
from scpca_portal.models.external_accession import ExternalAccession
from scpca_portal.models.project import Project
from scpca_portal.models.project_summary import ProjectSummary
from scpca_portal.models.publication import Publication
Expand Down
32 changes: 12 additions & 20 deletions api/scpca_portal/models/computed_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
from scpca_portal.config.logging import get_and_configure_logger
from scpca_portal.models.base import TimestampedModel

DATE_FORMAT = "%Y-%M-%d"

logger = get_and_configure_logger(__name__)
s3 = boto3.client("s3", config=Config(signature_version="s3v4"))

Expand Down Expand Up @@ -80,12 +78,6 @@ class OutputFileTypes:
def __str__(self):
return f"Computed file for '{self.project or self.sample}'"

@staticmethod
def get_readme_contents(readme_path: str) -> str:
with open(readme_path, "r") as readme_file:
date = utils.get_today_string()
return f"Generated on: {date}\n\n{readme_file.read()}"

@classmethod
def get_project_multiplexed_file(cls, project, sample_to_file_mapping, workflow_versions):
"""Prepares a ready for saving single data file of project's combined multiplexed data."""
Expand All @@ -99,9 +91,9 @@ def get_project_multiplexed_file(cls, project, sample_to_file_mapping, workflow_
)

with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.writestr(
zip_file.write(
ComputedFile.README_MULTIPLEXED_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
ComputedFile.get_readme_contents(ComputedFile.README_MULTIPLEXED_FILE_PATH),
)
zip_file.write(
project.output_multiplexed_metadata_file_path, computed_file.metadata_file_name
Expand Down Expand Up @@ -134,9 +126,9 @@ def get_project_single_cell_file(cls, project, sample_to_file_mapping, workflow_
)

with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.writestr(
zip_file.write(
ComputedFile.README_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
ComputedFile.get_readme_contents(ComputedFile.README_FILE_PATH),
)
zip_file.write(
project.output_single_cell_metadata_file_path, computed_file.metadata_file_name
Expand Down Expand Up @@ -169,9 +161,9 @@ def get_project_spatial_file(cls, project, sample_to_file_mapping, workflow_vers
)

with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.writestr(
zip_file.write(
ComputedFile.README_SPATIAL_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
ComputedFile.get_readme_contents(ComputedFile.README_SPATIAL_FILE_PATH),
)
zip_file.write(
project.output_spatial_metadata_file_path, computed_file.metadata_file_name
Expand Down Expand Up @@ -213,9 +205,9 @@ def get_sample_multiplexed_file(

if not os.path.exists(computed_file.zip_file_path):
with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.writestr(
zip_file.write(
ComputedFile.README_MULTIPLEXED_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
ComputedFile.get_readme_contents(ComputedFile.README_MULTIPLEXED_FILE_PATH),
)
zip_file.write(
sample.output_multiplexed_metadata_file_path,
Expand Down Expand Up @@ -244,9 +236,9 @@ def get_sample_single_cell_file(cls, sample, libraries, workflow_versions):

file_paths = []
with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.writestr(
zip_file.write(
ComputedFile.README_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
ComputedFile.get_readme_contents(ComputedFile.README_FILE_PATH),
)
zip_file.write(
sample.output_single_cell_metadata_file_path,
Expand Down Expand Up @@ -287,9 +279,9 @@ def get_sample_spatial_file(cls, sample, libraries, workflow_versions):

file_paths = []
with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.writestr(
zip_file.write(
ComputedFile.README_SPATIAL_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
ComputedFile.get_readme_contents(ComputedFile.README_SPATIAL_FILE_PATH),
)
zip_file.write(
sample.output_spatial_metadata_file_path,
Expand Down
17 changes: 17 additions & 0 deletions api/scpca_portal/models/external_accession.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from django.db import models

from scpca_portal.models.base import TimestampedModel


class ExternalAccession(TimestampedModel):
    """External accession associated with a project (see Project.external_accessions)."""

    class Meta:
        db_table = "external_accessions"

    # The accession identifier; doubles as the primary key (no auto "id" column).
    accession = models.TextField(primary_key=True)
    # Whether raw data is available for this accession.
    has_raw = models.BooleanField(default=False)
    # URL where the accession can be accessed.
    url = models.TextField()

    def __str__(self) -> str:
        return self.accession
56 changes: 46 additions & 10 deletions api/scpca_portal/models/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@

from django.db import models

from scpca_portal import common
from scpca_portal import common, utils
from scpca_portal.models.base import TimestampedModel
from scpca_portal.models.computed_file import ComputedFile
from scpca_portal.models.contact import Contact
from scpca_portal.models.external_accession import ExternalAccession
from scpca_portal.models.project_summary import ProjectSummary
from scpca_portal.models.publication import Publication
from scpca_portal.models.sample import Sample
Expand Down Expand Up @@ -48,6 +49,7 @@ class Meta:
unavailable_samples_count = models.PositiveIntegerField(default=0)

contacts = models.ManyToManyField(Contact)
external_accessions = models.ManyToManyField(ExternalAccession)
publications = models.ManyToManyField(Publication)

def __str__(self):
Expand Down Expand Up @@ -257,10 +259,10 @@ def combine_multiplexed_metadata(

return combined_metadata, multiplexed_sample_mapping

def add_contacts(self, contact_emails, contact_names):
def add_contacts(self, contact_email, contact_name):
"""Creates and adds project contacts."""
emails = contact_emails.split(common.CSV_MULTI_VALUE_DELIMITER)
names = contact_names.split(common.CSV_MULTI_VALUE_DELIMITER)
emails = contact_email.split(common.CSV_MULTI_VALUE_DELIMITER)
names = contact_name.split(common.CSV_MULTI_VALUE_DELIMITER)

if len(emails) != len(names):
logger.error("Unable to add ambiguous contacts.")
Expand All @@ -277,10 +279,32 @@ def add_contacts(self, contact_emails, contact_names):

self.contacts.add(contact)

def add_publications(self, citations, citation_dois):
def add_external_accessions(
    self, external_accession, external_accession_url, external_accession_raw
):
    """Creates and adds project external accessions.

    Each argument is a multi-value string delimited by
    common.CSV_MULTI_VALUE_DELIMITER; the three resulting lists must be the
    same length, pairing accession[i] with url[i] and raw-availability[i].
    Logs an error and adds nothing when the lengths are inconsistent.
    """
    accessions = external_accession.split(common.CSV_MULTI_VALUE_DELIMITER)
    urls = external_accession_url.split(common.CSV_MULTI_VALUE_DELIMITER)
    accessions_raw = external_accession_raw.split(common.CSV_MULTI_VALUE_DELIMITER)

    if not (len(accessions) == len(urls) == len(accessions_raw)):
        logger.error("Unable to add ambiguous external accessions.")
        return

    # zip the parallel lists instead of indexing, and avoid reusing the
    # parameter name `external_accession` as the loop's record variable
    # (the original shadowed it on the first iteration).
    for accession, url, raw in zip(accessions, urls, accessions_raw):
        # update_or_create persists the latest url/has_raw in a single call;
        # the original get_or_create + save() issued two writes per record.
        accession_object, _ = ExternalAccession.objects.update_or_create(
            accession=accession.strip(),
            defaults={
                "url": url.strip(),
                "has_raw": utils.boolean_from_string(raw.strip()),
            },
        )
        self.external_accessions.add(accession_object)

def add_publications(self, citation, citation_doi):
"""Creates and adds project publications."""
citations = citations.split(common.CSV_MULTI_VALUE_DELIMITER)
dois = citation_dois.split(common.CSV_MULTI_VALUE_DELIMITER)
citations = citation.split(common.CSV_MULTI_VALUE_DELIMITER)
dois = citation_doi.split(common.CSV_MULTI_VALUE_DELIMITER)

if len(citations) != len(dois):
logger.error("Unable to add ambiguous publications.")
Expand Down Expand Up @@ -451,7 +475,11 @@ def create_single_cell_readme_file(self):
readme_template = readme_template_file.read()
with open(ComputedFile.README_FILE_PATH, "w") as readme_file:
readme_file.write(
readme_template.format(project_accession=self.scpca_id, project_url=self.url)
readme_template.format(
project_accession=self.scpca_id,
project_url=self.url,
date=utils.get_today_string(),
)
)

def create_multiplexed_readme_file(self):
Expand All @@ -460,7 +488,11 @@ def create_multiplexed_readme_file(self):
readme_template = readme_template_file.read()
with open(ComputedFile.README_MULTIPLEXED_FILE_PATH, "w") as readme_file:
readme_file.write(
readme_template.format(project_accession=self.scpca_id, project_url=self.url)
readme_template.format(
project_accession=self.scpca_id,
project_url=self.url,
date=utils.get_today_string(),
)
)

def create_spatial_readme_file(self):
Expand All @@ -469,7 +501,11 @@ def create_spatial_readme_file(self):
readme_template = readme_template_file.read()
with open(ComputedFile.README_SPATIAL_FILE_PATH, "w") as readme_file:
readme_file.write(
readme_template.format(project_accession=self.scpca_id, project_url=self.url)
readme_template.format(
project_accession=self.scpca_id,
project_url=self.url,
date=utils.get_today_string(),
)
)

def get_bulk_rna_seq_sample_ids(self):
Expand Down
22 changes: 21 additions & 1 deletion api/scpca_portal/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,15 @@

from rest_framework import serializers

from scpca_portal.models import ComputedFile, Contact, Project, ProjectSummary, Publication, Sample
from scpca_portal.models import (
ComputedFile,
Contact,
ExternalAccession,
Project,
ProjectSummary,
Publication,
Sample,
)


class ComputedFileSerializer(serializers.ModelSerializer):
Expand Down Expand Up @@ -43,6 +51,16 @@ class Meta:
)


class ExternalAccessionSerializer(serializers.ModelSerializer):
    """Serializes ExternalAccession records (accession, has_raw, url) for the API."""

    class Meta:
        model = ExternalAccession
        fields = (
            "accession",
            "has_raw",
            "url",
        )


class PublicationSerializer(serializers.ModelSerializer):
class Meta:
model = Publication
Expand Down Expand Up @@ -78,6 +96,7 @@ class Meta:
"diagnoses",
"disease_timings",
"downloadable_sample_count",
"external_accessions",
"has_bulk_rna_seq",
"has_cite_seq_data",
"has_multiplexed_data",
Expand All @@ -103,6 +122,7 @@ class Meta:
# but we want these to always be included.
computed_files = ComputedFileSerializer(read_only=True, many=True)
contacts = ContactSerializer(read_only=True, many=True)
external_accessions = ExternalAccessionSerializer(read_only=True, many=True)
publications = PublicationSerializer(read_only=True, many=True)
samples = serializers.SlugRelatedField(many=True, read_only=True, slug_field="scpca_id")
summaries = ProjectSummarySerializer(many=True, read_only=True)
Expand Down
Loading

0 comments on commit 15171c5

Please sign in to comment.