debugging aws s3 sync command
avrohomgottlieb committed Dec 6, 2024
1 parent 3da9343, commit fca811a
Showing 4 changed files with 41 additions and 7 deletions.
api/scpca_portal/config/logging.py (2 changes: 1 addition & 1 deletion)
@@ -21,7 +21,7 @@ def get_thread_id() -> str:
     "%(asctime)s {0} %(name)s %(color)s%(levelname)s%(extras)s" ": %(message)s%(color_stop)s"
 ).format(get_thread_id())
 LOG_LEVEL = None
-LOG_RUNTIMES = os.getenv("LOG_RUNTIMES", False)
+LOG_RUNTIMES = os.getenv("LOG_RUNTIMES", True)


 def unconfigure_root_logger():
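Note on this change: os.getenv returns the raw environment string whenever LOG_RUNTIMES is set, so any non-empty value (even "False") is truthy; the flipped default only governs the case where the variable is absent. The configure_runtime_logging factory that the other files in this commit import is not shown in this diff; what follows is a minimal sketch of the pattern, assuming it returns a decorator that logs wall-clock runtimes through the supplied logger:

    import functools
    import logging
    import os
    import time

    # Hypothetical reconstruction; the real implementation lives in
    # api/scpca_portal/config/logging.py, outside this diff's context.
    def configure_runtime_logging(logger: logging.Logger):
        """Return a decorator that logs each wrapped call's wall-clock runtime."""
        def log_runtime(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                if not os.getenv("LOG_RUNTIMES", True):  # mirrors the default toggled above
                    return func(*args, **kwargs)
                start = time.perf_counter()
                try:
                    return func(*args, **kwargs)
                finally:
                    elapsed = time.perf_counter() - start
                    logger.info("%s finished in %.2fs", func.__name__, elapsed)
            return wrapper
        return log_runtime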
api/scpca_portal/loader.py (5 changes: 4 additions & 1 deletion)
@@ -8,7 +8,7 @@
 from django.template.defaultfilters import pluralize

 from scpca_portal import metadata_file, s3
-from scpca_portal.config.logging import get_and_configure_logger
+from scpca_portal.config.logging import configure_runtime_logging, get_and_configure_logger
 from scpca_portal.models import (
     ComputedFile,
     Contact,
@@ -19,6 +19,7 @@
 )

 logger = get_and_configure_logger(__name__)
+log_runtime = configure_runtime_logging(logger)


 def prep_data_dirs(wipe_input_dir: bool = False, wipe_output_dir: bool = True) -> None:
@@ -142,6 +143,7 @@ def create_project(
     return project


+@log_runtime
 def _create_computed_file(
     computed_file: ComputedFile, update_s3: bool, clean_up_output_data: bool
 ) -> None:
Expand Down Expand Up @@ -172,6 +174,7 @@ def _create_computed_file_callback(future, *, update_s3: bool, clean_up_output_d
connection.close()


@log_runtime
def generate_computed_file(
*,
download_config: Dict,
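Both decorations above are ordinary decorator applications; once the factory call at import time has produced log_runtime, each @log_runtime is equivalent to rebinding the function by hand:

    # Equivalent to the @log_runtime decoration added in the hunk above
    generate_computed_file = log_runtime(generate_computed_file)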
api/scpca_portal/management/commands/download_input_files.py (29 changes: 29 additions & 0 deletions)
@@ -0,0 +1,29 @@
+import logging
+
+from django.core.management.base import BaseCommand
+
+from scpca_portal import common, loader, s3
+from scpca_portal.models import Project
+
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+logger.addHandler(logging.StreamHandler())
+
+
+class Command(BaseCommand):
+    def handle(self, *args, **kwargs):
+        loader.prep_data_dirs()
+
+        project = Project.objects.filter(scpca_id="SCPCP000006").first()
+
+        download_config = common.PROJECT_DOWNLOAD_CONFIGS["SPATIAL_SINGLE_CELL_EXPERIMENT"]
+        libraries = project.get_libraries(download_config)
+
+        library_data_file_paths = [
+            fp for lib in libraries for fp in lib.get_download_config_file_paths(download_config)
+        ]
+        project_data_file_paths = project.get_download_config_file_paths(download_config)
+
+        s3.download_input_files(
+            library_data_file_paths + project_data_file_paths, project.s3_input_bucket
+        )
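As a standard Django management command (assuming the project's usual manage.py entry point), the new debugging script is run with:

    python manage.py download_input_files

It deliberately hard-codes project SCPCP000006 and the SPATIAL_SINGLE_CELL_EXPERIMENT download config, narrowing the sync under investigation to one reproducible case.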
api/scpca_portal/s3.py (12 changes: 7 additions & 5 deletions)
@@ -8,9 +8,11 @@
 import boto3
 from botocore.client import Config

-from scpca_portal.config.logging import get_and_configure_logger
+from scpca_portal.config.logging import configure_runtime_logging, get_and_configure_logger

 logger = get_and_configure_logger(__name__)
+log_runtime = configure_runtime_logging(logger)
+
 aws_s3 = boto3.client("s3", config=Config(signature_version="s3v4"))

 MAX_QUEUE_CHUNK_SIZE = 250
@@ -82,11 +84,12 @@ def list_input_paths(
     return file_paths


+@log_runtime
 def download_input_files(file_paths: List[Path], bucket_name: str) -> bool:
     """Download all passed data file paths which have not previously been downloaded.'"""
-    command_parts = ["aws", "s3", "sync", f"s3://{bucket_name}", settings.INPUT_DATA_PATH]

     download_queue = [fp for fp in file_paths if not fp.exists()]
+
     # If download_queue is empty, exit early
     if not download_queue:
         return True
@@ -98,10 +101,9 @@ def download_input_files(file_paths: List[Path], bucket_name: str) -> bool:
         else len(download_queue)
     )

+    command_parts = ["aws", "s3", "sync", f"s3://{bucket_name}", settings.INPUT_DATA_PATH]
     command_parts.append("--exclude=*")
-    command_parts.extend(
-        [f"--include={file_path}" for file_path in download_queue[:chunk_size]]
-    )
+    command_parts.extend([f"--include={file_path}" for file_path in download_queue])

     if "public-test" in bucket_name:
         command_parts.append("--no-sign-request")
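With the chunked slice removed, every queued path becomes its own --include filter on a single sync call; note that chunk_size is still computed above but is no longer applied to download_queue in this hunk. Execution of command_parts happens below the visible diff, but the assembled command would look roughly like this (bucket and file paths are illustrative, not from the diff):

    aws s3 sync s3://example-input-bucket /path/to/input/data \
        --exclude="*" \
        --include="SCPCP000006/SCPCL000001_spatial/example_file.tar.gz" \
        --no-sign-request    # appended only for public-test buckets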