From fca811ad8a781b1644bcecfbf3e1eca1b0542851 Mon Sep 17 00:00:00 2001 From: Avrohom Gottlieb Date: Fri, 6 Dec 2024 15:38:58 -0500 Subject: [PATCH] debugging aws s3 sync command --- api/scpca_portal/config/logging.py | 2 +- api/scpca_portal/loader.py | 5 +++- .../commands/download_input_files.py | 29 +++++++++++++++++++ api/scpca_portal/s3.py | 12 ++++---- 4 files changed, 41 insertions(+), 7 deletions(-) create mode 100644 api/scpca_portal/management/commands/download_input_files.py diff --git a/api/scpca_portal/config/logging.py b/api/scpca_portal/config/logging.py index ea69b0ed..7c4f98b9 100644 --- a/api/scpca_portal/config/logging.py +++ b/api/scpca_portal/config/logging.py @@ -21,7 +21,7 @@ def get_thread_id() -> str: "%(asctime)s {0} %(name)s %(color)s%(levelname)s%(extras)s" ": %(message)s%(color_stop)s" ).format(get_thread_id()) LOG_LEVEL = None -LOG_RUNTIMES = os.getenv("LOG_RUNTIMES", False) +LOG_RUNTIMES = os.getenv("LOG_RUNTIMES", True) def unconfigure_root_logger(): diff --git a/api/scpca_portal/loader.py b/api/scpca_portal/loader.py index f5abc638..c8248412 100644 --- a/api/scpca_portal/loader.py +++ b/api/scpca_portal/loader.py @@ -8,7 +8,7 @@ from django.template.defaultfilters import pluralize from scpca_portal import metadata_file, s3 -from scpca_portal.config.logging import get_and_configure_logger +from scpca_portal.config.logging import configure_runtime_logging, get_and_configure_logger from scpca_portal.models import ( ComputedFile, Contact, @@ -19,6 +19,7 @@ ) logger = get_and_configure_logger(__name__) +log_runtime = configure_runtime_logging(logger) def prep_data_dirs(wipe_input_dir: bool = False, wipe_output_dir: bool = True) -> None: @@ -142,6 +143,7 @@ def create_project( return project +@log_runtime def _create_computed_file( computed_file: ComputedFile, update_s3: bool, clean_up_output_data: bool ) -> None: @@ -172,6 +174,7 @@ def _create_computed_file_callback(future, *, update_s3: bool, clean_up_output_d connection.close() +@log_runtime def generate_computed_file( *, download_config: Dict, diff --git a/api/scpca_portal/management/commands/download_input_files.py b/api/scpca_portal/management/commands/download_input_files.py new file mode 100644 index 00000000..d2416683 --- /dev/null +++ b/api/scpca_portal/management/commands/download_input_files.py @@ -0,0 +1,29 @@ +import logging + +from django.core.management.base import BaseCommand + +from scpca_portal import common, loader, s3 +from scpca_portal.models import Project + +logger = logging.getLogger() +logger.setLevel(logging.INFO) +logger.addHandler(logging.StreamHandler()) + + +class Command(BaseCommand): + def handle(self, *args, **kwargs): + loader.prep_data_dirs() + + project = Project.objects.filter(scpca_id="SCPCP000006").first() + + download_config = common.PROJECT_DOWNLOAD_CONFIGS["SPATIAL_SINGLE_CELL_EXPERIMENT"] + libraries = project.get_libraries(download_config) + + library_data_file_paths = [ + fp for lib in libraries for fp in lib.get_download_config_file_paths(download_config) + ] + project_data_file_paths = project.get_download_config_file_paths(download_config) + + s3.download_input_files( + library_data_file_paths + project_data_file_paths, project.s3_input_bucket + ) diff --git a/api/scpca_portal/s3.py b/api/scpca_portal/s3.py index 536eaf34..9bb3bd88 100644 --- a/api/scpca_portal/s3.py +++ b/api/scpca_portal/s3.py @@ -8,9 +8,11 @@ import boto3 from botocore.client import Config -from scpca_portal.config.logging import get_and_configure_logger +from scpca_portal.config.logging import configure_runtime_logging, get_and_configure_logger logger = get_and_configure_logger(__name__) +log_runtime = configure_runtime_logging(logger) + aws_s3 = boto3.client("s3", config=Config(signature_version="s3v4")) MAX_QUEUE_CHUNK_SIZE = 250 @@ -82,11 +84,12 @@ def list_input_paths( return file_paths +@log_runtime def download_input_files(file_paths: List[Path], bucket_name: str) -> bool: """Download all passed data file paths which have not previously been downloaded.'""" - command_parts = ["aws", "s3", "sync", f"s3://{bucket_name}", settings.INPUT_DATA_PATH] download_queue = [fp for fp in file_paths if not fp.exists()] + # If download_queue is empty, exit early if not download_queue: return True @@ -98,10 +101,9 @@ def download_input_files(file_paths: List[Path], bucket_name: str) -> bool: else len(download_queue) ) + command_parts = ["aws", "s3", "sync", f"s3://{bucket_name}", settings.INPUT_DATA_PATH] command_parts.append("--exclude=*") - command_parts.extend( - [f"--include={file_path}" for file_path in download_queue[:chunk_size]] - ) + command_parts.extend([f"--include={file_path}" for file_path in download_queue]) if "public-test" in bucket_name: command_parts.append("--no-sign-request")