diff --git a/api/scpca_portal/config/test.py b/api/scpca_portal/config/test.py index a559965e..677a14a3 100644 --- a/api/scpca_portal/config/test.py +++ b/api/scpca_portal/config/test.py @@ -6,7 +6,7 @@ class Test(Local): # AWS S3 # Note: Data must be resynced when test bucket is updated - AWS_S3_INPUT_BUCKET_NAME = "scpca-portal-public-test-inputs/2024-09-10/" + AWS_S3_INPUT_BUCKET_NAME = "scpca-portal-public-test-inputs/2024-09-10" # Code Paths INPUT_DATA_PATH = Path("/home/user/code/test_data/input") diff --git a/api/scpca_portal/s3.py b/api/scpca_portal/s3.py index 1154b782..7866f886 100644 --- a/api/scpca_portal/s3.py +++ b/api/scpca_portal/s3.py @@ -1,5 +1,5 @@ import subprocess -from collections import namedtuple +from collections import defaultdict, namedtuple from pathlib import Path from typing import List @@ -82,24 +82,44 @@ def list_input_paths( def download_input_files(file_paths: List[Path], bucket_name: str) -> bool: """Download all passed data file paths which have not previously been downloaded.'""" - command_parts = ["aws", "s3", "sync", f"s3://{bucket_name}", settings.INPUT_DATA_PATH] - - download_queue = [fp for fp in file_paths if not fp.exists()] - # If download_queue is empty, exit early - if not download_queue: - return True - - command_parts.append("--exclude=*") - command_parts.extend([f"--include={file_path}" for file_path in download_queue]) - if "public-test" in bucket_name: - command_parts.append("--no-sign-request") - - try: - subprocess.check_call(command_parts) - except subprocess.CalledProcessError as error: - logger.error(f"Data files failed to download due to the following error:\n\t{error}") - return False + # NOTE: AWS Sync does one iteration per include flag. + # This causes a tremendous slowdown when trying to sync a long list of specific files. + # In order to overcome this we should sync once + # per project folder's immediate child subdirectory or file. + download_queue = defaultdict(list) + + for file_path in file_paths: + if not file_path.exists(): + + # default to project folder for immediately nested files + bucket_path = Path(file_path.parts[0]) + + if len(file_path.parts) > 2: + # append the subdirectory to the parent directory to form the bucket_path + bucket_path /= file_path.parts[1] + + download_queue[bucket_path].append(file_path.relative_to(bucket_path)) + + for bucket_path, project_file_paths in download_queue.items(): + command_parts = [ + "aws", + "s3", + "sync", + f"s3://{bucket_name}/{bucket_path}", + settings.INPUT_DATA_PATH / bucket_path, + ] + command_parts.append("--exclude=*") + command_parts.extend([f"--include={file_path}" for file_path in project_file_paths]) + + if "public-test" in bucket_name: + command_parts.append("--no-sign-request") + + try: + subprocess.check_call(command_parts) + except subprocess.CalledProcessError as error: + logger.error(f"Data files failed to download due to the following error:\n\t{error}") + return False return True diff --git a/infrastructure/batch.tf b/infrastructure/batch.tf index e7414de5..43b7c3d1 100644 --- a/infrastructure/batch.tf +++ b/infrastructure/batch.tf @@ -22,6 +22,6 @@ stage = var.stage batch_tags = { module = "batch", - revision = "initial - 16 vCPU compute environment and 1 queue" + revision = "first - 16 vCPU compute environment with 1 vCPU per job" } }