Merge pull request #1018 from AlexsLemonade/avrohom/1014-optimizing-sync-command

1014 - Optimizing the Sync Command
avrohomgottlieb authored Dec 9, 2024
2 parents 1d8036e + accdbfd commit 770fb4b
Showing 3 changed files with 40 additions and 20 deletions.
2 changes: 1 addition & 1 deletion api/scpca_portal/config/test.py
@@ -6,7 +6,7 @@
 class Test(Local):
     # AWS S3
     # Note: Data must be resynced when test bucket is updated
-    AWS_S3_INPUT_BUCKET_NAME = "scpca-portal-public-test-inputs/2024-09-10/"
+    AWS_S3_INPUT_BUCKET_NAME = "scpca-portal-public-test-inputs/2024-09-10"

     # Code Paths
     INPUT_DATA_PATH = Path("/home/user/code/test_data/input")
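Worth noting why the trailing slash goes away: the reworked download_input_files in s3.py (below) now builds its sync target as f"s3://{bucket_name}/{bucket_path}", so a bucket name ending in "/" would produce a doubled slash. A minimal sketch, using a hypothetical project/sample prefix:

from pathlib import Path

# Hypothetical prefix; real values come from the grouping logic in s3.py.
bucket_path = Path("SCPCP000001/SCPCS000101")

old = "scpca-portal-public-test-inputs/2024-09-10/"  # trailing slash (removed by this commit)
new = "scpca-portal-public-test-inputs/2024-09-10"

print(f"s3://{old}/{bucket_path}")  # s3://scpca-portal-public-test-inputs/2024-09-10//SCPCP000001/SCPCS000101
print(f"s3://{new}/{bucket_path}")  # s3://scpca-portal-public-test-inputs/2024-09-10/SCPCP000001/SCPCS000101

Since S3 keys are matched literally, the empty segment introduced by "//" points at a prefix containing no objects, and the sync would download nothing.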
56 changes: 38 additions & 18 deletions api/scpca_portal/s3.py
@@ -1,5 +1,5 @@
 import subprocess
-from collections import namedtuple
+from collections import defaultdict, namedtuple
 from pathlib import Path
 from typing import List

@@ -82,24 +82,44 @@ def list_input_paths(

 def download_input_files(file_paths: List[Path], bucket_name: str) -> bool:
     """Download all passed data file paths which have not previously been downloaded."""
-    command_parts = ["aws", "s3", "sync", f"s3://{bucket_name}", settings.INPUT_DATA_PATH]
-
-    download_queue = [fp for fp in file_paths if not fp.exists()]
-    # If download_queue is empty, exit early
-    if not download_queue:
-        return True
-
-    command_parts.append("--exclude=*")
-    command_parts.extend([f"--include={file_path}" for file_path in download_queue])
-
-    if "public-test" in bucket_name:
-        command_parts.append("--no-sign-request")
-
-    try:
-        subprocess.check_call(command_parts)
-    except subprocess.CalledProcessError as error:
-        logger.error(f"Data files failed to download due to the following error:\n\t{error}")
-        return False
+    # NOTE: aws s3 sync does one iteration per --include flag,
+    # which causes a tremendous slowdown when syncing a long list of specific files.
+    # To overcome this, we sync once per project folder's
+    # immediate child subdirectory or file.
+    download_queue = defaultdict(list)
+
+    for file_path in file_paths:
+        if not file_path.exists():
+            # default to the project folder for immediately nested files
+            bucket_path = Path(file_path.parts[0])
+
+            if len(file_path.parts) > 2:
+                # append the subdirectory to the parent directory to form the bucket_path
+                bucket_path /= file_path.parts[1]
+
+            download_queue[bucket_path].append(file_path.relative_to(bucket_path))
+
+    for bucket_path, project_file_paths in download_queue.items():
+        command_parts = [
+            "aws",
+            "s3",
+            "sync",
+            f"s3://{bucket_name}/{bucket_path}",
+            settings.INPUT_DATA_PATH / bucket_path,
+        ]
+        command_parts.append("--exclude=*")
+        command_parts.extend([f"--include={file_path}" for file_path in project_file_paths])
+
+        if "public-test" in bucket_name:
+            command_parts.append("--no-sign-request")
+
+        try:
+            subprocess.check_call(command_parts)
+        except subprocess.CalledProcessError as error:
+            logger.error(f"Data files failed to download due to the following error:\n\t{error}")
+            return False
 
     return True

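To make the new grouping concrete, here is a standalone sketch of the same bucketing logic with made-up ScPCA-style paths (the project/sample IDs are hypothetical, and the fp.exists() filter is omitted for brevity):

from collections import defaultdict
from pathlib import Path

# Hypothetical relative file paths, mimicking the project/sample layout the diff assumes.
file_paths = [
    Path("SCPCP000001/single_cell_metadata.tsv"),    # file immediately nested in a project folder
    Path("SCPCP000001/SCPCS000101/qc_report.html"),  # files inside a sample subdirectory
    Path("SCPCP000001/SCPCS000101/filtered.rds"),
    Path("SCPCP000002/SCPCS000201/processed.rds"),
]

download_queue = defaultdict(list)
for file_path in file_paths:
    # default to the project folder for immediately nested files
    bucket_path = Path(file_path.parts[0])
    if len(file_path.parts) > 2:
        # descend one level so each sync targets an immediate child subdirectory
        bucket_path /= file_path.parts[1]
    download_queue[bucket_path].append(file_path.relative_to(bucket_path))

for bucket_path, paths in download_queue.items():
    print(bucket_path, "->", [str(p) for p in paths])
# SCPCP000001 -> ['single_cell_metadata.tsv']
# SCPCP000001/SCPCS000101 -> ['qc_report.html', 'filtered.rds']
# SCPCP000002/SCPCS000201 -> ['processed.rds']

Each key then becomes one aws s3 sync invocation scoped to s3://{bucket_name}/{bucket_path}, so every sync lists only a narrow prefix and carries only the include flags relevant to it, instead of a single whole-bucket sync dragging every --include flag through each pass.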
2 changes: 1 addition & 1 deletion infrastructure/batch.tf
@@ -22,6 +22,6 @@
   stage = var.stage
   batch_tags = {
     module   = "batch",
-    revision = "initial - 16 vCPU compute environment and 1 queue"
+    revision = "first - 16 vCPU compute environment with 1 vCPU per job"
   }
 }
