Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1014 - Optimizing the Sync Command #1018

Merged
merged 8 commits into from
Dec 9, 2024
Merged
2 changes: 1 addition & 1 deletion api/scpca_portal/config/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
class Test(Local):
# AWS S3
# Note: Data must be resynced when test bucket is updated
AWS_S3_INPUT_BUCKET_NAME = "scpca-portal-public-test-inputs/2024-09-10/"
AWS_S3_INPUT_BUCKET_NAME = "scpca-portal-public-test-inputs/2024-09-10"

# Code Paths
INPUT_DATA_PATH = Path("/home/user/code/test_data/input")
Expand Down
56 changes: 38 additions & 18 deletions api/scpca_portal/s3.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import subprocess
from collections import namedtuple
from collections import defaultdict, namedtuple
from pathlib import Path
from typing import List

Expand Down Expand Up @@ -82,24 +82,44 @@ def list_input_paths(

def download_input_files(file_paths: List[Path], bucket_name: str) -> bool:
    """Download all passed data file paths which have not previously been downloaded.

    Files are grouped by their project folder's immediate child subdirectory
    (or the project folder itself for immediately nested files), and one
    `aws s3 sync` is issued per group.

    :param file_paths: Bucket-relative paths of the files to download.
    :param bucket_name: Name of the S3 bucket to download from.
    :return: True if every sync succeeded (or nothing needed downloading),
        False as soon as any sync invocation fails.
    """
    # NOTE: AWS Sync does one iteration per include flag.
    # This causes a tremendous slowdown when trying to sync a long list of specific files.
    # In order to overcome this we sync once
    # per project folder's immediate child subdirectory or file.
    download_queue = defaultdict(list)

    for file_path in file_paths:
        # Skip files already present locally.
        if not file_path.exists():
            # Default to the project folder for immediately nested files.
            # NOTE(review): a single-component path would make relative_to()
            # below return ".", yielding an ineffective "--include=." filter —
            # assumes all file_paths have at least two components; confirm with callers.
            bucket_path = Path(file_path.parts[0])

            if len(file_path.parts) > 2:
                # Append the subdirectory to the parent directory to form the bucket_path.
                bucket_path /= file_path.parts[1]

            download_queue[bucket_path].append(file_path.relative_to(bucket_path))

    for bucket_path, project_file_paths in download_queue.items():
        command_parts = [
            "aws",
            "s3",
            "sync",
            f"s3://{bucket_name}/{bucket_path}",
            settings.INPUT_DATA_PATH / bucket_path,
        ]
        # Exclude everything, then re-include only the queued files for this prefix.
        command_parts.append("--exclude=*")
        command_parts.extend([f"--include={file_path}" for file_path in project_file_paths])

        # The public test bucket allows anonymous access; skip request signing.
        if "public-test" in bucket_name:
            command_parts.append("--no-sign-request")

        try:
            subprocess.check_call(command_parts)
        except subprocess.CalledProcessError as error:
            logger.error(f"Data files failed to download due to the following error:\n\t{error}")
            return False

    return True

Expand Down
2 changes: 1 addition & 1 deletion infrastructure/batch.tf
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,6 @@
stage = var.stage
batch_tags = {
module = "batch",
revision = "initial - 16 vCPU compute environment and 1 queue"
revision = "first - 16 vCPU compute environment with 1 vCPU per job"
}
}
Loading