Skip to content

Commit

Permalink
Addressed Michael's review comments resolved basic problems
Browse files Browse the repository at this point in the history
- #15
  • Loading branch information
lbeckman314 committed Dec 19, 2022
1 parent 4213252 commit c12b5bc
Show file tree
Hide file tree
Showing 9 changed files with 62 additions and 52 deletions.
17 changes: 9 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@
A file download tool for AnVIL/TDR data identified by DRS URIs

- [Installation](#installation)
- [Checksums](#checksums)
- [Successful Verification](#successful-verification)
- [Unsuccessful Verification](#unsuccessful-verification)
- [Checksums](#checksums)
- [Successful Verification](#successful-verification)
- [Unsuccessful Verification](#unsuccessful-verification)
- [Usage](#usage)
- [Quick Start](#quick-start)
- [Example](#example)
- [Additional Options](#additional-options)
- [Quick Start](#quick-start)
- [Example](#example)
- [Additional Options](#additional-options)
- [Development](#development)
- [Tests](#tests)
- [Tests](#tests)
- [Authentication](#authentication)
- [Credits](#credits)
- [Contributing](#contributing)
Expand Down Expand Up @@ -108,7 +108,7 @@ Usage: drs_download terra [OPTIONS]

Options:
-s, --silent Display nothing.
-d, --destination_dir TEXT Destination directory. [default: /tmp/testing]
-d, --destination_dir TEXT Destination directory. [default: os.getcwd()]
-m, --manifest_path TEXT Path to manifest tsv. [default:
tests/fixtures/terra-data.tsv]
--help Show this message and exit.
Expand All @@ -128,6 +128,7 @@ Then create and activate a virtual environment using `Python3.9`:
python3.9 -m venv venv
. venv/bin/activate
pip install -r requirements.txt -r requirements-dev.txt
pip install -e .
```
Now you should be ready to start coding and testing!
Expand Down
46 changes: 26 additions & 20 deletions drs_downloader/cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import os
from pathlib import Path
from typing import List
import math
Expand All @@ -25,59 +26,60 @@ def cli():

@cli.command()
@click.option("--silent", "-s", is_flag=True, show_default=True, default=False, help="Display nothing.")
@click.option("--destination_dir", "-d", show_default=True, default='/tmp/testing',
@click.option("--destination-dir", "-d", show_default=True, default=os.getcwd(),
help="Destination directory.")
@click.option("--manifest_path", "-m", show_default=True,
@click.option("--manifest-path", "-m", show_default=True,
help="Path to manifest tsv.")
@click.option('--drs_header', default='ga4gh_drs_uri', show_default=True,
@click.option('--drs-column-name', default='ga4gh_drs_uri', show_default=True,
help='The column header in the TSV file associated with the DRS URIs.'
'Example: pfb:ga4gh_drs_uri')
def mock(silent: bool, destination_dir: str, manifest_path, drs_header):
def mock(silent: bool, destination_dir: str, manifest_path, drs_column_name):
"""Generate test files locally, without the need for server."""

#
# get ids from manifest
ids_from_manifest = _extract_tsv_info(Path(manifest_path), drs_header)
ids_from_manifest = _extract_tsv_info(Path(manifest_path), drs_column_name)

# perform downloads with a mock drs client
_perform_downloads(destination_dir, MockDrsClient(), ids_from_manifest, silent)


@cli.command()
@click.option("--silent", "-s", is_flag=True, show_default=True, default=False, help="Display nothing.")
@click.option("--destination_dir", "-d", show_default=True,
default="/tmp/testing", help="Destination directory.")
@click.option("--manifest_path", "-m", show_default=True,
@click.option("--destination-dir", "-d", show_default=True,
default=os.getcwd(), help="Destination directory.")
@click.option("--manifest-path", "-m", show_default=True,
help="Path to manifest tsv.")
@click.option('--drs_header', default=None, help='The column header in the TSV file associated with the DRS URIs.'
@click.option('--drs-column-name', default=None, help='The column header in the TSV file associated with the DRS URIs.'
'Example: pfb:ga4gh_drs_uri')
def terra(silent: bool, destination_dir: str, manifest_path: str, drs_header: str):
def terra(silent: bool, destination_dir: str, manifest_path: str, drs_column_name: str):
"""Copy files from terra.bio"""

# get ids from manifest
ids_from_manifest = _extract_tsv_info(Path(manifest_path), drs_header)
ids_from_manifest = _extract_tsv_info(Path(manifest_path), drs_column_name)

# perform downloads with a terra drs client
_perform_downloads(destination_dir, TerraDrsClient(), ids_from_manifest, silent)


@cli.command()
@click.option("--silent", "-s", is_flag=True, show_default=True, default=False, help="Display nothing.")
@click.option("--destination_dir", "-d", show_default=True,
default="/tmp/testing", help="Destination directory.")
@click.option("--manifest_path", "-m", show_default=True,
@click.option("--destination-dir", "-d", show_default=True,
default=os.getcwd(), help="Destination directory.")
@click.option("--manifest-path", "-m", show_default=True,
help="Path to manifest tsv.")
@click.option('--drs_header', default='ga4gh_drs_uri', show_default=True,
@click.option('--drs-column-name', default='ga4gh_drs_uri', show_default=True,
help='The column header in the TSV file associated with the DRS URIs.'
'Example: pfb:ga4gh_drs_uri')
@click.option('--api_key_path', show_default=True,
@click.option('--api-key-path', show_default=True,
help='Gen3 credentials file')
@click.option('--endpoint', show_default=True, required=True,
help='Gen3 endpoint')
def gen3(silent: bool, destination_dir: str, manifest_path: str, drs_header: str, api_key_path: str, endpoint: str):
def gen3(silent: bool, destination_dir: str, manifest_path: str, drs_column_name: str,
api_key_path: str, endpoint: str):
"""Copy files from gen3 server."""
# read from manifest
ids_from_manifest = _extract_tsv_info(Path(manifest_path), drs_header)
ids_from_manifest = _extract_tsv_info(Path(manifest_path), drs_column_name)

_perform_downloads(destination_dir,
Gen3DrsClient(api_key_path=api_key_path, endpoint=endpoint),
Expand All @@ -91,6 +93,10 @@ def _perform_downloads(destination_dir, drs_client, ids_from_manifest, silent):
if destination_dir:
destination_dir = Path(destination_dir)
destination_dir.mkdir(parents=True, exist_ok=True)

if not silent:
logger.info("Downloading to: ", destination_dir)

# create a manager
drs_manager = DrsAsyncManager(drs_client=drs_client, show_progress=not silent)

Expand Down Expand Up @@ -124,7 +130,7 @@ def _perform_downloads(destination_dir, drs_client, ids_from_manifest, silent):
logger.error((drs_object.name, 'ERROR', drs_object.size, len(drs_object.file_parts), drs_object.errors))
at_least_one_error = True
if at_least_one_error:
exit(99)
exit(1)


def _extract_tsv_info(manifest_path: Path, drs_header: str) -> List[str]:
Expand Down Expand Up @@ -171,7 +177,7 @@ def _extract_tsv_info(manifest_path: Path, drs_header: str) -> List[str]:
" or the URI header matches the uri header name in the TSV file that was specified")

for url in uris:
if '/' in url:
if 'drs://' in url:
continue
else:
raise Exception(
Expand Down
4 changes: 2 additions & 2 deletions drs_downloader/clients/mock.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ async def download_part(self, drs_object: DrsObject, start: int, size: int, dest

length_ = size - start + 1
# logger.info((drs_object.name, start, length_))
with open(Path(f'/tmp/testing/{drs_object.name}.golden'), 'rb') as f:
with open(Path(os.getcwd(), f'{drs_object.name}.golden'), 'rb') as f:
f.seek(start)
data = f.read(length_)

Expand Down Expand Up @@ -117,7 +117,7 @@ async def get_object(self, object_id: str) -> DrsObject:
size_ = len(lines)

# write it for testing
destination_dir = Path('/tmp/testing')
destination_dir = Path(os.getcwd())
destination_dir.mkdir(parents=True, exist_ok=True)
with open(Path(f'{destination_dir}/{name_}.golden'), 'wb') as f:
f.write(lines)
Expand Down
2 changes: 1 addition & 1 deletion drs_downloader/clients/terra.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ async def get_object(self, object_id: str) -> DrsObject:
"""
data = {
"url": object_id,
"fields": ["fileName", "size", "hashes", "accessUrl"]
"fields": ["fileName", "size", "hashes"]
}
session = aiohttp.ClientSession(headers={
'authorization': 'Bearer ' + self.token,
Expand Down
17 changes: 6 additions & 11 deletions drs_downloader/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,8 @@ async def _run_download_parts(self, drs_object: DrsObject, destination_path: Pat
parts.append((start, size, ))

if len(parts) > 1000:
logger.error(f'tasks > 1000 {drs_object.name} has over 1000 parts, consider optimization. ({len(parts)})')
logger.warning(f'Warning: tasks > 1000 {drs_object.name} has over 1000 parts and is a large download. \
({len(parts)})')

paths = []
# TODO - tqdm ugly here?
Expand Down Expand Up @@ -181,7 +182,8 @@ async def _run_download_parts(self, drs_object: DrsObject, destination_path: Pat
drs_object.file_parts = paths

i = 1
filename = f"{drs_object.name}"
filename = f"{drs_object.name}" or drs_object.access_methods[0].access_url.split("/")[-1].split('?')[0]
print("!!!", drs_object.access_methods[0].access_url.split("/")[-1].split('?')[0])
original_file_name = Path(filename)
while True:
if os.path.isfile(destination_path.joinpath(filename)):
Expand Down Expand Up @@ -312,14 +314,7 @@ def get_objects(self, object_ids: List[str]) -> List[DrsObject]:

drs_objects = []

total_batches = len(object_ids) / self.max_simultaneous_object_retrievers
# rounding
# this would imply that if batch count is 9.3, and you round down the last .3 is never
# actually downloaded since there are only 9 batches. math.ciel would round up if there is a decimal at all
if math.ceil(total_batches) - total_batches > 0:
total_batches += 1
total_batches = int(total_batches)

total_batches = math.ceil(len(object_ids) / self.max_simultaneous_part_handlers)
current = 0

for chunk_of_object_ids in DrsAsyncManager.chunker(object_ids, self.max_simultaneous_object_retrievers):
Expand Down Expand Up @@ -377,7 +372,7 @@ def optimize_workload(self, drs_objects: List[DrsObject]) -> List[DrsObject]:

elif any(drs_object.size > (1 * GB) for drs_object in drs_objects):
self.max_simultaneous_part_handlers = 10
self.part_size = 10 * MB
self.part_size = 128 * MB
self.max_simultaneous_downloaders = 10

elif all((drs_object.size < (5 * MB)) for drs_object in drs_objects):
Expand Down
8 changes: 8 additions & 0 deletions tests/integration/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,11 @@ Tests in this folder require a drs server with a known population of data object
* workflow mgt - manifest with 1 file
* workflow mgt - any file in manifest > 1 GB


# Needed

* DRS URI that doesn't return name of file to test URL splitting. manager.py:184

* Write test for checking no destination path specified downloads to current working directory. https://github.com/anvilproject/drs_downloader/pull/15#discussion_r1051637307

* Write test for checking part size == 128MB on batches with 1GB files. https://github.com/anvilproject/drs_downloader/pull/15#discussion_r1051653484
12 changes: 6 additions & 6 deletions tests/integration/test_cli_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
def test_terra():
with tempfile.TemporaryDirectory() as dest:
runner = CliRunner()
result = runner.invoke(cli, ['terra', '-d', dest, '--manifest_path', 'tests/fixtures/terra-data.tsv'])
result = runner.invoke(cli, ['terra', '-d', dest, '--manifest-path', 'tests/fixtures/terra-data.tsv'])
assert result.exit_code == 0

files = sorted(next(os.walk(dest))[2])
Expand All @@ -21,21 +21,21 @@ def test_terra():
def test_terra_silent():
"""The terra command should execute without error."""
runner = CliRunner()
result = runner.invoke(cli, ['terra', '--silent', '--manifest_path', 'tests/fixtures/terra-data.tsv'])
result = runner.invoke(cli, ['terra', '--silent', '--manifest-path', 'tests/fixtures/terra-data.tsv'])
assert result.exit_code == 0


def test_gen3():
"""The gen3 command should execute without error."""
runner = CliRunner()
result = runner.invoke(cli, ['gen3', '--endpoint', 'https://development.aced-idp.org', '--api_key_path',
'tests/fixtures/credentials.json', '--manifest_path', 'tests/fixtures/gen3-small.tsv'])
result = runner.invoke(cli, ['gen3', '--endpoint', 'https://development.aced-idp.org', '--api-key-path',
'tests/fixtures/credentials.json', '--manifest-path', 'tests/fixtures/gen3-small.tsv'])
assert result.exit_code == 0


def test_gen3_silent():
"""The gen3 command should execute without error."""
runner = CliRunner()
result = runner.invoke(cli, ['gen3', '--silent', '--endpoint', 'https://development.aced-idp.org', '--api_key_path',
'tests/fixtures/credentials.json', '--manifest_path', 'tests/fixtures/gen3-small.tsv'])
result = runner.invoke(cli, ['gen3', '--silent', '--endpoint', 'https://development.aced-idp.org', '--api-key-path',
'tests/fixtures/credentials.json', '--manifest-path', 'tests/fixtures/gen3-small.tsv'])
assert result.exit_code == 0
4 changes: 2 additions & 2 deletions tests/integration/test_problems.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
def test_terra_bad_tsv():
"""The terra command should execute with an error."""
runner = CliRunner()
result = runner.invoke(cli, ['terra', '--manifest_path', 'tests/fixtures/terra-data-bad.tsv'])
result = runner.invoke(cli, ['terra', '--manifest-path', 'tests/fixtures/terra-data-bad.tsv'])
assert result.exit_code != 0


def test_gen3_bad_tsv():
"""The gen3 command should execute with an error."""
runner = CliRunner()
result = runner.invoke(cli, ['gen3', '--endpoint', 'https://development.aced-idp.org', '--manifest_path',
result = runner.invoke(cli, ['gen3', '--endpoint', 'https://development.aced-idp.org', '--manifest-path',
'tests/fixtures/gen3-bad.tsv'])
assert result.exit_code != 0
4 changes: 2 additions & 2 deletions tests/unit/test_basic_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_mock_all_ok(number_of_object_ids=10):
tsv_file = manifest_all_ok(number_of_object_ids)
print(tsv_file.name)

result = runner.invoke(cli, ['mock', '--manifest_path', tsv_file.name])
result = runner.invoke(cli, ['mock', '--manifest-path', tsv_file.name])

assert result.exit_code == 0

Expand All @@ -40,7 +40,7 @@ def test_mock_bad_file_size(caplog):
tsv_file = manifest_bad_file_size()

print(tsv_file.name)
result = runner.invoke(cli, ['mock', '--manifest_path', tsv_file.name])
result = runner.invoke(cli, ['mock', '--manifest-path', tsv_file.name])

# should return non zero
assert result.exit_code != 0
Expand Down

0 comments on commit c12b5bc

Please sign in to comment.