From c12b5bc7ef78a4e1665da44309f5dd10f6742314 Mon Sep 17 00:00:00 2001 From: Liam Beckman Date: Mon, 19 Dec 2022 12:38:33 -0800 Subject: [PATCH] Addressed Michael's review comments resolved basic problems - https://github.com/anvilproject/drs_downloader/pull/15 --- README.md | 17 +++++----- drs_downloader/cli.py | 46 +++++++++++++++------------ drs_downloader/clients/mock.py | 4 +-- drs_downloader/clients/terra.py | 2 +- drs_downloader/manager.py | 17 ++++------ tests/integration/README.md | 8 +++++ tests/integration/test_cli_options.py | 12 +++---- tests/integration/test_problems.py | 4 +-- tests/unit/test_basic_cli.py | 4 +-- 9 files changed, 62 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 2efe6ac..6625fb2 100644 --- a/README.md +++ b/README.md @@ -8,15 +8,15 @@ A file download tool for AnVIL/TDR data identified by DRS URIs - [Installation](#installation) - - [Checksums](#checksums) - - [Successful Verification](#successful-verification) - - [Unsuccessful Verification](#unsuccessful-verification) + - [Checksums](#checksums) + - [Successful Verification](#successful-verification) + - [Unsuccessful Verification](#unsuccessful-verification) - [Usage](#usage) - - [Quick Start](#quick-start) - - [Example](#example) - - [Additional Options](#additional-options) + - [Quick Start](#quick-start) + - [Example](#example) + - [Additional Options](#additional-options) - [Development](#development) - - [Tests](#tests) + - [Tests](#tests) - [Authentication](#authentication) - [Credits](#credits) - [Contributing](#contributing) @@ -108,7 +108,7 @@ Usage: drs_download terra [OPTIONS] Options: -s, --silent Display nothing. - -d, --destination_dir TEXT Destination directory. [default: /tmp/testing] + -d, --destination_dir TEXT Destination directory. [default: os.getcwd()] -m, --manifest_path TEXT Path to manifest tsv. [default: tests/fixtures/terra-data.tsv] --help Show this message and exit. @@ -128,6 +128,7 @@ Then create and activate a virtual environment using `Python3.9`: python3.9 -m venv venv . venv/bin/activate pip install -r requirements.txt -r requirements-dev.txt +pip install -e . ``` Now you should be ready to start coding and testing! diff --git a/drs_downloader/cli.py b/drs_downloader/cli.py index dbe2533..ab620e9 100644 --- a/drs_downloader/cli.py +++ b/drs_downloader/cli.py @@ -1,4 +1,5 @@ import logging +import os from pathlib import Path from typing import List import math @@ -25,19 +26,19 @@ def cli(): @cli.command() @click.option("--silent", "-s", is_flag=True, show_default=True, default=False, help="Display nothing.") -@click.option("--destination_dir", "-d", show_default=True, default='/tmp/testing', +@click.option("--destination-dir", "-d", show_default=True, default=os.getcwd(), help="Destination directory.") -@click.option("--manifest_path", "-m", show_default=True, +@click.option("--manifest-path", "-m", show_default=True, help="Path to manifest tsv.") -@click.option('--drs_header', default='ga4gh_drs_uri', show_default=True, +@click.option('--drs-column-name', default='ga4gh_drs_uri', show_default=True, help='The column header in the TSV file associated with the DRS URIs.' 'Example: pfb:ga4gh_drs_uri') -def mock(silent: bool, destination_dir: str, manifest_path, drs_header): +def mock(silent: bool, destination_dir: str, manifest_path, drs_column_name): """Generate test files locally, without the need for server.""" # # get ids from manifest - ids_from_manifest = _extract_tsv_info(Path(manifest_path), drs_header) + ids_from_manifest = _extract_tsv_info(Path(manifest_path), drs_column_name) # perform downloads with a mock drs client _perform_downloads(destination_dir, MockDrsClient(), ids_from_manifest, silent) @@ -45,17 +46,17 @@ def mock(silent: bool, destination_dir: str, manifest_path, drs_header): @cli.command() @click.option("--silent", "-s", is_flag=True, show_default=True, default=False, help="Display nothing.") -@click.option("--destination_dir", "-d", show_default=True, - default="/tmp/testing", help="Destination directory.") -@click.option("--manifest_path", "-m", show_default=True, +@click.option("--destination-dir", "-d", show_default=True, + default=os.getcwd(), help="Destination directory.") +@click.option("--manifest-path", "-m", show_default=True, help="Path to manifest tsv.") -@click.option('--drs_header', default=None, help='The column header in the TSV file associated with the DRS URIs.' +@click.option('--drs-column-name', default=None, help='The column header in the TSV file associated with the DRS URIs.' 'Example: pfb:ga4gh_drs_uri') -def terra(silent: bool, destination_dir: str, manifest_path: str, drs_header: str): +def terra(silent: bool, destination_dir: str, manifest_path: str, drs_column_name: str): """Copy files from terra.bio""" # get ids from manifest - ids_from_manifest = _extract_tsv_info(Path(manifest_path), drs_header) + ids_from_manifest = _extract_tsv_info(Path(manifest_path), drs_column_name) # perform downloads with a terra drs client _perform_downloads(destination_dir, TerraDrsClient(), ids_from_manifest, silent) @@ -63,21 +64,22 @@ def terra(silent: bool, destination_dir: str, manifest_path: str, drs_header: st @cli.command() @click.option("--silent", "-s", is_flag=True, show_default=True, default=False, help="Display nothing.") -@click.option("--destination_dir", "-d", show_default=True, - default="/tmp/testing", help="Destination directory.") -@click.option("--manifest_path", "-m", show_default=True, +@click.option("--destination-dir", "-d", show_default=True, + default=os.getcwd(), help="Destination directory.") +@click.option("--manifest-path", "-m", show_default=True, help="Path to manifest tsv.") -@click.option('--drs_header', default='ga4gh_drs_uri', show_default=True, +@click.option('--drs-column-name', default='ga4gh_drs_uri', show_default=True, help='The column header in the TSV file associated with the DRS URIs.' 'Example: pfb:ga4gh_drs_uri') -@click.option('--api_key_path', show_default=True, +@click.option('--api-key-path', show_default=True, help='Gen3 credentials file') @click.option('--endpoint', show_default=True, required=True, help='Gen3 endpoint') -def gen3(silent: bool, destination_dir: str, manifest_path: str, drs_header: str, api_key_path: str, endpoint: str): +def gen3(silent: bool, destination_dir: str, manifest_path: str, drs_column_name: str, + api_key_path: str, endpoint: str): """Copy files from gen3 server.""" # read from manifest - ids_from_manifest = _extract_tsv_info(Path(manifest_path), drs_header) + ids_from_manifest = _extract_tsv_info(Path(manifest_path), drs_column_name) _perform_downloads(destination_dir, Gen3DrsClient(api_key_path=api_key_path, endpoint=endpoint), @@ -91,6 +93,10 @@ def _perform_downloads(destination_dir, drs_client, ids_from_manifest, silent): if destination_dir: destination_dir = Path(destination_dir) destination_dir.mkdir(parents=True, exist_ok=True) + + if not silent: + logger.info("Downloading to: ", destination_dir) + # create a manager drs_manager = DrsAsyncManager(drs_client=drs_client, show_progress=not silent) @@ -124,7 +130,7 @@ def _perform_downloads(destination_dir, drs_client, ids_from_manifest, silent): logger.error((drs_object.name, 'ERROR', drs_object.size, len(drs_object.file_parts), drs_object.errors)) at_least_one_error = True if at_least_one_error: - exit(99) + exit(1) def _extract_tsv_info(manifest_path: Path, drs_header: str) -> List[str]: @@ -171,7 +177,7 @@ def _extract_tsv_info(manifest_path: Path, drs_header: str) -> List[str]: " or the URI header matches the uri header name in the TSV file that was specified") for url in uris: - if '/' in url: + if 'drs://' in url: continue else: raise Exception( diff --git a/drs_downloader/clients/mock.py b/drs_downloader/clients/mock.py index 1226d75..bb4264f 100644 --- a/drs_downloader/clients/mock.py +++ b/drs_downloader/clients/mock.py @@ -71,7 +71,7 @@ async def download_part(self, drs_object: DrsObject, start: int, size: int, dest length_ = size - start + 1 # logger.info((drs_object.name, start, length_)) - with open(Path(f'/tmp/testing/{drs_object.name}.golden'), 'rb') as f: + with open(Path(os.getcwd(), f'{drs_object.name}.golden'), 'rb') as f: f.seek(start) data = f.read(length_) @@ -117,7 +117,7 @@ async def get_object(self, object_id: str) -> DrsObject: size_ = len(lines) # write it for testing - destination_dir = Path('/tmp/testing') + destination_dir = Path(os.getcwd()) destination_dir.mkdir(parents=True, exist_ok=True) with open(Path(f'{destination_dir}/{name_}.golden'), 'wb') as f: f.write(lines) diff --git a/drs_downloader/clients/terra.py b/drs_downloader/clients/terra.py index fac8e23..b4a31b2 100644 --- a/drs_downloader/clients/terra.py +++ b/drs_downloader/clients/terra.py @@ -126,7 +126,7 @@ async def get_object(self, object_id: str) -> DrsObject: """ data = { "url": object_id, - "fields": ["fileName", "size", "hashes", "accessUrl"] + "fields": ["fileName", "size", "hashes"] } session = aiohttp.ClientSession(headers={ 'authorization': 'Bearer ' + self.token, diff --git a/drs_downloader/manager.py b/drs_downloader/manager.py index 172cffc..38ff08c 100644 --- a/drs_downloader/manager.py +++ b/drs_downloader/manager.py @@ -149,7 +149,8 @@ async def _run_download_parts(self, drs_object: DrsObject, destination_path: Pat parts.append((start, size, )) if len(parts) > 1000: - logger.error(f'tasks > 1000 {drs_object.name} has over 1000 parts, consider optimization. ({len(parts)})') + logger.warning(f'Warning: tasks > 1000 {drs_object.name} has over 1000 parts and is a large download. \ + ({len(parts)})') paths = [] # TODO - tqdm ugly here? @@ -181,7 +182,8 @@ async def _run_download_parts(self, drs_object: DrsObject, destination_path: Pat drs_object.file_parts = paths i = 1 - filename = f"{drs_object.name}" + filename = f"{drs_object.name}" or drs_object.access_methods[0].access_url.split("/")[-1].split('?')[0] + print("!!!", drs_object.access_methods[0].access_url.split("/")[-1].split('?')[0]) original_file_name = Path(filename) while True: if os.path.isfile(destination_path.joinpath(filename)): @@ -312,14 +314,7 @@ def get_objects(self, object_ids: List[str]) -> List[DrsObject]: drs_objects = [] - total_batches = len(object_ids) / self.max_simultaneous_object_retrievers - # rounding - # this would imply that if batch count is 9.3, and you round down the last .3 is never - # actually downloaded since there are only 9 batches. math.ciel would round up if there is a decimal at all - if math.ceil(total_batches) - total_batches > 0: - total_batches += 1 - total_batches = int(total_batches) - + total_batches = math.ceil(len(object_ids) / self.max_simultaneous_part_handlers) current = 0 for chunk_of_object_ids in DrsAsyncManager.chunker(object_ids, self.max_simultaneous_object_retrievers): @@ -377,7 +372,7 @@ def optimize_workload(self, drs_objects: List[DrsObject]) -> List[DrsObject]: elif any(drs_object.size > (1 * GB) for drs_object in drs_objects): self.max_simultaneous_part_handlers = 10 - self.part_size = 10 * MB + self.part_size = 128 * MB self.max_simultaneous_downloaders = 10 elif all((drs_object.size < (5 * MB)) for drs_object in drs_objects): diff --git a/tests/integration/README.md b/tests/integration/README.md index 04e4332..ae04836 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -17,3 +17,11 @@ Tests in this folder require a drs server with a known population of data object * workflow mgt - manifest with 1 file * workflow mgt - any file in manifest > 1 GB + +# Needed + +* DRS URI that doesn't return name of file to test URL splitting. manager.py:184 + +* Write test for checking no destination path specified downloads to current working directory. https://github.com/anvilproject/drs_downloader/pull/15#discussion_r1051637307 + +* Write test for checking part size == 128MB on batches with 1GB files. https://github.com/anvilproject/drs_downloader/pull/15#discussion_r1051653484 \ No newline at end of file diff --git a/tests/integration/test_cli_options.py b/tests/integration/test_cli_options.py index 4804fc9..662ef34 100644 --- a/tests/integration/test_cli_options.py +++ b/tests/integration/test_cli_options.py @@ -9,7 +9,7 @@ def test_terra(): with tempfile.TemporaryDirectory() as dest: runner = CliRunner() - result = runner.invoke(cli, ['terra', '-d', dest, '--manifest_path', 'tests/fixtures/terra-data.tsv']) + result = runner.invoke(cli, ['terra', '-d', dest, '--manifest-path', 'tests/fixtures/terra-data.tsv']) assert result.exit_code == 0 files = sorted(next(os.walk(dest))[2]) @@ -21,21 +21,21 @@ def test_terra(): def test_terra_silent(): """The terra command should execute without error.""" runner = CliRunner() - result = runner.invoke(cli, ['terra', '--silent', '--manifest_path', 'tests/fixtures/terra-data.tsv']) + result = runner.invoke(cli, ['terra', '--silent', '--manifest-path', 'tests/fixtures/terra-data.tsv']) assert result.exit_code == 0 def test_gen3(): """The gen3 command should execute without error.""" runner = CliRunner() - result = runner.invoke(cli, ['gen3', '--endpoint', 'https://development.aced-idp.org', '--api_key_path', - 'tests/fixtures/credentials.json', '--manifest_path', 'tests/fixtures/gen3-small.tsv']) + result = runner.invoke(cli, ['gen3', '--endpoint', 'https://development.aced-idp.org', '--api-key-path', + 'tests/fixtures/credentials.json', '--manifest-path', 'tests/fixtures/gen3-small.tsv']) assert result.exit_code == 0 def test_gen3_silent(): """The gen3 command should execute without error.""" runner = CliRunner() - result = runner.invoke(cli, ['gen3', '--silent', '--endpoint', 'https://development.aced-idp.org', '--api_key_path', - 'tests/fixtures/credentials.json', '--manifest_path', 'tests/fixtures/gen3-small.tsv']) + result = runner.invoke(cli, ['gen3', '--silent', '--endpoint', 'https://development.aced-idp.org', '--api-key-path', + 'tests/fixtures/credentials.json', '--manifest-path', 'tests/fixtures/gen3-small.tsv']) assert result.exit_code == 0 diff --git a/tests/integration/test_problems.py b/tests/integration/test_problems.py index 8b2b495..0be5894 100644 --- a/tests/integration/test_problems.py +++ b/tests/integration/test_problems.py @@ -5,13 +5,13 @@ def test_terra_bad_tsv(): """The terra command should execute with an error.""" runner = CliRunner() - result = runner.invoke(cli, ['terra', '--manifest_path', 'tests/fixtures/terra-data-bad.tsv']) + result = runner.invoke(cli, ['terra', '--manifest-path', 'tests/fixtures/terra-data-bad.tsv']) assert result.exit_code != 0 def test_gen3_bad_tsv(): """The gen3 command should execute with an error.""" runner = CliRunner() - result = runner.invoke(cli, ['gen3', '--endpoint', 'https://development.aced-idp.org', '--manifest_path', + result = runner.invoke(cli, ['gen3', '--endpoint', 'https://development.aced-idp.org', '--manifest-path', 'tests/fixtures/gen3-bad.tsv']) assert result.exit_code != 0 diff --git a/tests/unit/test_basic_cli.py b/tests/unit/test_basic_cli.py index 9c4d46d..1a9004a 100644 --- a/tests/unit/test_basic_cli.py +++ b/tests/unit/test_basic_cli.py @@ -20,7 +20,7 @@ def test_mock_all_ok(number_of_object_ids=10): tsv_file = manifest_all_ok(number_of_object_ids) print(tsv_file.name) - result = runner.invoke(cli, ['mock', '--manifest_path', tsv_file.name]) + result = runner.invoke(cli, ['mock', '--manifest-path', tsv_file.name]) assert result.exit_code == 0 @@ -40,7 +40,7 @@ def test_mock_bad_file_size(caplog): tsv_file = manifest_bad_file_size() print(tsv_file.name) - result = runner.invoke(cli, ['mock', '--manifest_path', tsv_file.name]) + result = runner.invoke(cli, ['mock', '--manifest-path', tsv_file.name]) # should return non zero assert result.exit_code != 0