From 8a533cee919f1e5ac47546388573b2b723b6c875 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Thu, 23 May 2019 13:53:10 -0400 Subject: [PATCH 1/5] Intakifying osm-1billion --- osm/anaconda-project.yml | 34 +++++++++++++++++--- osm/catalog.yml | 10 ++++++ osm-1billion.ipynb => osm/osm-1billion.ipynb | 23 +++++++++++-- 3 files changed, 60 insertions(+), 7 deletions(-) create mode 100644 osm/catalog.yml rename osm-1billion.ipynb => osm/osm-1billion.ipynb (94%) diff --git a/osm/anaconda-project.yml b/osm/anaconda-project.yml index 4422e85ae..5c601ede8 100644 --- a/osm/anaconda-project.yml +++ b/osm/anaconda-project.yml @@ -1,6 +1,6 @@ # To reproduce: install 'anaconda-project', then 'anaconda-project run' name: osm -description: Datashading 2.7-billion-point Open Street Map database +description: Datashading Open Street Map database maintainers: - jbednar @@ -22,6 +22,11 @@ packages: *deps commands: notebook: + description: Datashading 1-billion-point Open Street Map database + notebook: osm-1billion.ipynb + env_spec: spatial + osm-3billion: + description: Datashading 2.7-billion-point Open Street Map database notebook: osm.ipynb test: unix: pytest --nbsmoke-run -k *.ipynb --ignore envs @@ -32,12 +37,33 @@ commands: windows: pytest --nbsmoke-lint -k *.ipynb --ignore envs env_spec: test -variables: {} +variables: + INTAKE_CACHE_DIR: data downloads: {} env_specs: default: {} + spatial: + channels: + - pyviz + - intake + packages: + - holoviews + - intake + - intake-parquet + - ipywidgets + - jupyter + - tqdm test: + channels: + - pyviz + - intake packages: - - nbsmoke ==0.2.8 - - pytest ==4.4.1 + - nbsmoke ==0.2.8 + - pytest ==4.4.1 + - holoviews + - intake + - intake-parquet + - ipywidgets + - jupyter + - tqdm diff --git a/osm/catalog.yml b/osm/catalog.yml new file mode 100644 index 000000000..c8ec3e8af --- /dev/null +++ b/osm/catalog.yml @@ -0,0 +1,10 @@ +sources: + osm_one_billion: + description: Open Street Map database as a parquet file (1-billion-point) + 
driver: parquet + cache: + - argkey: urlpath + regex: https://s3.amazonaws.com/datashader-data + type: compressed + args: + urlpath: https://s3.amazonaws.com/datashader-data/osm-1billion.parq.zip diff --git a/osm-1billion.ipynb b/osm/osm-1billion.ipynb similarity index 94% rename from osm-1billion.ipynb rename to osm/osm-1billion.ipynb index 7cb17836f..ef167098d 100644 --- a/osm-1billion.ipynb +++ b/osm/osm-1billion.ipynb @@ -26,9 +26,9 @@ "metadata": {}, "outputs": [], "source": [ + "import intake\n", "import datashader as ds\n", "import datashader.transfer_functions as tf\n", - "from datashader import spatial\n", "\n", "from colorcet import fire" ] @@ -39,8 +39,25 @@ "metadata": {}, "outputs": [], "source": [ - "%%time\n", - "df = spatial.read_parquet('../data/osm-1billion.parq')\n", + "cat = intake.open_catalog('./catalog.yml')\n", + "data_entry = cat.osm_one_billion\n", + "data_entry" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the file isn't downloaded yet. The following step will take some time:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = data_entry.to_spatial()\n", "df = df.persist()" ] }, From 5762f33af4c74f36b42924217fe2856bdb9953f7 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Thu, 23 May 2019 13:54:33 -0400 Subject: [PATCH 2/5] Putting test intake logic in dodo --- .travis.yml | 5 -- dodo.py | 106 +++++++++++++++++++++++++++----------- test_data/osm/catalog.yml | 6 +++ 3 files changed, 83 insertions(+), 34 deletions(-) create mode 100644 test_data/osm/catalog.yml diff --git a/.travis.yml b/.travis.yml index 3400740da..85dc231a2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -46,11 +46,6 @@ before_install: script: - doit small_data_setup --name $DIR - cd $DIR - - if ! anaconda-project list-downloads | grep -q 'No downloads'; then - if ! 
[ -d data ]; then - echo 'FAIL needs data and no test data found' && exit 1; - fi; - fi - $RUN_LINT && anaconda-project run lint || echo "skipping lint" - $RUN_TEST && anaconda-project run test || echo "skipping test" - cd .. diff --git a/dodo.py b/dodo.py index ac7ba868e..37a72e3e3 100644 --- a/dodo.py +++ b/dodo.py @@ -25,16 +25,28 @@ def task_ecosystem_setup(): 'default': 'attractors' } -def _prepare_paths(root, name, test_data): +def _prepare_paths(root, name, test_data, filename='catalog.yml'): if root == '': root = os.getcwd() root = os.path.abspath(root) test_data = test_data if os.path.isabs(test_data) else os.path.join(root, test_data) + test_path = os.path.join(test_data, name) + project_path = os.path.join(root, name) return { - 'real': os.path.join(root, name, 'data'), - 'test': os.path.join(test_data, name), + 'project': project_path, + 'real': os.path.join(project_path, 'data'), + 'test': test_path, + 'cat_real': os.path.join(project_path, filename), + 'cat_test': os.path.join(test_path, filename), + 'cat_tmp': os.path.join(project_path, 'tmp_' + filename), } +def _has_download(path, filename='anaconda-project.yml'): + import yaml + with open(os.path.join(path, filename), 'r') as f: + text = yaml.load(f, Loader=yaml.FullLoader) + return bool(text.get('downloads', {})) + # From https://stackoverflow.com/a/24860799/4021797 class dircmp(filecmp.dircmp): """ @@ -64,26 +76,51 @@ def is_same(dir1, dir2): return False return True - def task_small_data_setup(): """Copy small versions of the data from test_data""" - def copy_test_data(root='', name='attractors', test_data='test_data'): - print('Copying test data for {} ...'.format(name)) - paths = _prepare_paths(root, name, test_data) + def copy_test_data(root='', name='attractors', test_data='test_data', cat_filename='catalog.yml'): + print('Setting up test data for {}:'.format(name)) - if not os.path.exists(paths['test']): - print("Warning: No test_data found for {} in {}".format(name, paths['test'])) - 
return + paths = _prepare_paths(root, name, test_data, cat_filename) + has_catalog = os.path.exists(paths['cat_real']) + has_download = _has_download(paths['project']) + + if not os.path.exists(paths['test']) or not os.listdir(paths['test']): + if has_catalog or has_download: + raise ValueError("Fail: {} has no test_data".format(name)) + else: + print(" Nothing to do: Test data not needed for {}".format(name)) + print("Done!") + return + + if has_catalog and not os.path.exists(paths['cat_test']): + raise ValueError("Fail: {} contains intake catalog, but " + "no catalog found in test_data".format(name)) + + if has_catalog: + print('* Copying intake catalog ...') + + # move real catalog file to tmp if tmp doesn't exist + if os.path.exists(paths['cat_tmp']): + raise ValueError("Fail: Temp file already exists - try 'doit small_data_cleanup'") + os.rename(paths['cat_real'], paths['cat_tmp']) + # move test catalog to project directory + shutil.copyfile(paths['cat_test'], paths['cat_real']) + print(' Intake catalog successfully copied') + + print('* Copying test data ...') if os.path.exists(paths['real']) and os.listdir(paths['real']): matching_files = filecmp.dircmp(paths['test'], paths['real']).same_files if os.listdir(paths['real']) != matching_files: raise ValueError("Fail: Data files already exist in {}".format(paths['real'])) else: - print("Nothing to do: Test data already in {}".format(paths['real'])) - return - copy_tree(paths['test'], paths['real']) + print(" Nothing to do: Test data already in {}".format(paths['real'])) + else: + copy_tree(paths['test'], paths['real']) + print(' Test data sucessfully copied') + print("Done!") return {'actions': [copy_test_data], 'params': [name_param]} @@ -92,27 +129,38 @@ def copy_test_data(root='', name='attractors', test_data='test_data'): def task_small_data_cleanup(): """Remove test_data from real data path""" - def remove_test_data(root='', name='attractors', test_data='test_data'): - print('Removing test data for {} 
...'.format(name)) - paths = _prepare_paths(root, name, test_data) + def remove_test_data(root='', name='attractors', test_data='test_data', + cat_filename='catalog.yml'): + print('Cleaning up test data for {}:'.format(name)) + paths = _prepare_paths(root, name, test_data, cat_filename) - if not os.path.exists(paths['test']): - print("Nothing to do: No test_data found for {} in {}".format(name, paths['test'])) - return + if os.path.exists(paths['cat_real']): + print("* Replacing intake catalog ...") - if not os.path.exists(paths['real']): - print("Nothing to do: No data found in {}".format(paths['real'])) - return + if not os.path.exists(paths['cat_tmp']): + print(" Nothing to do: No temp file found. Use git status to " + "check that you have the real catalog at {}".format(paths['cat_real'])) + else: + os.remove(paths['cat_real']) + os.rename(paths['cat_tmp'], paths['cat_real']) + print(' Intake catalog successfully cleaned') - if not os.listdir(paths['real']): - print("No data found in {}, just removing empty dir".format(paths['real'])) - os.rmdir(paths['real']) - return + print('* Removing test data ...') - if not is_same(paths['test'], paths['real']): - raise ValueError("Fail: Data files at {} are not identical to test, so they shouldn't be deleted.".format(paths['real'])) + if not os.path.exists(paths['test']): + print(" Nothing to do: No test_data found for {} in {}".format(name, paths['test'])) + elif not os.path.exists(paths['real']): + print(" Nothing to do: No data found in {}".format(paths['real'])) + elif not os.listdir(paths['real']): + os.rmdir(paths['real']) + print(" No data found in {}, just removed empty dir".format(paths['real'])) + elif not is_same(paths['test'], paths['real']): + raise ValueError("Fail: Data files at {} are not identical to test, " + "so they shouldn't be deleted.".format(paths['real'])) + else: + shutil.rmtree(paths['real']) + print(' Test data successfully removed') - shutil.rmtree(paths['real']) print("Done!") return 
{'actions': [remove_test_data], 'params': [name_param]} diff --git a/test_data/osm/catalog.yml b/test_data/osm/catalog.yml new file mode 100644 index 000000000..ea994ed39 --- /dev/null +++ b/test_data/osm/catalog.yml @@ -0,0 +1,6 @@ +sources: + osm_one_billion: + description: Test data points to same fake osm-3billion file + driver: parquet + args: + urlpath: '{{ CATALOG_DIR }}/data/osm-3billion.parq' From 4812a30787ae800365cc49a1d9058de2d056e44a Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Thu, 23 May 2019 14:14:00 -0400 Subject: [PATCH 3/5] Adding yaml to setup and using git source [only:osm] --- dodo.py | 2 +- osm/anaconda-project.yml | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/dodo.py b/dodo.py index 37a72e3e3..d538291fc 100644 --- a/dodo.py +++ b/dodo.py @@ -14,7 +14,7 @@ def task_ecosystem_setup(): return {'actions': [ "conda config --set always_yes True", "conda update conda", - "conda install anaconda-project 'tornado<5.0'", + "conda install anaconda-project 'tornado<5.0' pyyaml", ]} diff --git a/osm/anaconda-project.yml b/osm/anaconda-project.yml index 5c601ede8..2f1481c57 100644 --- a/osm/anaconda-project.yml +++ b/osm/anaconda-project.yml @@ -29,12 +29,12 @@ commands: description: Datashading 2.7-billion-point Open Street Map database notebook: osm.ipynb test: - unix: pytest --nbsmoke-run -k *.ipynb --ignore envs - windows: pytest --nbsmoke-run -k *.ipynb --ignore envs + unix: pytest --nbsmoke-run --ignore envs + windows: pytest --nbsmoke-run --ignore envs env_spec: test lint: - unix: pytest --nbsmoke-lint -k *.ipynb --ignore envs - windows: pytest --nbsmoke-lint -k *.ipynb --ignore envs + unix: pytest --nbsmoke-lint --ignore envs + windows: pytest --nbsmoke-lint --ignore envs env_spec: test variables: @@ -50,10 +50,11 @@ env_specs: packages: - holoviews - intake - - intake-parquet - ipywidgets - jupyter - tqdm + - pip: + - git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet 
 test: channels: - pyviz @@ -63,7 +64,8 @@ env_specs: - pytest ==4.4.1 - holoviews - intake - intake-parquet - ipywidgets - jupyter - tqdm + - pip: + - git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet From 3a4f980e48c4a164e918a55f74b1c92c311fffcc Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Thu, 23 May 2019 14:21:29 -0400 Subject: [PATCH 4/5] Don't do yaml parsing in dodo [only:osm] --- .travis.yml | 5 +++++ dodo.py | 13 +++---------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index 85dc231a2..3400740da 100644 --- a/.travis.yml +++ b/.travis.yml @@ -46,6 +46,11 @@ before_install: script: - doit small_data_setup --name $DIR - cd $DIR + - if ! anaconda-project list-downloads | grep -q 'No downloads'; then + if ! [ -d data ]; then + echo 'FAIL needs data and no test data found' && exit 1; + fi; + fi - $RUN_LINT && anaconda-project run lint || echo "skipping lint" - $RUN_TEST && anaconda-project run test || echo "skipping test" - cd .. 
diff --git a/dodo.py b/dodo.py index d538291fc..b82d58142 100644 --- a/dodo.py +++ b/dodo.py @@ -14,7 +14,7 @@ def task_ecosystem_setup(): return {'actions': [ "conda config --set always_yes True", "conda update conda", - "conda install anaconda-project 'tornado<5.0' pyyaml", + "conda install anaconda-project 'tornado<5.0'", ]} @@ -41,12 +41,6 @@ def _prepare_paths(root, name, test_data, filename='catalog.yml'): 'cat_tmp': os.path.join(project_path, 'tmp_' + filename), } -def _has_download(path, filename='anaconda-project.yml'): - import yaml - with open(os.path.join(path, filename), 'r') as f: - text = yaml.load(f, Loader=yaml.FullLoader) - return bool(text.get('downloads', {})) - # From https://stackoverflow.com/a/24860799/4021797 class dircmp(filecmp.dircmp): """ @@ -84,13 +78,12 @@ def copy_test_data(root='', name='attractors', test_data='test_data', cat_filena paths = _prepare_paths(root, name, test_data, cat_filename) has_catalog = os.path.exists(paths['cat_real']) - has_download = _has_download(paths['project']) if not os.path.exists(paths['test']) or not os.listdir(paths['test']): - if has_catalog or has_download: + if has_catalog: raise ValueError("Fail: {} has no test_data".format(name)) else: - print(" Nothing to do: Test data not needed for {}".format(name)) + print(" Nothing to do: Test data not found for {}".format(name)) print("Done!") return From 6de43741e191f24af082d4ba02d0ff71985b932a Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Thu, 23 May 2019 17:23:20 -0400 Subject: [PATCH 5/5] Using s3 directly --- osm/anaconda-project.yml | 2 ++ osm/catalog.yml | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/osm/anaconda-project.yml b/osm/anaconda-project.yml index 2f1481c57..bd811eedf 100644 --- a/osm/anaconda-project.yml +++ b/osm/anaconda-project.yml @@ -52,6 +52,7 @@ env_specs: - intake - ipywidgets - jupyter + - s3fs - tqdm - pip: - git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet @@ -66,6 
+67,7 @@ env_specs: - intake - ipywidgets - jupyter + - s3fs - tqdm - pip: - git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet diff --git a/osm/catalog.yml b/osm/catalog.yml index c8ec3e8af..78aac79a0 100644 --- a/osm/catalog.yml +++ b/osm/catalog.yml @@ -4,7 +4,8 @@ sources: driver: parquet cache: - argkey: urlpath - regex: https://s3.amazonaws.com/datashader-data + regex: 's3://datashader-data/osm-1billion.parq.zip' type: compressed args: - urlpath: https://s3.amazonaws.com/datashader-data/osm-1billion.parq.zip + urlpath: 's3://datashader-data/osm-1billion.parq.zip' + storage_options: {'anon': True}