diff --git a/dodo.py b/dodo.py index ac7ba868e..b82d58142 100644 --- a/dodo.py +++ b/dodo.py @@ -25,14 +25,20 @@ def task_ecosystem_setup(): 'default': 'attractors' } -def _prepare_paths(root, name, test_data): +def _prepare_paths(root, name, test_data, filename='catalog.yml'): if root == '': root = os.getcwd() root = os.path.abspath(root) test_data = test_data if os.path.isabs(test_data) else os.path.join(root, test_data) + test_path = os.path.join(test_data, name) + project_path = os.path.join(root, name) return { - 'real': os.path.join(root, name, 'data'), - 'test': os.path.join(test_data, name), + 'project': project_path, + 'real': os.path.join(project_path, 'data'), + 'test': test_path, + 'cat_real': os.path.join(project_path, filename), + 'cat_test': os.path.join(test_path, filename), + 'cat_tmp': os.path.join(project_path, 'tmp_' + filename), } # From https://stackoverflow.com/a/24860799/4021797 @@ -64,26 +70,50 @@ def is_same(dir1, dir2): return False return True - def task_small_data_setup(): """Copy small versions of the data from test_data""" - def copy_test_data(root='', name='attractors', test_data='test_data'): - print('Copying test data for {} ...'.format(name)) - paths = _prepare_paths(root, name, test_data) + def copy_test_data(root='', name='attractors', test_data='test_data', cat_filename='catalog.yml'): + print('Setting up test data for {}:'.format(name)) - if not os.path.exists(paths['test']): - print("Warning: No test_data found for {} in {}".format(name, paths['test'])) - return + paths = _prepare_paths(root, name, test_data, cat_filename) + has_catalog = os.path.exists(paths['cat_real']) + + if not os.path.exists(paths['test']) or not os.listdir(paths['test']): + if has_catalog: + raise ValueError("Fail: {} has no test_data".format(name)) + else: + print(" Nothing to do: Test data not found for {}".format(name)) + print("Done!") + return + + if has_catalog and not os.path.exists(paths['cat_test']): + raise ValueError("Fail: {} contains 
intake catalog, but " + "no catalog found in test_data".format(name)) + + if has_catalog: + print('* Copying intake catalog ...') + + # move real catalog file to tmp if tmp doesn't exist + if os.path.exists(paths['cat_tmp']): + raise ValueError("Fail: Temp file already exists - try 'doit small_data_cleanup'") + os.rename(paths['cat_real'], paths['cat_tmp']) + # copy test catalog to project directory + shutil.copyfile(paths['cat_test'], paths['cat_real']) + print(' Intake catalog successfully copied') + + print('* Copying test data ...') if os.path.exists(paths['real']) and os.listdir(paths['real']): matching_files = filecmp.dircmp(paths['test'], paths['real']).same_files if os.listdir(paths['real']) != matching_files: raise ValueError("Fail: Data files already exist in {}".format(paths['real'])) else: - print("Nothing to do: Test data already in {}".format(paths['real'])) - return - copy_tree(paths['test'], paths['real']) + print(" Nothing to do: Test data already in {}".format(paths['real'])) + else: + copy_tree(paths['test'], paths['real']) + print(' Test data successfully copied') + print("Done!") return {'actions': [copy_test_data], 'params': [name_param]} @@ -92,27 +122,38 @@ def copy_test_data(root='', name='attractors', test_data='test_data'): def task_small_data_cleanup(): """Remove test_data from real data path""" - def remove_test_data(root='', name='attractors', test_data='test_data'): - print('Removing test data for {} ...'.format(name)) - paths = _prepare_paths(root, name, test_data) + def remove_test_data(root='', name='attractors', test_data='test_data', + cat_filename='catalog.yml'): + print('Cleaning up test data for {}:'.format(name)) + paths = _prepare_paths(root, name, test_data, cat_filename) - if not os.path.exists(paths['test']): - print("Nothing to do: No test_data found for {} in {}".format(name, paths['test'])) - return + if os.path.exists(paths['cat_real']): + print("* Replacing intake catalog ...") - if not os.path.exists(paths['real']):
- print("Nothing to do: No data found in {}".format(paths['real'])) - return + if not os.path.exists(paths['cat_tmp']): + print(" Nothing to do: No temp file found. Use git status to " + "check that you have the real catalog at {}".format(paths['cat_real'])) + else: + os.remove(paths['cat_real']) + os.rename(paths['cat_tmp'], paths['cat_real']) + print(' Intake catalog successfully cleaned') - if not os.listdir(paths['real']): - print("No data found in {}, just removing empty dir".format(paths['real'])) - os.rmdir(paths['real']) - return + print('* Removing test data ...') - if not is_same(paths['test'], paths['real']): - raise ValueError("Fail: Data files at {} are not identical to test, so they shouldn't be deleted.".format(paths['real'])) + if not os.path.exists(paths['test']): + print(" Nothing to do: No test_data found for {} in {}".format(name, paths['test'])) + elif not os.path.exists(paths['real']): + print(" Nothing to do: No data found in {}".format(paths['real'])) + elif not os.listdir(paths['real']): + os.rmdir(paths['real']) + print(" No data found in {}, just removed empty dir".format(paths['real'])) + elif not is_same(paths['test'], paths['real']): + raise ValueError("Fail: Data files at {} are not identical to test, " + "so they shouldn't be deleted.".format(paths['real'])) + else: + shutil.rmtree(paths['real']) + print(' Test data successfully removed') - shutil.rmtree(paths['real']) print("Done!") return {'actions': [remove_test_data], 'params': [name_param]} diff --git a/osm/anaconda-project.yml b/osm/anaconda-project.yml index 4422e85ae..bd811eedf 100644 --- a/osm/anaconda-project.yml +++ b/osm/anaconda-project.yml @@ -1,6 +1,6 @@ # To reproduce: install 'anaconda-project', then 'anaconda-project run' name: osm -description: Datashading 2.7-billion-point Open Street Map database +description: Datashading Open Street Map database maintainers: - jbednar @@ -22,22 +22,52 @@ packages: *deps commands: notebook: + description: Datashading 
1-billion-point Open Street Map database + notebook: osm-1billion.ipynb + env_spec: spatial + osm-3billion: + description: Datashading 2.7-billion-point Open Street Map database notebook: osm.ipynb test: - unix: pytest --nbsmoke-run -k *.ipynb --ignore envs - windows: pytest --nbsmoke-run -k *.ipynb --ignore envs + unix: pytest --nbsmoke-run --ignore envs + windows: pytest --nbsmoke-run --ignore envs env_spec: test lint: - unix: pytest --nbsmoke-lint -k *.ipynb --ignore envs - windows: pytest --nbsmoke-lint -k *.ipynb --ignore envs + unix: pytest --nbsmoke-lint --ignore envs + windows: pytest --nbsmoke-lint --ignore envs env_spec: test -variables: {} +variables: + INTAKE_CACHE_DIR: data downloads: {} env_specs: default: {} + spatial: + channels: + - pyviz + - intake + packages: + - holoviews + - intake + - ipywidgets + - jupyter + - s3fs + - tqdm + - pip: + - git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet test: + channels: + - pyviz + - intake packages: - - nbsmoke ==0.2.8 - - pytest ==4.4.1 + - nbsmoke ==0.2.8 + - pytest ==4.4.1 + - holoviews + - intake + - ipywidgets + - jupyter + - s3fs + - tqdm + - pip: + - git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet diff --git a/osm/catalog.yml b/osm/catalog.yml new file mode 100644 index 000000000..78aac79a0 --- /dev/null +++ b/osm/catalog.yml @@ -0,0 +1,11 @@ +sources: + osm_one_billion: + description: Open Street Map database as a parquet file (1-billion-point) + driver: parquet + cache: + - argkey: urlpath + regex: 's3://datashader-data/osm-1billion.parq.zip' + type: compressed + args: + urlpath: 's3://datashader-data/osm-1billion.parq.zip' + storage_options: {'anon': True} diff --git a/osm-1billion.ipynb b/osm/osm-1billion.ipynb similarity index 94% rename from osm-1billion.ipynb rename to osm/osm-1billion.ipynb index 7cb17836f..ef167098d 100644 --- a/osm-1billion.ipynb +++ b/osm/osm-1billion.ipynb @@ -26,9 +26,9 @@ "metadata": {}, "outputs": 
[], "source": [ + "import intake\n", "import datashader as ds\n", "import datashader.transfer_functions as tf\n", - "from datashader import spatial\n", "\n", "from colorcet import fire" ] @@ -39,8 +39,25 @@ "metadata": {}, "outputs": [], "source": [ - "%%time\n", - "df = spatial.read_parquet('../data/osm-1billion.parq')\n", + "cat = intake.open_catalog('./catalog.yml')\n", + "data_entry = cat.osm_one_billion\n", + "data_entry" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the file isn't downloaded yet. The following step will take some time:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = data_entry.to_spatial()\n", "df = df.persist()" ] }, diff --git a/test_data/osm/catalog.yml b/test_data/osm/catalog.yml new file mode 100644 index 000000000..ea994ed39 --- /dev/null +++ b/test_data/osm/catalog.yml @@ -0,0 +1,6 @@ +sources: + osm_one_billion: + description: Test data points to same fake osm-3billion file + driver: parquet + args: + urlpath: '{{ CATALOG_DIR }}/data/osm-3billion.parq'