Intake-ifying osm #20

Closed
wants to merge 5 commits into from
99 changes: 70 additions & 29 deletions dodo.py
@@ -25,14 +25,20 @@ def task_ecosystem_setup():
'default': 'attractors'
}

def _prepare_paths(root, name, test_data):
def _prepare_paths(root, name, test_data, filename='catalog.yml'):
if root == '':
root = os.getcwd()
root = os.path.abspath(root)
test_data = test_data if os.path.isabs(test_data) else os.path.join(root, test_data)
test_path = os.path.join(test_data, name)
project_path = os.path.join(root, name)
return {
'real': os.path.join(root, name, 'data'),
'test': os.path.join(test_data, name),
'project': project_path,
'real': os.path.join(project_path, 'data'),
'test': test_path,
'cat_real': os.path.join(project_path, filename),
'cat_test': os.path.join(test_path, filename),
'cat_tmp': os.path.join(project_path, 'tmp_' + filename),
}

# From https://stackoverflow.com/a/24860799/4021797
@@ -64,26 +70,50 @@ def is_same(dir1, dir2):
return False
return True


def task_small_data_setup():
"""Copy small versions of the data from test_data"""

def copy_test_data(root='', name='attractors', test_data='test_data'):
print('Copying test data for {} ...'.format(name))
paths = _prepare_paths(root, name, test_data)
def copy_test_data(root='', name='attractors', test_data='test_data', cat_filename='catalog.yml'):
print('Setting up test data for {}:'.format(name))

if not os.path.exists(paths['test']):
print("Warning: No test_data found for {} in {}".format(name, paths['test']))
return
paths = _prepare_paths(root, name, test_data, cat_filename)
has_catalog = os.path.exists(paths['cat_real'])

if not os.path.exists(paths['test']) or not os.listdir(paths['test']):
if has_catalog:
raise ValueError("Fail: {} has no test_data".format(name))
else:
print(" Nothing to do: Test data not found for {}".format(name))
print("Done!")
return

if has_catalog and not os.path.exists(paths['cat_test']):
raise ValueError("Fail: {} contains intake catalog, but "
"no catalog found in test_data".format(name))

if has_catalog:
print('* Copying intake catalog ...')

# move real catalog file to tmp if tmp doesn't exist
if os.path.exists(paths['cat_tmp']):
raise ValueError("Fail: Temp file already exists - try 'doit small_data_cleanup'")
os.rename(paths['cat_real'], paths['cat_tmp'])

# move test catalog to project directory
shutil.copyfile(paths['cat_test'], paths['cat_real'])
print(' Intake catalog successfully copied')

print('* Copying test data ...')
if os.path.exists(paths['real']) and os.listdir(paths['real']):
matching_files = filecmp.dircmp(paths['test'], paths['real']).same_files
if os.listdir(paths['real']) != matching_files:
raise ValueError("Fail: Data files already exist in {}".format(paths['real']))
else:
print("Nothing to do: Test data already in {}".format(paths['real']))
return
copy_tree(paths['test'], paths['real'])
print(" Nothing to do: Test data already in {}".format(paths['real']))
else:
copy_tree(paths['test'], paths['real'])
print(' Test data successfully copied')

print("Done!")

return {'actions': [copy_test_data], 'params': [name_param]}
@@ -92,27 +122,38 @@ def copy_test_data(root='', name='attractors', test_data='test_data'):
def task_small_data_cleanup():
"""Remove test_data from real data path"""

def remove_test_data(root='', name='attractors', test_data='test_data'):
print('Removing test data for {} ...'.format(name))
paths = _prepare_paths(root, name, test_data)
def remove_test_data(root='', name='attractors', test_data='test_data',
cat_filename='catalog.yml'):
print('Cleaning up test data for {}:'.format(name))
paths = _prepare_paths(root, name, test_data, cat_filename)

if not os.path.exists(paths['test']):
print("Nothing to do: No test_data found for {} in {}".format(name, paths['test']))
return
if os.path.exists(paths['cat_real']):
print("* Replacing intake catalog ...")

if not os.path.exists(paths['real']):
print("Nothing to do: No data found in {}".format(paths['real']))
return
if not os.path.exists(paths['cat_tmp']):
print(" Nothing to do: No temp file found. Use git status to "
"check that you have the real catalog at {}".format(paths['cat_real']))
else:
os.remove(paths['cat_real'])
os.rename(paths['cat_tmp'], paths['cat_real'])
print(' Intake catalog successfully cleaned')

if not os.listdir(paths['real']):
print("No data found in {}, just removing empty dir".format(paths['real']))
os.rmdir(paths['real'])
return
print('* Removing test data ...')

if not is_same(paths['test'], paths['real']):
raise ValueError("Fail: Data files at {} are not identical to test, so they shouldn't be deleted.".format(paths['real']))
if not os.path.exists(paths['test']):
print(" Nothing to do: No test_data found for {} in {}".format(name, paths['test']))
elif not os.path.exists(paths['real']):
print(" Nothing to do: No data found in {}".format(paths['real']))
elif not os.listdir(paths['real']):
os.rmdir(paths['real'])
print(" No data found in {}, just removed empty dir".format(paths['real']))
elif not is_same(paths['test'], paths['real']):
raise ValueError("Fail: Data files at {} are not identical to test, "
"so they shouldn't be deleted.".format(paths['real']))
else:
shutil.rmtree(paths['real'])
print(' Test data successfully removed')

shutil.rmtree(paths['real'])
print("Done!")

return {'actions': [remove_test_data], 'params': [name_param]}
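
For reference, these tasks are normally driven from the command line as doit small_data_setup and doit small_data_cleanup, but the returned actions can also be called directly. A minimal sketch, assuming dodo.py is importable from the repository root and using osm purely as an example project name (the call signatures are taken from this diff; everything else is an assumption):

    # Hypothetical direct use of the task actions defined in dodo.py.
    from dodo import task_small_data_setup, task_small_data_cleanup

    setup = task_small_data_setup()['actions'][0]      # the copy_test_data closure
    cleanup = task_small_data_cleanup()['actions'][0]  # the remove_test_data closure

    setup(name='osm')    # stash osm/catalog.yml as tmp_catalog.yml, copy in the test catalog and data
    # ... run the notebooks/tests against the small data ...
    cleanup(name='osm')  # remove the test data and restore the real catalog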
46 changes: 38 additions & 8 deletions osm/anaconda-project.yml
@@ -1,6 +1,6 @@
# To reproduce: install 'anaconda-project', then 'anaconda-project run'
name: osm
description: Datashading 2.7-billion-point Open Street Map database
description: Datashading Open Street Map database
maintainers:
- jbednar

@@ -22,22 +22,52 @@ packages: *deps

commands:
notebook:
description: Datashading 1-billion-point Open Street Map database
notebook: osm-1billion.ipynb
env_spec: spatial
osm-3billion:
description: Datashading 2.7-billion-point Open Street Map database
notebook: osm.ipynb
test:
unix: pytest --nbsmoke-run -k *.ipynb --ignore envs
windows: pytest --nbsmoke-run -k *.ipynb --ignore envs
unix: pytest --nbsmoke-run --ignore envs
windows: pytest --nbsmoke-run --ignore envs
env_spec: test
lint:
unix: pytest --nbsmoke-lint -k *.ipynb --ignore envs
windows: pytest --nbsmoke-lint -k *.ipynb --ignore envs
unix: pytest --nbsmoke-lint --ignore envs
windows: pytest --nbsmoke-lint --ignore envs
env_spec: test

variables: {}
variables:
INTAKE_CACHE_DIR: data
downloads: {}

env_specs:
default: {}
spatial:
channels:
- pyviz
- intake
packages:
- holoviews
- intake
- ipywidgets
- jupyter
- s3fs
- tqdm
- pip:
- git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet
jbednar marked this conversation as resolved.
test:
channels:
- pyviz
- intake
packages:
- nbsmoke ==0.2.8
- pytest ==4.4.1
- nbsmoke ==0.2.8
- pytest ==4.4.1
- holoviews
- intake
- ipywidgets
- jupyter
- s3fs
- tqdm
- pip:
- git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet
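
With these env specs in place the project is still exercised through anaconda-project, as the header comment says: anaconda-project run notebook opens the 1-billion-point notebook, anaconda-project run osm-3billion the full database, and anaconda-project run test / anaconda-project run lint drive CI (command names as declared above; the anaconda-project run <command> form is the standard invocation). Setting INTAKE_CACHE_DIR to data presumably keeps intake's downloaded files under the project's data/ directory, the same place the doit tasks put the small test data.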
11 changes: 11 additions & 0 deletions osm/catalog.yml
@@ -0,0 +1,11 @@
sources:
osm_one_billion:
description: Open Street Map database as a parquet file (1-billion-point)
driver: parquet
cache:
- argkey: urlpath
regex: 's3://datashader-data/osm-1billion.parq.zip'
type: compressed
args:
urlpath: 's3://datashader-data/osm-1billion.parq.zip'
storage_options: {'anon': True}
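
The notebook consumes this entry as shown in the notebook diff that follows; a condensed sketch (the code lines are taken from the updated osm-1billion.ipynb, while the comment on caching is an assumption about intake's behaviour given the cache block above):

    import intake

    cat = intake.open_catalog('./catalog.yml')
    data_entry = cat.osm_one_billion
    # The first access downloads and decompresses s3://datashader-data/osm-1billion.parq.zip
    # into the intake cache dir (INTAKE_CACHE_DIR); later runs should reuse the cached copy.
    df = data_entry.to_spatial()
    df = df.persist()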
23 changes: 20 additions & 3 deletions osm-1billion.ipynb → osm/osm-1billion.ipynb
@@ -26,9 +26,9 @@
"metadata": {},
"outputs": [],
"source": [
"import intake\n",
"import datashader as ds\n",
"import datashader.transfer_functions as tf\n",
"from datashader import spatial\n",
"\n",
"from colorcet import fire"
]
@@ -39,8 +39,25 @@
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"df = spatial.read_parquet('../data/osm-1billion.parq')\n",
"cat = intake.open_catalog('./catalog.yml')\n",
"data_entry = cat.osm_one_billion\n",
"data_entry"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that the file isn't downloaded yet. The following step will take some time:"
Contributor comment:
How do you know at this point whether it's been downloaded yet? Won't it be cached? Seems like it should say instead that "Note that the first time this cell is executed, the file will take some time to download, but subsequent runs will skip that step".

]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = data_entry.to_spatial()\n",
"df = df.persist()"
]
},
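On the reviewer's caching question: with INTAKE_CACHE_DIR set to data, whether the download has already happened can be judged from the cache directory rather than asserted in the prose. A rough, illustrative check only (the layout inside intake's cache directory is an implementation detail, so treat this as a sketch):

    import os

    cache_dir = os.environ.get('INTAKE_CACHE_DIR', 'data')
    already_cached = os.path.isdir(cache_dir) and bool(os.listdir(cache_dir))
    # If the cache is already populated, data_entry.to_spatial() should skip the
    # download, matching the suggested wording: slow on the first run, fast afterwards.
    print('cache populated' if already_cached else 'first run: expect a long download')
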
6 changes: 6 additions & 0 deletions test_data/osm/catalog.yml
@@ -0,0 +1,6 @@
sources:
osm_one_billion:
description: Test data points to same fake osm-3billion file
Contributor comment:
Same as what? Try to reword in a way that makes sense when reading just this one file.

driver: parquet
args:
urlpath: '{{ CATALOG_DIR }}/data/osm-3billion.parq'
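
Note on the urlpath: intake expands {{ CATALOG_DIR }} to the directory containing the catalog file, so once doit small_data_setup copies this file to osm/catalog.yml the entry resolves to osm/data/osm-3billion.parq, i.e. the small data that the same task copies into place.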