Intake-ifying osm #20

Closed
wants to merge 5 commits into from
99 changes: 70 additions & 29 deletions dodo.py
@@ -25,14 +25,20 @@ def task_ecosystem_setup():
'default': 'attractors'
}

def _prepare_paths(root, name, test_data):
def _prepare_paths(root, name, test_data, filename='catalog.yml'):
if root == '':
root = os.getcwd()
root = os.path.abspath(root)
test_data = test_data if os.path.isabs(test_data) else os.path.join(root, test_data)
test_path = os.path.join(test_data, name)
project_path = os.path.join(root, name)
return {
'real': os.path.join(root, name, 'data'),
'test': os.path.join(test_data, name),
'project': project_path,
'real': os.path.join(project_path, 'data'),
'test': test_path,
'cat_real': os.path.join(project_path, filename),
'cat_test': os.path.join(test_path, filename),
'cat_tmp': os.path.join(project_path, 'tmp_' + filename),
}

# From https://stackoverflow.com/a/24860799/4021797
@@ -64,26 +70,50 @@ def is_same(dir1, dir2):
return False
return True


def task_small_data_setup():
"""Copy small versions of the data from test_data"""

def copy_test_data(root='', name='attractors', test_data='test_data'):
print('Copying test data for {} ...'.format(name))
paths = _prepare_paths(root, name, test_data)
def copy_test_data(root='', name='attractors', test_data='test_data', cat_filename='catalog.yml'):
print('Setting up test data for {}:'.format(name))

if not os.path.exists(paths['test']):
print("Warning: No test_data found for {} in {}".format(name, paths['test']))
return
paths = _prepare_paths(root, name, test_data, cat_filename)
has_catalog = os.path.exists(paths['cat_real'])

if not os.path.exists(paths['test']) or not os.listdir(paths['test']):
if has_catalog:
raise ValueError("Fail: {} has no test_data".format(name))
else:
print(" Nothing to do: Test data not found for {}".format(name))
print("Done!")
return

if has_catalog and not os.path.exists(paths['cat_test']):
raise ValueError("Fail: {} contains intake catalog, but "
"no catalog found in test_data".format(name))

if has_catalog:
print('* Copying intake catalog ...')

# move real catalog file to tmp if tmp doesn't exist
if os.path.exists(paths['cat_tmp']):
raise ValueError("Fail: Temp file already exists - try 'doit small_data_cleanup'")
os.rename(paths['cat_real'], paths['cat_tmp'])

# move test catalog to project directory
shutil.copyfile(paths['cat_test'], paths['cat_real'])
print(' Intake catalog successfully copied')

print('* Copying test data ...')
if os.path.exists(paths['real']) and os.listdir(paths['real']):
matching_files = filecmp.dircmp(paths['test'], paths['real']).same_files
if os.listdir(paths['real']) != matching_files:
raise ValueError("Fail: Data files already exist in {}".format(paths['real']))
else:
print("Nothing to do: Test data already in {}".format(paths['real']))
return
copy_tree(paths['test'], paths['real'])
print(" Nothing to do: Test data already in {}".format(paths['real']))
else:
copy_tree(paths['test'], paths['real'])
print(' Test data successfully copied')

print("Done!")

return {'actions': [copy_test_data], 'params': [name_param]}
@@ -92,27 +122,38 @@ def copy_test_data(root='', name='attractors', test_data='test_data'):
def task_small_data_cleanup():
"""Remove test_data from real data path"""

def remove_test_data(root='', name='attractors', test_data='test_data'):
print('Removing test data for {} ...'.format(name))
paths = _prepare_paths(root, name, test_data)
def remove_test_data(root='', name='attractors', test_data='test_data',
cat_filename='catalog.yml'):
print('Cleaning up test data for {}:'.format(name))
paths = _prepare_paths(root, name, test_data, cat_filename)

if not os.path.exists(paths['test']):
print("Nothing to do: No test_data found for {} in {}".format(name, paths['test']))
return
if os.path.exists(paths['cat_real']):
print("* Replacing intake catalog ...")

if not os.path.exists(paths['real']):
print("Nothing to do: No data found in {}".format(paths['real']))
return
if not os.path.exists(paths['cat_tmp']):
print(" Nothing to do: No temp file found. Use git status to "
"check that you have the real catalog at {}".format(paths['cat_real']))
else:
os.remove(paths['cat_real'])
os.rename(paths['cat_tmp'], paths['cat_real'])
print(' Intake catalog successfully cleaned')

if not os.listdir(paths['real']):
print("No data found in {}, just removing empty dir".format(paths['real']))
os.rmdir(paths['real'])
return
print('* Removing test data ...')

if not is_same(paths['test'], paths['real']):
raise ValueError("Fail: Data files at {} are not identical to test, so they shouldn't be deleted.".format(paths['real']))
if not os.path.exists(paths['test']):
print(" Nothing to do: No test_data found for {} in {}".format(name, paths['test']))
elif not os.path.exists(paths['real']):
print(" Nothing to do: No data found in {}".format(paths['real']))
elif not os.listdir(paths['real']):
os.rmdir(paths['real'])
print(" No data found in {}, just removed empty dir".format(paths['real']))
elif not is_same(paths['test'], paths['real']):
raise ValueError("Fail: Data files at {} are not identical to test, "
"so they shouldn't be deleted.".format(paths['real']))
else:
shutil.rmtree(paths['real'])
print(' Test data successfully removed')

shutil.rmtree(paths['real'])
print("Done!")

return {'actions': [remove_test_data], 'params': [name_param]}
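
For reference, these tasks are normally driven from the command line as doit small_data_setup and doit small_data_cleanup, but the returned actions can also be called directly. A minimal sketch, assuming dodo.py is importable from the repository root and using osm purely as an example project name (the call signatures are taken from this diff; everything else is an assumption):

    # Hypothetical direct use of the task actions defined in dodo.py.
    from dodo import task_small_data_setup, task_small_data_cleanup

    setup = task_small_data_setup()['actions'][0]      # the copy_test_data closure
    cleanup = task_small_data_cleanup()['actions'][0]  # the remove_test_data closure

    setup(name='osm')    # stash osm/catalog.yml as tmp_catalog.yml, copy in the test catalog and data
    # ... run the notebooks/tests against the small data ...
    cleanup(name='osm')  # remove the test data and restore the real catalog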
46 changes: 38 additions & 8 deletions osm/anaconda-project.yml
@@ -1,6 +1,6 @@
# To reproduce: install 'anaconda-project', then 'anaconda-project run'
name: osm
description: Datashading 2.7-billion-point Open Street Map database
description: Datashading Open Street Map database
maintainers:
- jbednar

@@ -22,22 +22,52 @@ packages: *deps

commands:
notebook:
description: Datashading 1-billion-point Open Street Map database
notebook: osm-1billion.ipynb
env_spec: spatial
osm-3billion:
description: Datashading 2.7-billion-point Open Street Map database
notebook: osm.ipynb
test:
unix: pytest --nbsmoke-run -k *.ipynb --ignore envs
windows: pytest --nbsmoke-run -k *.ipynb --ignore envs
unix: pytest --nbsmoke-run --ignore envs
windows: pytest --nbsmoke-run --ignore envs
env_spec: test
lint:
unix: pytest --nbsmoke-lint -k *.ipynb --ignore envs
windows: pytest --nbsmoke-lint -k *.ipynb --ignore envs
unix: pytest --nbsmoke-lint --ignore envs
windows: pytest --nbsmoke-lint --ignore envs
env_spec: test

variables: {}
variables:
INTAKE_CACHE_DIR: data
downloads: {}

env_specs:
default: {}
spatial:
channels:
- pyviz
- intake
packages:
- holoviews
- intake
- ipywidgets
- jupyter
- s3fs
- tqdm
- pip:
- git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet
jbednar marked this conversation as resolved.
test:
channels:
- pyviz
- intake
packages:
- nbsmoke ==0.2.8
- pytest ==4.4.1
- nbsmoke ==0.2.8
- pytest ==4.4.1
- holoviews
- intake
- ipywidgets
- jupyter
- s3fs
- tqdm
- pip:
- git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet
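
With these env specs in place the project is still exercised through anaconda-project, as the header comment says: anaconda-project run notebook opens the 1-billion-point notebook, anaconda-project run osm-3billion the full database, and anaconda-project run test / anaconda-project run lint drive CI (command names as declared above; the anaconda-project run <command> form is the standard invocation). Setting INTAKE_CACHE_DIR to data presumably keeps intake's downloaded files under the project's data/ directory, the same place the doit tasks put the small test data.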
11 changes: 11 additions & 0 deletions osm/catalog.yml
@@ -0,0 +1,11 @@
sources:
osm_one_billion:
description: Open Street Map database as a parquet file (1-billion-point)
driver: parquet
cache:
- argkey: urlpath
regex: 's3://datashader-data/osm-1billion.parq.zip'
type: compressed
args:
urlpath: 's3://datashader-data/osm-1billion.parq.zip'
storage_options: {'anon': True}
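
The notebook consumes this entry as shown in the notebook diff that follows; a condensed sketch (the code lines are taken from the updated osm-1billion.ipynb, while the comment on caching is an assumption about intake's behaviour given the cache block above):

    import intake

    cat = intake.open_catalog('./catalog.yml')
    data_entry = cat.osm_one_billion
    # The first access downloads and decompresses s3://datashader-data/osm-1billion.parq.zip
    # into the intake cache dir (INTAKE_CACHE_DIR); later runs should reuse the cached copy.
    df = data_entry.to_spatial()
    df = df.persist()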
23 changes: 20 additions & 3 deletions osm-1billion.ipynb → osm/osm-1billion.ipynb
@@ -26,9 +26,9 @@
"metadata": {},
"outputs": [],
"source": [
"import intake\n",
"import datashader as ds\n",
"import datashader.transfer_functions as tf\n",
"from datashader import spatial\n",
"\n",
"from colorcet import fire"
]
@@ -39,8 +39,25 @@
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"df = spatial.read_parquet('../data/osm-1billion.parq')\n",
"cat = intake.open_catalog('./catalog.yml')\n",
"data_entry = cat.osm_one_billion\n",
"data_entry"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that the file isn't downloaded yet. The following step will take some time:"
Contributor comment:
How do you know at this point whether it's been downloaded yet? Won't it be cached? Seems like it should say instead that "Note that the first time this cell is executed, the file will take some time to download, but subsequent runs will skip that step".

]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = data_entry.to_spatial()\n",
"df = df.persist()"
]
},
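On the reviewer's caching question: with INTAKE_CACHE_DIR set to data, whether the download has already happened can be judged from the cache directory rather than asserted in the prose. A rough, illustrative check only (the layout inside intake's cache directory is an implementation detail, so treat this as a sketch):

    import os

    cache_dir = os.environ.get('INTAKE_CACHE_DIR', 'data')
    already_cached = os.path.isdir(cache_dir) and bool(os.listdir(cache_dir))
    # If the cache is already populated, data_entry.to_spatial() should skip the
    # download, matching the suggested wording: slow on the first run, fast afterwards.
    print('cache populated' if already_cached else 'first run: expect a long download')
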
6 changes: 6 additions & 0 deletions test_data/osm/catalog.yml
@@ -0,0 +1,6 @@
sources:
osm_one_billion:
description: Test data points to same fake osm-3billion file
Contributor comment:
Same as what? Try to reword in a way that makes sense when reading just this one file.

driver: parquet
args:
urlpath: '{{ CATALOG_DIR }}/data/osm-3billion.parq'
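
Note on the urlpath: intake expands {{ CATALOG_DIR }} to the directory containing the catalog file, so once doit small_data_setup copies this file to osm/catalog.yml the entry resolves to osm/data/osm-3billion.parq, i.e. the small data that the same task copies into place.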