From 8a533cee919f1e5ac47546388573b2b723b6c875 Mon Sep 17 00:00:00 2001
From: Julia Signell <jsignell@gmail.com>
Date: Thu, 23 May 2019 13:53:10 -0400
Subject: [PATCH 1/5] Intakifying osm-1billion

---
 osm/anaconda-project.yml                     | 34 +++++++++++++++++---
 osm/catalog.yml                              | 10 ++++++
 osm-1billion.ipynb => osm/osm-1billion.ipynb | 23 +++++++++++--
 3 files changed, 60 insertions(+), 7 deletions(-)
 create mode 100644 osm/catalog.yml
 rename osm-1billion.ipynb => osm/osm-1billion.ipynb (94%)

diff --git a/osm/anaconda-project.yml b/osm/anaconda-project.yml
index 4422e85ae..5c601ede8 100644
--- a/osm/anaconda-project.yml
+++ b/osm/anaconda-project.yml
@@ -1,6 +1,6 @@
 # To reproduce: install 'anaconda-project', then 'anaconda-project run'
 name: osm
-description: Datashading 2.7-billion-point Open Street Map database
+description: Datashading Open Street Map database
 maintainers:
   - jbednar
 
@@ -22,6 +22,11 @@ packages: *deps
 
 commands:
   notebook:
+    description: Datashading 1-billion-point Open Street Map database
+    notebook: osm-1billion.ipynb
+    env_spec: spatial
+  osm-3billion:
+    description: Datashading 2.7-billion-point Open Street Map database
     notebook: osm.ipynb
   test:
     unix:    pytest --nbsmoke-run -k *.ipynb --ignore envs
@@ -32,12 +37,33 @@ commands:
     windows: pytest --nbsmoke-lint -k *.ipynb --ignore envs
     env_spec: test
 
-variables: {}
+variables:
+  INTAKE_CACHE_DIR: data
 downloads: {}
 
 env_specs:
   default: {}
+  spatial:
+    channels:
+      - pyviz
+      - intake
+    packages:
+      - holoviews
+      - intake
+      - intake-parquet
+      - ipywidgets
+      - jupyter
+      - tqdm
   test:
+    channels:
+      - pyviz
+      - intake
     packages:
-    - nbsmoke ==0.2.8
-    - pytest ==4.4.1
+      - nbsmoke ==0.2.8
+      - pytest ==4.4.1
+      - holoviews
+      - intake
+      - intake-parquet
+      - ipywidgets
+      - jupyter
+      - tqdm
diff --git a/osm/catalog.yml b/osm/catalog.yml
new file mode 100644
index 000000000..c8ec3e8af
--- /dev/null
+++ b/osm/catalog.yml
@@ -0,0 +1,10 @@
+sources:
+  osm_one_billion:
+    description: Open Street Map database as a parquet file (1-billion-point)
+    driver: parquet
+    cache:
+      - argkey: urlpath
+        regex: https://s3.amazonaws.com/datashader-data
+        type: compressed
+    args:
+      urlpath: https://s3.amazonaws.com/datashader-data/osm-1billion.parq.zip
diff --git a/osm-1billion.ipynb b/osm/osm-1billion.ipynb
similarity index 94%
rename from osm-1billion.ipynb
rename to osm/osm-1billion.ipynb
index 7cb17836f..ef167098d 100644
--- a/osm-1billion.ipynb
+++ b/osm/osm-1billion.ipynb
@@ -26,9 +26,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import intake\n",
     "import datashader as ds\n",
     "import datashader.transfer_functions as tf\n",
-    "from datashader import spatial\n",
     "\n",
     "from colorcet import fire"
    ]
@@ -39,8 +39,25 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%%time\n",
-    "df = spatial.read_parquet('../data/osm-1billion.parq')\n",
+    "cat = intake.open_catalog('./catalog.yml')\n",
+    "data_entry = cat.osm_one_billion\n",
+    "data_entry"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note that the file isn't downloaded yet. The following step will take some time:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = data_entry.to_spatial()\n",
     "df = df.persist()"
    ]
   },

From 5762f33af4c74f36b42924217fe2856bdb9953f7 Mon Sep 17 00:00:00 2001
From: Julia Signell <jsignell@gmail.com>
Date: Thu, 23 May 2019 13:54:33 -0400
Subject: [PATCH 2/5] Putting test intake logic in dodo

---
 .travis.yml               |   5 --
 dodo.py                   | 106 +++++++++++++++++++++++++++-----------
 test_data/osm/catalog.yml |   6 +++
 3 files changed, 83 insertions(+), 34 deletions(-)
 create mode 100644 test_data/osm/catalog.yml

diff --git a/.travis.yml b/.travis.yml
index 3400740da..85dc231a2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -46,11 +46,6 @@ before_install:
 script:
   - doit small_data_setup --name $DIR
   - cd $DIR
-  - if ! anaconda-project list-downloads | grep -q 'No downloads'; then
-      if ! [ -d data ]; then
-        echo 'FAIL needs data and no test data found' && exit 1;
-      fi;
-    fi
   - $RUN_LINT && anaconda-project run lint || echo "skipping lint"
   - $RUN_TEST && anaconda-project run test || echo "skipping test"
   - cd ..
diff --git a/dodo.py b/dodo.py
index ac7ba868e..37a72e3e3 100644
--- a/dodo.py
+++ b/dodo.py
@@ -25,16 +25,28 @@ def task_ecosystem_setup():
     'default': 'attractors'
 }
 
-def _prepare_paths(root, name, test_data):
+def _prepare_paths(root, name, test_data, filename='catalog.yml'):
     if root == '':
         root = os.getcwd()
     root = os.path.abspath(root)
     test_data = test_data if os.path.isabs(test_data) else os.path.join(root, test_data)
+    test_path = os.path.join(test_data, name)
+    project_path = os.path.join(root, name)
     return {
-        'real': os.path.join(root, name, 'data'),
-        'test': os.path.join(test_data, name),
+        'project': project_path,
+        'real': os.path.join(project_path, 'data'),
+        'test': test_path,
+        'cat_real': os.path.join(project_path, filename),
+        'cat_test': os.path.join(test_path, filename),
+        'cat_tmp': os.path.join(project_path, 'tmp_' + filename),
     }
 
+def _has_download(path, filename='anaconda-project.yml'):
+    import yaml
+    with open(os.path.join(path, filename), 'r') as f:
+        text = yaml.load(f, Loader=yaml.FullLoader)
+        return bool(text.get('downloads', {}))
+
 # From https://stackoverflow.com/a/24860799/4021797
 class dircmp(filecmp.dircmp):
     """
@@ -64,26 +76,51 @@ def is_same(dir1, dir2):
             return False
     return True
 
-
 def task_small_data_setup():
     """Copy small versions of the data from test_data"""
 
-    def copy_test_data(root='', name='attractors', test_data='test_data'):
-        print('Copying test data for {} ...'.format(name))
-        paths = _prepare_paths(root, name, test_data)
+    def copy_test_data(root='', name='attractors', test_data='test_data', cat_filename='catalog.yml'):
+        print('Setting up test data for {}:'.format(name))
 
-        if not os.path.exists(paths['test']):
-            print("Warning: No test_data found for {} in {}".format(name, paths['test']))
-            return
+        paths = _prepare_paths(root, name, test_data, cat_filename)
+        has_catalog = os.path.exists(paths['cat_real'])
+        has_download = _has_download(paths['project'])
+
+        if not os.path.exists(paths['test']) or not os.listdir(paths['test']):
+            if has_catalog or has_download:
+                raise ValueError("Fail: {} has no test_data".format(name))
+            else:
+                print("  Nothing to do: Test data not needed for {}".format(name))
+                print("Done!")
+                return
+
+        if has_catalog and not os.path.exists(paths['cat_test']):
+            raise ValueError("Fail: {} contains intake catalog, but "
+                             "no catalog found in test_data".format(name))
+
+        if has_catalog:
+            print('* Copying intake catalog ...')
+
+            # move real catalog file to tmp if tmp doesn't exist
+            if os.path.exists(paths['cat_tmp']):
+                raise ValueError("Fail: Temp file already exists - try 'doit small_data_cleanup'")
+            os.rename(paths['cat_real'], paths['cat_tmp'])
 
+            # copy test catalog into project directory
+            shutil.copyfile(paths['cat_test'], paths['cat_real'])
+            print('  Intake catalog successfully copied')
+
+        print('* Copying test data ...')
         if os.path.exists(paths['real']) and os.listdir(paths['real']):
             matching_files = filecmp.dircmp(paths['test'], paths['real']).same_files
             if os.listdir(paths['real']) != matching_files:
                 raise ValueError("Fail: Data files already exist in {}".format(paths['real']))
             else:
-                print("Nothing to do: Test data already in {}".format(paths['real']))
-                return
-        copy_tree(paths['test'], paths['real'])
+                print("  Nothing to do: Test data already in {}".format(paths['real']))
+        else:
+            copy_tree(paths['test'], paths['real'])
+            print('  Test data sucessfully copied')
+
         print("Done!")
 
     return {'actions': [copy_test_data], 'params': [name_param]}
@@ -92,27 +129,38 @@ def copy_test_data(root='', name='attractors', test_data='test_data'):
 def task_small_data_cleanup():
     """Remove test_data from real data path"""
 
-    def remove_test_data(root='', name='attractors', test_data='test_data'):
-        print('Removing test data for {} ...'.format(name))
-        paths = _prepare_paths(root, name, test_data)
+    def remove_test_data(root='', name='attractors', test_data='test_data',
+                         cat_filename='catalog.yml'):
+        print('Cleaning up test data for {}:'.format(name))
+        paths = _prepare_paths(root, name, test_data, cat_filename)
 
-        if not os.path.exists(paths['test']):
-            print("Nothing to do: No test_data found for {} in {}".format(name, paths['test']))
-            return
+        if os.path.exists(paths['cat_real']):
+            print("* Replacing intake catalog ...")
 
-        if not os.path.exists(paths['real']):
-            print("Nothing to do: No data found in {}".format(paths['real']))
-            return
+            if not os.path.exists(paths['cat_tmp']):
+                print("  Nothing to do: No temp file found. Use git status to "
+                      "check that you have the real catalog at {}".format(paths['cat_real']))
+            else:
+                os.remove(paths['cat_real'])
+                os.rename(paths['cat_tmp'], paths['cat_real'])
+                print('  Intake catalog successfully cleaned')
 
-        if not os.listdir(paths['real']):
-            print("No data found in {}, just removing empty dir".format(paths['real']))
-            os.rmdir(paths['real'])
-            return
+        print('* Removing test data ...')
 
-        if not is_same(paths['test'], paths['real']):
-            raise ValueError("Fail: Data files at {} are not identical to test, so they shouldn't be deleted.".format(paths['real']))
+        if not os.path.exists(paths['test']):
+            print("  Nothing to do: No test_data found for {} in {}".format(name, paths['test']))
+        elif not os.path.exists(paths['real']):
+            print("  Nothing to do: No data found in {}".format(paths['real']))
+        elif not os.listdir(paths['real']):
+            os.rmdir(paths['real'])
+            print("  No data found in {}, just removed empty dir".format(paths['real']))
+        elif not is_same(paths['test'], paths['real']):
+            raise ValueError("Fail: Data files at {} are not identical to test, "
+                             "so they shouldn't be deleted.".format(paths['real']))
+        else:
+            shutil.rmtree(paths['real'])
+            print('  Test data successfully removed')
 
-        shutil.rmtree(paths['real'])
         print("Done!")
 
     return {'actions': [remove_test_data], 'params': [name_param]}
diff --git a/test_data/osm/catalog.yml b/test_data/osm/catalog.yml
new file mode 100644
index 000000000..ea994ed39
--- /dev/null
+++ b/test_data/osm/catalog.yml
@@ -0,0 +1,6 @@
+sources:
+  osm_one_billion:
+    description: Test data points to same fake osm-3billion file
+    driver: parquet
+    args:
+      urlpath: '{{ CATALOG_DIR }}/data/osm-3billion.parq'

From 4812a30787ae800365cc49a1d9058de2d056e44a Mon Sep 17 00:00:00 2001
From: Julia Signell <jsignell@gmail.com>
Date: Thu, 23 May 2019 14:14:00 -0400
Subject: [PATCH 3/5] Adding yaml to setup and using git source [only:osm]

---
 dodo.py                  |  2 +-
 osm/anaconda-project.yml | 14 ++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/dodo.py b/dodo.py
index 37a72e3e3..d538291fc 100644
--- a/dodo.py
+++ b/dodo.py
@@ -14,7 +14,7 @@ def task_ecosystem_setup():
     return {'actions': [
         "conda config --set always_yes True",
         "conda update conda",
-        "conda install anaconda-project 'tornado<5.0'",
+        "conda install anaconda-project 'tornado<5.0' pyyaml",
     ]}
 
 
diff --git a/osm/anaconda-project.yml b/osm/anaconda-project.yml
index 5c601ede8..2f1481c57 100644
--- a/osm/anaconda-project.yml
+++ b/osm/anaconda-project.yml
@@ -29,12 +29,12 @@ commands:
     description: Datashading 2.7-billion-point Open Street Map database
     notebook: osm.ipynb
   test:
-    unix:    pytest --nbsmoke-run -k *.ipynb --ignore envs
-    windows: pytest --nbsmoke-run -k *.ipynb --ignore envs
+    unix:    pytest --nbsmoke-run --ignore envs
+    windows: pytest --nbsmoke-run --ignore envs
     env_spec: test
   lint:
-    unix:    pytest --nbsmoke-lint -k *.ipynb --ignore envs
-    windows: pytest --nbsmoke-lint -k *.ipynb --ignore envs
+    unix:    pytest --nbsmoke-lint --ignore envs
+    windows: pytest --nbsmoke-lint --ignore envs
     env_spec: test
 
 variables:
@@ -50,10 +50,11 @@ env_specs:
     packages:
       - holoviews
       - intake
-      - intake-parquet
       - ipywidgets
       - jupyter
       - tqdm
+      - pip:
+        - git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet
   test:
     channels:
       - pyviz
@@ -63,7 +64,8 @@ env_specs:
       - pytest ==4.4.1
       - holoviews
       - intake
-      - intake-parquet
       - ipywidgets
       - jupyter
       - tqdm
+      - pip:
+        - git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet

From 3a4f980e48c4a164e918a55f74b1c92c311fffcc Mon Sep 17 00:00:00 2001
From: Julia Signell <jsignell@gmail.com>
Date: Thu, 23 May 2019 14:21:29 -0400
Subject: [PATCH 4/5] Don't do yaml parsing in dodo [only:osm]

---
 .travis.yml |  5 +++++
 dodo.py     | 13 +++----------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 85dc231a2..3400740da 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -46,6 +46,11 @@ before_install:
 script:
   - doit small_data_setup --name $DIR
   - cd $DIR
+  - if ! anaconda-project list-downloads | grep -q 'No downloads'; then
+      if ! [ -d data ]; then
+        echo 'FAIL needs data and no test data found' && exit 1;
+      fi;
+    fi
   - $RUN_LINT && anaconda-project run lint || echo "skipping lint"
   - $RUN_TEST && anaconda-project run test || echo "skipping test"
   - cd ..
diff --git a/dodo.py b/dodo.py
index d538291fc..b82d58142 100644
--- a/dodo.py
+++ b/dodo.py
@@ -14,7 +14,7 @@ def task_ecosystem_setup():
     return {'actions': [
         "conda config --set always_yes True",
         "conda update conda",
-        "conda install anaconda-project 'tornado<5.0' pyyaml",
+        "conda install anaconda-project 'tornado<5.0'",
     ]}
 
 
@@ -41,12 +41,6 @@ def _prepare_paths(root, name, test_data, filename='catalog.yml'):
         'cat_tmp': os.path.join(project_path, 'tmp_' + filename),
     }
 
-def _has_download(path, filename='anaconda-project.yml'):
-    import yaml
-    with open(os.path.join(path, filename), 'r') as f:
-        text = yaml.load(f, Loader=yaml.FullLoader)
-        return bool(text.get('downloads', {}))
-
 # From https://stackoverflow.com/a/24860799/4021797
 class dircmp(filecmp.dircmp):
     """
@@ -84,13 +78,12 @@ def copy_test_data(root='', name='attractors', test_data='test_data', cat_filena
 
         paths = _prepare_paths(root, name, test_data, cat_filename)
         has_catalog = os.path.exists(paths['cat_real'])
-        has_download = _has_download(paths['project'])
 
         if not os.path.exists(paths['test']) or not os.listdir(paths['test']):
-            if has_catalog or has_download:
+            if has_catalog:
                 raise ValueError("Fail: {} has no test_data".format(name))
             else:
-                print("  Nothing to do: Test data not needed for {}".format(name))
+                print("  Nothing to do: Test data not found for {}".format(name))
                 print("Done!")
                 return
 

From 6de43741e191f24af082d4ba02d0ff71985b932a Mon Sep 17 00:00:00 2001
From: Julia Signell <jsignell@gmail.com>
Date: Thu, 23 May 2019 17:23:20 -0400
Subject: [PATCH 5/5] Using s3 directly

---
 osm/anaconda-project.yml | 2 ++
 osm/catalog.yml          | 5 +++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/osm/anaconda-project.yml b/osm/anaconda-project.yml
index 2f1481c57..bd811eedf 100644
--- a/osm/anaconda-project.yml
+++ b/osm/anaconda-project.yml
@@ -52,6 +52,7 @@ env_specs:
       - intake
       - ipywidgets
       - jupyter
+      - s3fs
       - tqdm
       - pip:
         - git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet
@@ -66,6 +67,7 @@ env_specs:
       - intake
       - ipywidgets
       - jupyter
+      - s3fs
       - tqdm
       - pip:
         - git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet
diff --git a/osm/catalog.yml b/osm/catalog.yml
index c8ec3e8af..78aac79a0 100644
--- a/osm/catalog.yml
+++ b/osm/catalog.yml
@@ -4,7 +4,8 @@ sources:
     driver: parquet
     cache:
       - argkey: urlpath
-        regex: https://s3.amazonaws.com/datashader-data
+        regex: 's3://datashader-data/osm-1billion.parq.zip'
         type: compressed
     args:
-      urlpath: https://s3.amazonaws.com/datashader-data/osm-1billion.parq.zip
+      urlpath: 's3://datashader-data/osm-1billion.parq.zip'
+      storage_options: {'anon': True}