From b059667f295c1f1be773a86a11343fde6f9e8daf Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Thu, 23 May 2019 13:47:51 -0400 Subject: [PATCH 1/7] Adding a to_spatial method on ParquetSource --- intake_parquet/source.py | 27 ++++++++++++++ tests/test_spatial.py | 76 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 tests/test_spatial.py diff --git a/intake_parquet/source.py b/intake_parquet/source.py index 6055d86..0cecfdd 100644 --- a/intake_parquet/source.py +++ b/intake_parquet/source.py @@ -106,5 +106,32 @@ def _to_dask(self): self._load_metadata() return self._df + def to_spatial(self): + """ + Create a datashader spatial object from the parquet data + """ + try: + from datashader.spatial.points import SpatialPointsFrame + except ImportError: + raise ImportError('SpatialPointsFrame not found in this ' + 'version of datashader. Get latest using ' + '`conda install -c pyviz datashader`.') + import json + import fastparquet as fp + + frame = self._df or self.to_dask() + urlpath = self._get_cache(self._urlpath)[0] + pf = fp.ParquetFile(urlpath) + # Check for spatial points metadata + if 'SpatialPointsFrame' in pf.key_value_metadata: + # Load metadata + props = json.loads(pf.key_value_metadata['SpatialPointsFrame']) + else: + props = None + + # Call DataFrame constructor with the internals of frame + return SpatialPointsFrame(frame.dask, frame._name, frame._meta, + frame.divisions, props) + def _close(self): self._df = None diff --git a/tests/test_spatial.py b/tests/test_spatial.py new file mode 100644 index 0000000..0f98d16 --- /dev/null +++ b/tests/test_spatial.py @@ -0,0 +1,76 @@ +""" +Minimal spatial test copied from datashader +""" +import os +import pytest +import numpy as np +import pandas as pd + +from intake_parquet.source import ParquetSource +import intake + +pytest.importorskip('datashader') + +intake.registry['parquet'] = ParquetSource # because pytest defers import + +@pytest.fixture() +def df(): + N = 
1000 + np.random.seed(25) + + df = pd.DataFrame({ + 'x': np.random.rand(N), + 'y': np.random.rand(N) * 2, + 'a': np.random.randn(N) + }) + + # Make sure we have x/y values of 0 and 1 represented so that + # autocomputed ranges are predictable + df.x.iloc[0] = 0.0 + df.x.iloc[-1] = 1.0 + df.y.iloc[0] = 0.0 + df.y.iloc[-1] = 2.0 + return df + +@pytest.fixture(params=[False, True]) +def s_points_frame(request, tmp_path, df): + import datashader.spatial.points as dsp + + # Work around https://bugs.python.org/issue33617 + tmp_path = str(tmp_path) + p = 5 + path = os.path.join(tmp_path, 'spatial_points.parquet') + + dsp.to_parquet( + df, path, 'x', 'y', p=p, npartitions=10) + + spf = ParquetSource(path).to_spatial() + + if request.param: + spf = spf.persist() + + return spf + + +def test_spatial_points_frame_properties(s_points_frame): + assert s_points_frame.spatial.x == 'x' + assert s_points_frame.spatial.y == 'y' + assert s_points_frame.spatial.p == 5 + assert s_points_frame.npartitions == 10 + assert s_points_frame.spatial.x_range == (0, 1) + assert s_points_frame.spatial.y_range == (0, 2) + assert s_points_frame.spatial.nrows == 1000 + + # x_bin_edges + np.testing.assert_array_equal( + s_points_frame.spatial.x_bin_edges, + np.linspace(0.0, 1.0, 2 ** 5 + 1)) + + # y_bin_edges + np.testing.assert_array_equal( + s_points_frame.spatial.y_bin_edges, + np.linspace(0.0, 2.0, 2 ** 5 + 1)) + + # distance_divisions + distance_divisions = s_points_frame.spatial.distance_divisions + assert len(distance_divisions) == 10 + 1 From f865e564434f9a52770e9992e97b1fee67556bfa Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Thu, 23 May 2019 14:12:50 -0400 Subject: [PATCH 2/7] Adding datashader to test requirements --- conda/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/meta.yaml b/conda/meta.yaml index 1fbd620..f2fa822 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -30,6 +30,7 @@ test: - pytest - pytest-cov - coverage + - datashader==0.7.0 commands: - 
py.test --verbose --cov=intake_parquet tests From f029a36ba5c7a644b4faebd2a40d6ec21dbc5681 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Thu, 23 May 2019 15:52:38 -0400 Subject: [PATCH 3/7] Run pytest with datashader --- .travis.yml | 3 +++ conda/meta.yaml | 1 - intake_parquet/source.py | 15 +++++++++++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index d4fd339..bde79ac 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,6 +21,9 @@ script: - | flake8 . conda build -c intake -c defaults -c conda-forge ./conda +- conda install -y -c conda-forge 'datashader==0.7.0' +- conda install -y --use-local intake +- py.test --verbose --cov=intake_parquet tests - | if [ -n "$TRAVIS_TAG" ]; then # If tagged git version, upload package to main channel diff --git a/conda/meta.yaml b/conda/meta.yaml index f2fa822..1fbd620 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -30,7 +30,6 @@ test: - pytest - pytest-cov - coverage - - datashader==0.7.0 commands: - py.test --verbose --cov=intake_parquet tests diff --git a/intake_parquet/source.py b/intake_parquet/source.py index 0cecfdd..9eef1b4 100644 --- a/intake_parquet/source.py +++ b/intake_parquet/source.py @@ -107,8 +107,19 @@ def _to_dask(self): return self._df def to_spatial(self): - """ - Create a datashader spatial object from the parquet data + """Read a data source as a SpatialPointsFrame. + + This requires that the data be structured in a particular, spatially + optimized, way. To convert a regular dataframe to a SpatialPointsFrame + use the functionality in datashader. + + >>> import datashader.spatial.points as dsp + >>> dsp.to_parquet(df, 'sorted.parq', 'x', 'y', shuffle='disk', npartitions=32) + + More info at http://datashader.org/user_guide/2_Points.html + + This whole function is copied from datashader. 
+ https://github.com/pyviz/datashader/blob/815596bbf72017c6b8014cfc96354601a3529b04/datashader/spatial/points.py#L297-L312 """ try: from datashader.spatial.points import SpatialPointsFrame From 6e8622a8359f41dfbcf2a3c33770b788ebeade8f Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Thu, 23 May 2019 17:08:42 -0400 Subject: [PATCH 4/7] Fixing up --- .travis.yml | 2 +- intake_parquet/source.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index bde79ac..1bdb992 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,7 @@ script: - | flake8 . conda build -c intake -c defaults -c conda-forge ./conda -- conda install -y -c conda-forge 'datashader==0.7.0' +- conda install -y -c conda-forge 'datashader==0.7.0' pytest pytest-cov coverage - conda install -y --use-local intake - py.test --verbose --cov=intake_parquet tests - | diff --git a/intake_parquet/source.py b/intake_parquet/source.py index 9eef1b4..48a7d46 100644 --- a/intake_parquet/source.py +++ b/intake_parquet/source.py @@ -130,8 +130,8 @@ def to_spatial(self): import json import fastparquet as fp - frame = self._df or self.to_dask() - urlpath = self._get_cache(self._urlpath)[0] + frame = self.to_dask() + urlpath = self._get_cache(self._urlpath)[0] pf = fp.ParquetFile(urlpath) # Check for spatial points metadata if 'SpatialPointsFrame' in pf.key_value_metadata: From 8bbf50320f551a9720c880e80a00533ecb404ae7 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Fri, 24 May 2019 09:02:23 -0400 Subject: [PATCH 5/7] Don't run tests on build --- .travis.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 1bdb992..f5fae95 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ env: matrix: - CONDA_PY=36 global: + CHANNELS: -c intake -c defaults -c conda-forge secure: 
ujj1B51Q6iG0f65hMj1bLUNgxHN0rh3dBKkQb3dELJ+OzU8r6YBXjrZp6PS0OM1PGbR5tcLwfmZG8u9QR1NTDAfYlVKXcd86SWqZl3XTQT7IBn7Z0aqV4ILJfpxacSm7Uu7lu9IKAySrDhhi+k4HbUkYk+xEL72e0F2QS07QldmPU3poCxJfpGiavGumiK6JzQ/Pmln+y6SYCtbf/6E4Ub/9jmBJ6XwyXAXS1zaUCbCnaMnLox3WIDaOInSmAa+yUh6E7uWAMOYF3w89ZGa5Q69TXLVZitUDRut1Faouu8RZbAsCdRHtR1CtqciLZiUf0kaWibDXpKoCQcnQIliDGbZIEBa191xOB8joTtb8F2Hz4QhzQ8tBZDa9hxOA/XlGFmBvNh27eMHO2PfJu08s3S/FZGaMjHuGsECvOmVor6lcHPqSE7gmRGUxiH3VQ++agtGw4I8qD83fsKRvnCR+YopViEgtjMAaXHWSNa4AMB4ITFDGx7omnGSpcuVAQ0AmGM8qk8GqhDYzAzMmOmzPYGxFCcvbciFVcwwnsbQ4iMI9XFhCP2Sd1XVzKEGzOMhKDb0JI+3jAb32mlMpcyGhmi+k3smLzgtZTf2vzH0hq5saRrc4z8IzSsxQTPeNkT0s9O7lo8VujI97dBXOC5RdSRohXR8As6AVkSc9bpJYR4I= install: - | @@ -20,9 +21,9 @@ install: script: - | flake8 . - conda build -c intake -c defaults -c conda-forge ./conda -- conda install -y -c conda-forge 'datashader==0.7.0' pytest pytest-cov coverage -- conda install -y --use-local intake + conda build $CHANNELS ./conda --no-test +- conda install -y $CHANNELS 'datashader==0.7.0' pytest pytest-cov coverage +- conda install -y $CHANNELS --use-local intake_parquet - py.test --verbose --cov=intake_parquet tests - | if [ -n "$TRAVIS_TAG" ]; then From 23396ac5e34b135709b04fbf2a15fbee15eea1f8 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Fri, 24 May 2019 09:18:30 -0400 Subject: [PATCH 6/7] Fixing up travis env declaration --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index f5fae95..c6eadc1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,7 @@ env: matrix: - CONDA_PY=36 global: - CHANNELS: -c intake -c defaults -c conda-forge + CHANNELS="-c intake -c defaults -c conda-forge" secure: 
ujj1B51Q6iG0f65hMj1bLUNgxHN0rh3dBKkQb3dELJ+OzU8r6YBXjrZp6PS0OM1PGbR5tcLwfmZG8u9QR1NTDAfYlVKXcd86SWqZl3XTQT7IBn7Z0aqV4ILJfpxacSm7Uu7lu9IKAySrDhhi+k4HbUkYk+xEL72e0F2QS07QldmPU3poCxJfpGiavGumiK6JzQ/Pmln+y6SYCtbf/6E4Ub/9jmBJ6XwyXAXS1zaUCbCnaMnLox3WIDaOInSmAa+yUh6E7uWAMOYF3w89ZGa5Q69TXLVZitUDRut1Faouu8RZbAsCdRHtR1CtqciLZiUf0kaWibDXpKoCQcnQIliDGbZIEBa191xOB8joTtb8F2Hz4QhzQ8tBZDa9hxOA/XlGFmBvNh27eMHO2PfJu08s3S/FZGaMjHuGsECvOmVor6lcHPqSE7gmRGUxiH3VQ++agtGw4I8qD83fsKRvnCR+YopViEgtjMAaXHWSNa4AMB4ITFDGx7omnGSpcuVAQ0AmGM8qk8GqhDYzAzMmOmzPYGxFCcvbciFVcwwnsbQ4iMI9XFhCP2Sd1XVzKEGzOMhKDb0JI+3jAb32mlMpcyGhmi+k3smLzgtZTf2vzH0hq5saRrc4z8IzSsxQTPeNkT0s9O7lo8VujI97dBXOC5RdSRohXR8As6AVkSc9bpJYR4I= install: - | @@ -22,7 +22,7 @@ script: - | flake8 . conda build $CHANNELS ./conda --no-test -- conda install -y $CHANNELS 'datashader==0.7.0' pytest pytest-cov coverage +- conda install -y $CHANNELS 'datashader==0.7.0' pytest pytest-cov coverage - conda install -y $CHANNELS --use-local intake_parquet - py.test --verbose --cov=intake_parquet tests - | From 9b8d848cb03147fe854512af53f4e8920332a712 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Tue, 28 May 2019 16:30:30 -0400 Subject: [PATCH 7/7] Using read_parquet method directly --- intake_parquet/source.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/intake_parquet/source.py b/intake_parquet/source.py index 48a7d46..afbe834 100644 --- a/intake_parquet/source.py +++ b/intake_parquet/source.py @@ -118,31 +118,17 @@ def to_spatial(self): More info at http://datashader.org/user_guide/2_Points.html - This whole function is copied from datashader. - https://github.com/pyviz/datashader/blob/815596bbf72017c6b8014cfc96354601a3529b04/datashader/spatial/points.py#L297-L312 + NOTE: This method only works for local or cached data. 
""" try: - from datashader.spatial.points import SpatialPointsFrame + from datashader.spatial import read_parquet except ImportError: raise ImportError('SpatialPointsFrame not found in this ' 'version of datashader. Get latest using ' '`conda install -c pyviz datashader`.') - import json - import fastparquet as fp - frame = self.to_dask() urlpath = self._get_cache(self._urlpath)[0] - pf = fp.ParquetFile(urlpath) - # Check for spatial points metadata - if 'SpatialPointsFrame' in pf.key_value_metadata: - # Load metadata - props = json.loads(pf.key_value_metadata['SpatialPointsFrame']) - else: - props = None - - # Call DataFrame constructor with the internals of frame - return SpatialPointsFrame(frame.dask, frame._name, frame._meta, - frame.divisions, props) + return read_parquet(urlpath) def _close(self): self._df = None