Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use polars for seaexplorer data file load #120

Merged
merged 15 commits into from
Nov 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/getting-started-seaexplorer.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ There are four top-levels to the `deployment.yaml`

The example script is relatively straightforward if there is no intermediate processing. See {ref}`ExProc`, below.

Data comes from an input directory, and is translated to raw glider-dependent netCDF files and put in a new directory. These files are useful of their own right, but are not CF compliant. These files are then merged into a single monolithic netCDF file, and this is translated to a CF-compliant timeseries file. Finally individual profiles are saved and a 2-D 1-m grid in time-depth is saved.
Data comes from an input directory, and is translated to raw glider-dependent parquet files and put in a new directory. These files are useful in their own right. Apache Parquet is a column-oriented format for storing tabular data. Parquet files take up less space than netCDF or csv and are much faster to read and write. These files can be opened with [polars.read_parquet](https://pola-rs.github.io/polars-book/user-guide/howcani/io/parquet.html) or [pandas.read_parquet](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html). These files are then merged into a single monolithic parquet file, and this is translated to a CF-compliant timeseries netCDF file. Finally, individual profiles are saved and a 2-D 1-m grid in time-depth is saved.

It is likely that between these steps the user will want to add any screening steps, or adjustments to the calibrations. PyGlider does not provide those steps.

Expand Down
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ by optional steps to create profiles or depth-time grids.

```python
seaexplorer.raw_to_rawnc(rawdir, rawncdir, deploymentyaml)
seaexplorer.merge_rawnc(rawncdir, rawncdir, deploymentyaml, kind='sub')
seaexplorer.merge_parquet(rawncdir, rawncdir, deploymentyaml, kind='sub')
outname = seaexplorer.raw_to_timeseries(rawncdir, tsdir, deploymentyaml, kind='sub')
# optional...
ncprocess.extract_timeseries_profiles(outname, profiledir, deploymentyaml)
Expand Down
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@ dependencies:
- scipy
- bitstring
- pooch
- polars
- pip:
- dbdreader
197 changes: 112 additions & 85 deletions pyglider/seaexplorer.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies:
- pytest
- pooch
- matplotlib
- polars
- pip:
- dbdreader

Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# turn seaexplorer zipped csvs into parquet files.
seaexplorer.raw_to_rawnc(rawdir, rawncdir, deploymentyaml)
# merge individual parquet files into single parquet files *.gli*.parquet and *.pld1*.parquet
seaexplorer.merge_rawnc(rawncdir, rawncdir, deploymentyaml, kind='sub')
seaexplorer.merge_parquet(rawncdir, rawncdir, deploymentyaml, kind='sub')
# Make level-0 timeseries netcdf file from the raw files...
outname = seaexplorer.raw_to_timeseries(rawncdir, l0tsdir, deploymentyaml, kind='sub')
ncprocess.extract_timeseries_profiles(outname, profiledir, deploymentyaml)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# turn *.EBD and *.DBD into *.ebd.nc and *.dbd.nc netcdf files.
seaexplorer.raw_to_rawnc(rawdir, rawncdir, deploymentyaml)
# merge individual netcdf files into single netcdf files *.ebd.nc and *.dbd.nc
seaexplorer.merge_rawnc(rawncdir, rawncdir, deploymentyaml, kind='sub')
seaexplorer.merge_parquet(rawncdir, rawncdir, deploymentyaml, kind='sub')

# Make level-1 timeseries netcdf file from the raw files...
outname = seaexplorer.raw_to_timeseries(rawncdir, l0tsdir, deploymentyaml, kind='sub')
Expand Down
8 changes: 4 additions & 4 deletions tests/test_pyglider.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
deploymentyaml = str(example_dir / 'example-seaexplorer/deploymentRealtime.yml')
l0tsdir = str(example_dir / 'example-seaexplorer/L0-timeseries-test/') + '/'
seaexplorer.raw_to_rawnc(rawdir, rawncdir, deploymentyaml)
seaexplorer.merge_rawnc(rawncdir, rawncdir, deploymentyaml, kind='sub')
seaexplorer.merge_parquet(rawncdir, rawncdir, deploymentyaml, kind='sub')
outname = seaexplorer.raw_to_L0timeseries(rawncdir, l0tsdir,
deploymentyaml, kind='sub')
output = xr.open_dataset(outname)
Expand All @@ -39,7 +39,7 @@ def test_example_seaexplorer(var):
# Test that each variable and its coordinates match
assert output[var].attrs == test_data[var].attrs
if var not in ['time']:
np.testing.assert_allclose(output[var].values, test_data[var].values)
np.testing.assert_allclose(output[var].values, test_data[var].values, rtol=1e-5)
else:
dt0 = output[var].values - np.datetime64('2000-01-01')
dt1 = test_data[var].values - np.datetime64('2000-01-01')
Expand Down Expand Up @@ -98,7 +98,7 @@ def test_example_slocum(var):
assert output_slocum[var].attrs == test_data_slocum[var].attrs
if var not in ['time']:
np.testing.assert_allclose(output_slocum[var].values,
test_data_slocum[var].values)
test_data_slocum[var].values, rtol=1e-6)
else:
dt0 = output_slocum[var].values - np.datetime64('2000-01-01')
dt1 = test_data_slocum[var].values - np.datetime64('2000-01-01')
Expand Down Expand Up @@ -163,7 +163,7 @@ def test_example_slocum_littleendian(var):
assert output_slocum_le[var].attrs == test_data_slocum_le[var].attrs
if var not in ['time']:
np.testing.assert_allclose(output_slocum_le[var].values,
test_data_slocum_le[var].values)
test_data_slocum_le[var].values, rtol=1e-6)
else:
dt0 = output_slocum_le[var].values - np.datetime64('2000-01-01')
dt1 = test_data_slocum_le[var].values - np.datetime64('2000-01-01')
Expand Down
19 changes: 9 additions & 10 deletions tests/test_seaexplorer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import xarray as xr
import polars as pl
import pytest
from pathlib import Path
import sys
import os
os.system('rm tests/data/realtime_rawnc/*')
library_dir = Path(__file__).parent.parent.absolute()
Expand All @@ -13,7 +12,7 @@
def test__outputname():
fnout, filenum = seaexplorer._outputname('tests/data/realtime_raw/sea035.12.pld1.sub.36',
'tests/data/realtime_rawnc/')
assert fnout == 'tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.nc'
assert fnout == 'tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.parquet'
assert filenum == 36


Expand Down Expand Up @@ -44,20 +43,20 @@ def test_raw_to_rawnc():
def test__needsupdating():
ftype = 'pld1'
fin = 'tests/data/realtime_raw/sea035.12.pld1.sub.36'
fout = 'tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.nc'
fout = 'tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.parquet'
result_badpath = seaexplorer._needsupdating(ftype, fin, 'baz')
result_goodpath = seaexplorer._needsupdating(ftype, fin, fout)
assert result_badpath is True
assert result_goodpath is False


def test_merge_rawnc():
result_default = seaexplorer.merge_rawnc(
result_default = seaexplorer.merge_parquet(
'tests/data/realtime_rawnc/',
'tests/data/realtime_rawnc/',
example_dir / 'example-seaexplorer/deploymentRealtime.yml')

result_sub = seaexplorer.merge_rawnc(
result_sub = seaexplorer.merge_parquet(
'tests/data/realtime_rawnc/',
'tests/data/realtime_rawnc/',
example_dir / 'example-seaexplorer/deploymentRealtime.yml',
Expand All @@ -68,11 +67,11 @@ def test_merge_rawnc():

def test__interp_gli_to_pld():
# function should interpolate values from the glider dataset to sampling frequency of payload dataset
glider = xr.open_dataset('tests/data/realtime_rawnc/sea035.0012.gli.sub.0036.nc')
ds = xr.open_dataset('tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.nc')
val = glider.Pitch
glider = pl.read_parquet('tests/data/realtime_rawnc/sea035.0012.gli.sub.0036.parquet')
ds = pl.read_parquet('tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.parquet')
val = glider.select("Pitch").to_numpy()[:, 0]
pitch_interp = seaexplorer._interp_gli_to_pld(glider, ds, val, None)
assert len(pitch_interp) == len(ds.time)
assert len(pitch_interp) == ds.shape[0]


def test_raw_to_timeseries():
Expand Down