Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use polars for seaexplorer data file load #120

Merged
merged 15 commits into from
Nov 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/getting-started-seaexplorer.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ There are four top-levels to the `deployment.yaml`

The example script is relatively straightforward if there is no intermediate processing. See {ref}`ExProc`, below.

Data comes from an input directory, and is translated to raw glider-dependent netCDF files and put in a new directory. These files are useful of their own right, but are not CF compliant. These files are then merged into a single monolithic netCDF file, and this is translated to a CF-compliant timeseries file. Finally individual profiles are saved and a 2-D 1-m grid in time-depth is saved.
Data comes from an input directory, and is translated to raw glider-dependent parquet files and put in a new directory. These files are useful in their own right. Apache Parquet is a column-oriented format for storing tabular data. Parquet files take up less space than netCDF or csv and are much faster to read and write. These files can be opened with [polars.read_parquet](https://pola-rs.github.io/polars-book/user-guide/howcani/io/parquet.html) or [pandas.read_parquet](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html). These files are then merged into a single monolithic parquet file, and this is translated to a CF-compliant timeseries netCDF file. Finally, individual profiles are saved and a 2-D 1-m grid in time-depth is saved.

It is likely that between these steps the user will want to add any screening steps, or adjustments to the calibrations. PyGlider does not provide those steps.

Expand Down
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ by optional steps to create profiles or depth-time grids.

```python
seaexplorer.raw_to_rawnc(rawdir, rawncdir, deploymentyaml)
seaexplorer.merge_rawnc(rawncdir, rawncdir, deploymentyaml, kind='sub')
seaexplorer.merge_parquet(rawncdir, rawncdir, deploymentyaml, kind='sub')
outname = seaexplorer.raw_to_timeseries(rawncdir, tsdir, deploymentyaml, kind='sub')
# optional...
ncprocess.extract_timeseries_profiles(outname, profiledir, deploymentyaml)
Expand Down
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@ dependencies:
- scipy
- bitstring
- pooch
- polars
- pip:
- dbdreader
197 changes: 112 additions & 85 deletions pyglider/seaexplorer.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies:
- pytest
- pooch
- matplotlib
- polars
- pip:
- dbdreader

Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# turn seaexplorer zipped csvs into parquet files.
seaexplorer.raw_to_rawnc(rawdir, rawncdir, deploymentyaml)
# merge individual parquet files into single parquet files *.gli*.parquet and *.pld1*.parquet
seaexplorer.merge_rawnc(rawncdir, rawncdir, deploymentyaml, kind='sub')
seaexplorer.merge_parquet(rawncdir, rawncdir, deploymentyaml, kind='sub')
# Make level-0 timeseries netcdf file from the raw files...
outname = seaexplorer.raw_to_timeseries(rawncdir, l0tsdir, deploymentyaml, kind='sub')
ncprocess.extract_timeseries_profiles(outname, profiledir, deploymentyaml)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# turn *.EBD and *.DBD into *.ebd.nc and *.dbd.nc netcdf files.
seaexplorer.raw_to_rawnc(rawdir, rawncdir, deploymentyaml)
# merge individual netcdf files into single netcdf files *.ebd.nc and *.dbd.nc
seaexplorer.merge_rawnc(rawncdir, rawncdir, deploymentyaml, kind='sub')
seaexplorer.merge_parquet(rawncdir, rawncdir, deploymentyaml, kind='sub')

# Make level-1 timeseries netcdf file from the raw files...
outname = seaexplorer.raw_to_timeseries(rawncdir, l0tsdir, deploymentyaml, kind='sub')
Expand Down
8 changes: 4 additions & 4 deletions tests/test_pyglider.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
deploymentyaml = str(example_dir / 'example-seaexplorer/deploymentRealtime.yml')
l0tsdir = str(example_dir / 'example-seaexplorer/L0-timeseries-test/') + '/'
seaexplorer.raw_to_rawnc(rawdir, rawncdir, deploymentyaml)
seaexplorer.merge_rawnc(rawncdir, rawncdir, deploymentyaml, kind='sub')
seaexplorer.merge_parquet(rawncdir, rawncdir, deploymentyaml, kind='sub')
outname = seaexplorer.raw_to_L0timeseries(rawncdir, l0tsdir,
deploymentyaml, kind='sub')
output = xr.open_dataset(outname)
Expand All @@ -39,7 +39,7 @@ def test_example_seaexplorer(var):
# Test that each variable and its coordinates match
assert output[var].attrs == test_data[var].attrs
if var not in ['time']:
np.testing.assert_allclose(output[var].values, test_data[var].values)
np.testing.assert_allclose(output[var].values, test_data[var].values, rtol=1e-5)
else:
dt0 = output[var].values - np.datetime64('2000-01-01')
dt1 = test_data[var].values - np.datetime64('2000-01-01')
Expand Down Expand Up @@ -98,7 +98,7 @@ def test_example_slocum(var):
assert output_slocum[var].attrs == test_data_slocum[var].attrs
if var not in ['time']:
np.testing.assert_allclose(output_slocum[var].values,
test_data_slocum[var].values)
test_data_slocum[var].values, rtol=1e-6)
else:
dt0 = output_slocum[var].values - np.datetime64('2000-01-01')
dt1 = test_data_slocum[var].values - np.datetime64('2000-01-01')
Expand Down Expand Up @@ -163,7 +163,7 @@ def test_example_slocum_littleendian(var):
assert output_slocum_le[var].attrs == test_data_slocum_le[var].attrs
if var not in ['time']:
np.testing.assert_allclose(output_slocum_le[var].values,
test_data_slocum_le[var].values)
test_data_slocum_le[var].values, rtol=1e-6)
else:
dt0 = output_slocum_le[var].values - np.datetime64('2000-01-01')
dt1 = test_data_slocum_le[var].values - np.datetime64('2000-01-01')
Expand Down
19 changes: 9 additions & 10 deletions tests/test_seaexplorer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import xarray as xr
import polars as pl
import pytest
from pathlib import Path
import sys
import os
os.system('rm tests/data/realtime_rawnc/*')
library_dir = Path(__file__).parent.parent.absolute()
Expand All @@ -13,7 +12,7 @@
def test__outputname():
fnout, filenum = seaexplorer._outputname('tests/data/realtime_raw/sea035.12.pld1.sub.36',
'tests/data/realtime_rawnc/')
assert fnout == 'tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.nc'
assert fnout == 'tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.parquet'
assert filenum == 36


Expand Down Expand Up @@ -44,20 +43,20 @@ def test_raw_to_rawnc():
def test__needsupdating():
ftype = 'pld1'
fin = 'tests/data/realtime_raw/sea035.12.pld1.sub.36'
fout = 'tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.nc'
fout = 'tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.parquet'
result_badpath = seaexplorer._needsupdating(ftype, fin, 'baz')
result_goodpath = seaexplorer._needsupdating(ftype, fin, fout)
assert result_badpath is True
assert result_goodpath is False


def test_merge_rawnc():
result_default = seaexplorer.merge_rawnc(
result_default = seaexplorer.merge_parquet(
'tests/data/realtime_rawnc/',
'tests/data/realtime_rawnc/',
example_dir / 'example-seaexplorer/deploymentRealtime.yml')

result_sub = seaexplorer.merge_rawnc(
result_sub = seaexplorer.merge_parquet(
'tests/data/realtime_rawnc/',
'tests/data/realtime_rawnc/',
example_dir / 'example-seaexplorer/deploymentRealtime.yml',
Expand All @@ -68,11 +67,11 @@ def test_merge_rawnc():

def test__interp_gli_to_pld():
# function should interpolate values from the glider dataset to sampling frequency of payload dataset
glider = xr.open_dataset('tests/data/realtime_rawnc/sea035.0012.gli.sub.0036.nc')
ds = xr.open_dataset('tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.nc')
val = glider.Pitch
glider = pl.read_parquet('tests/data/realtime_rawnc/sea035.0012.gli.sub.0036.parquet')
ds = pl.read_parquet('tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.parquet')
val = glider.select("Pitch").to_numpy()[:, 0]
pitch_interp = seaexplorer._interp_gli_to_pld(glider, ds, val, None)
assert len(pitch_interp) == len(ds.time)
assert len(pitch_interp) == ds.shape[0]


def test_raw_to_timeseries():
Expand Down