Merge pull request #68 from MeteoSwiss/develop
v0.3.0.dev0
fpavogt authored Feb 2, 2022
2 parents f6e2ff4 + e6ae4f5 commit 84ff4fc
Showing 21 changed files with 825 additions and 314 deletions.
CHANGELOG (7 changes: 5 additions & 2 deletions)
@@ -4,15 +4,18 @@ The format is inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0
This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [v0.3.0]
## [v0.3.0.dev0]
### Added:
- [fpavogt, 2022-01-27] Add tests for the .plots.secondary module.
### Fixed:
- [fpavogt, 2022-01-27] Fix #66 and #67.
- [fpavogt, 2022-01-26] Fix #62, #63 and #64.
### Changed:
- [fpavogt, 2022-02-01] Implement review feedback, including name change for `scaled` to `apply_scaling`.
### Deprecated:
### Removed:
### Security:


## [v0.2.1.dev0]
### Fixed:
- [fpavogt, 2022-01-21] Fix #58.
docs/source/running.rst (39 changes: 39 additions & 0 deletions)
@@ -5,6 +5,45 @@
Using ampycloud
=================

A no-words example for those who want to get started quickly
-------------------------------------------------------------

::

from datetime import datetime
import ampycloud
from ampycloud.utils import mocker
from ampycloud.plots import diagnostic

# Generate the canonical demo dataset for ampycloud
# Your data should have *exactly* this structure
mock_data = mocker.canonical_demo_data()

# Run the ampycloud algorithm on it
chunk = ampycloud.run(mock_data, geoloc='Mock data', ref_dt=datetime.now())

# Get the resulting METAR message
print(chunk.metar_msg())

# Display the full information available for the layers found
print(chunk.layers)

# And for the most motivated, plot the diagnostic diagram
diagnostic(chunk, upto='layers', show=True, save_stem='ampycloud_demo')


The input data
--------------

The ampycloud algorithm is meant to process cloud base *hits* from ceilometer observations. A given
set of hits to be processed by the ampycloud package must be stored inside a
:py:class:`pandas.DataFrame` with a specific set of characteristics outlined below. Users can use
the following utility function to check whether a given :py:class:`pandas.DataFrame` meets all the
requirements of ampycloud.

.. autofunction:: ampycloud.utils.utils.check_data_consistency
:noindex:
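
For illustration, here is a minimal sketch of such a check on a small made-up
dataset. The values and the ceilometer name are purely illustrative, and
``check_data_consistency`` is assumed to behave like the in-class checks it
replaces: adjusting dtypes on the fly where possible, raising an
``AmpycloudError`` otherwise, and returning the cleaned-up DataFrame::

    import pandas as pd
    from ampycloud.utils.utils import check_data_consistency

    # Three hits from a single, made-up ceilometer
    demo = pd.DataFrame({
        'ceilo': ['ceilo_1', 'ceilo_1', 'ceilo_1'],
        'dt': [-120.0, -60.0, 0.0],
        'alt': [1200.0, 1250.0, 1230.0],
        'type': [1, 1, 1],
    })

    # Returns the checked (possibly dtype-adjusted) DataFrame
    demo = check_data_consistency(demo)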

.. _running:

Running the algorithm
src/ampycloud/core.py (49 changes: 12 additions & 37 deletions)
@@ -155,52 +155,27 @@ def reset_prms() -> None:
dynamic.AMPYCLOUD_PRMS = dynamic.get_default_prms()

@log_func_call(logger)
def run(data : pd.DataFrame, geoloc : str = None, ref_dt : str = None) -> CeiloChunk:
def run(data : pd.DataFrame, geoloc : str = None,
ref_dt : Union[str, datetime] = None) -> CeiloChunk:
""" Runs the ampycloud algorithm on a given dataset.
Args:
data (pd.DataFrame): the data to be processed, as a py:class:`pandas.DataFrame`.
data (pd.DataFrame): the data to be processed, as a :py:class:`pandas.DataFrame`.
geoloc (str, optional): the name of the geographic location where the data was taken.
Defaults to None.
ref_dt (str, optional): reference date and time of the observations, corresponding to
Delta t = 0. Defaults to None.
ref_dt (str|datetime.datetime, optional): reference date and time of the observations,
corresponding to Delta t = 0. Defaults to None. Note that if a datetime instance
is specified, it will be turned almost immediately to str via ``str(ref_dt)``.
Returns:
:py:class:`.data.CeiloChunk`: the data chunk with all the processing outcome bundled
cleanly.
All that is required to run the ampycloud algorithm is a properly formatted dataset. At the
moment, specifying ``geoloc`` and ``ref_dt`` serves no purpose other than to enhance plots
(should they be created). There are no special requirements for ``geoloc`` and ``ref_dt``: as
long as they are strings, you can set them to whatever you please.
The input ``data`` must be a :py:class:`pandas.DataFrame` with the following column names
(types):
::
'ceilo' (str), 'dt' (float), 'alt' (float), 'type' (int)
The ``ceilo`` column contains the names/ids of the ceilometers as ``str``.
The ``dt`` column contains time deltas, in s, between a given ceilometer observation and
``ref_dt``.
The ``alt`` column contains the cloud base hit altitudes reported by the ceilometers, in ft
above ground.
The ``type`` column contains integers that correspond to the hit *sequence id*. E.g. if a given
ceilometer is reporting multiple hits for a given timestep (corresponding to a cloud level 1,
cloud level 2, cloud level 3, etc ...), the ``type`` of these measurements could be ``1``,
``2``, ``3``, ... Any data point with a ``type`` of ``-1`` will be flagged in the ampycloud
plots as a Vertical Visibility (VV) hit, **but it will not be treated any differently than any
other regular hit**. Type ``0`` corresponds to no (cloud) detection.
It is possible to obtain an example of the required ``data`` format from the
:py:func:`.utils.mocker.canonical_demo_data` routine of the package, like so:
::
from ampycloud.utils import mocker
mock_data = mocker.canonical_demo_data()
All that is required to run the ampycloud algorithm is
`a properly formatted dataset <https://meteoswiss.github.io/ampycloud/running.html#the-input-data>`__.
At the moment, specifying ``geoloc`` and ``ref_dt`` serves no purpose other than to enhance
plots (should they be created). There are no special requirements for ``geoloc`` and ``ref_dt``:
as long as they are strings, you can set them to whatever you please.
.. important ::
ampycloud treats Vertical Visibility hits no differently than any other hit. Hence, it is up
@@ -261,7 +236,7 @@ def run(data : pd.DataFrame, geoloc : str = None, ref_dt : str = None) -> CeiloChunk:
logger.info('Starting an ampycloud run at %s', starttime)

# First, let's create a CeiloChunk instance ...
chunk = CeiloChunk(data, geoloc = geoloc, ref_dt = ref_dt)
chunk = CeiloChunk(data, geoloc = geoloc, ref_dt = str(ref_dt))

# Go through the ampycloud cascade:
# Run the slicing ...
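
With this change, ``ref_dt`` accepts either a string or a
``datetime.datetime`` instance. A minimal sketch of both call styles, using
the package's own mock data (the reference date is invented)::

    from datetime import datetime
    import ampycloud
    from ampycloud.utils import mocker

    mock_data = mocker.canonical_demo_data()

    # Both calls should be equivalent: a datetime instance gets turned
    # into a str via str(ref_dt) before it reaches CeiloChunk.
    chunk_a = ampycloud.run(mock_data, ref_dt='2022-02-02 10:00:00')
    chunk_b = ampycloud.run(mock_data, ref_dt=datetime(2022, 2, 2, 10, 0))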
src/ampycloud/data.py (57 changes: 15 additions & 42 deletions)
@@ -19,11 +19,10 @@
# Import from this package
from .errors import AmpycloudError
from .logger import log_func_call
from . import scaler
from . import cluster
from . import layer
from . import scaler, cluster, layer
from . import wmo, icao
from . import dynamic
from . import dynamic, hardcoded
from .utils import utils

# Instantiate the module logger
logger = logging.getLogger(__name__)
@@ -32,7 +31,7 @@ class AbstractChunk(ABC):
""" Abstract parent class for data chunk classes."""

#: dict: required data columns
DATA_COLS = {'ceilo': str, 'dt': float, 'alt': float, 'type': int}
DATA_COLS = copy.deepcopy(hardcoded.REQ_DATA_COLS)

@abstractmethod
def __init__(self, data : pd.DataFrame, geoloc : str = None, ref_dt : str = None) -> None:
@@ -44,8 +43,8 @@ def __init__(self, data : pd.DataFrame, geoloc : str = None, ref_dt : str = None) -> None:
self._msa = copy.deepcopy(dynamic.AMPYCLOUD_PRMS.MSA)
self._msa_hit_buffer = copy.deepcopy(dynamic.AMPYCLOUD_PRMS.MSA_HIT_BUFFER)

# Chunk data and required column names
self._data = self._cleanup_pdf(data)
# Assign the data using **a deep copy** to avoid messing with the original one.
self._data = self._cleanup_pdf(copy.deepcopy(data))

# Name of the geographic location of the observations
self._geoloc = geoloc
@@ -86,36 +85,10 @@ def _cleanup_pdf(self, data : pd.DataFrame) -> pd.DataFrame:
"""

# First things first, make sure I was fed a pandas DataFrame
if not isinstance(data, pd.DataFrame):
raise AmpycloudError('Ouch ! I was expecting data as a pandas DataFrame,'+
f' not: {type(data)}')

# Make sure the dataframe is not empty.
# Note: an empty dataframe = no measurements. This is NOT the same as "measuring" clear sky
# conditions, which would result in NaNs.
# If I have no measurements, I cannot issue a METAR. It would make no sense.
if len(data) == 0:
raise AmpycloudError("Ouch ! len(data) is 0. I can't work with no data !")

# Check that all the required columns are present in the data, with the correct format
for (col, type_req) in self.DATA_COLS.items():
# If the required column is missing, raise an Exception
if col not in data.columns:
raise AmpycloudError(f'Ouch ! Column {col} is missing from the input data.')
# If the column has the wrong data type, try to fix it on the fly.
if type_in := data[col].dtype != type_req:
logger.info('Adjusting the dtype of column %s from %s to %s',
col, type_in, type_req)
data[col] = data[col].astype(type_req)

# Drop any columns that I do not need for processing
for key in data.columns:
if key not in self.DATA_COLS.keys():
logger.info('Dropping the superfluous %s column from the input data.', key)
data.drop((key), axis=1, inplace=True)

# Drop any hits that are too high
# Begin with a thorough inspection of the dataset
data = utils.check_data_consistency(data, req_cols=self.DATA_COLS)

# Then also drop any hits that are too high
if self.msa is not None:
hit_alt_lim = self.msa + self.msa_hit_buffer
logger.info('Cropping hits above MSA+buffer: %s ft', str(hit_alt_lim))
@@ -222,10 +195,10 @@ def data_rescaled(self, dt_mode : str = None, alt_mode : str = None,
out = copy.deepcopy(self.data)

# Deal with the dt first
out['dt'] = scaler.scaling(out['dt'], dt_mode, **dt_kwargs)
out['dt'] = scaler.apply_scaling(out['dt'], dt_mode, **dt_kwargs)

# Then the altitudes
out['alt'] = scaler.scaling(out['alt'], alt_mode, **alt_kwargs)
out['alt'] = scaler.apply_scaling(out['alt'], alt_mode, **alt_kwargs)

return out
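
Note the rename of ``scaler.scaling`` to ``scaler.apply_scaling``. A sketch of
a direct call on the data of a ``chunk`` returned by ``ampycloud.run()``; the
mode name is a placeholder for whatever modes the scaler module actually
supports::

    from ampycloud import scaler

    # 'some_mode' is purely illustrative - check the scaler module for
    # the valid scaling modes and their keyword arguments.
    rescaled_dts = scaler.apply_scaling(chunk.data['dt'], 'some_mode')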

@@ -624,11 +597,11 @@ def find_layers(self) -> None:
# 1) Layer density is large enough
cond1 = self.groups.at[ind, 'okta'] < \
dynamic.AMPYCLOUD_PRMS.LAYERING_PRMS.min_okta_to_split
# 2) I have more than one valid point !
cond2 = len(gro_alts[~np.isnan(gro_alts)]) == 1
# 2) I have at least three valid points (since I will look for up to 3 components)
cond2 = len(gro_alts[~np.isnan(gro_alts)]) < 3
if cond1 or cond2:
# Add some useful info to the log
logger.info('Skipping the layering because: MIN_OKTA [%s] | 1PT [%s]',
logger.info('Skipping the layering because: MIN_OKTA [%s] | 3PT [%s]',
cond1, cond2)
# Here, set ncomp to -1 to show clearly that I did NOT actually check it ...
self.groups.at[ind, 'ncomp'] = -1
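
The new ``cond2`` requirement can be illustrated with a short sketch (the
altitude values are invented)::

    import numpy as np

    # Hypothetical group altitudes, with one NaN (i.e. no detection)
    gro_alts = np.array([1200., np.nan, 1250.])

    # Skip the layering if fewer than 3 valid points remain, since the
    # Gaussian Mixture Model step looks for up to 3 components.
    cond2 = len(gro_alts[~np.isnan(gro_alts)]) < 3
    print(cond2)  # True: only 2 valid points here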
src/ampycloud/hardcoded.py (14 changes: 14 additions & 0 deletions)
@@ -0,0 +1,14 @@
"""
Copyright (c) 2022 MeteoSwiss, contributors listed in AUTHORS.
Distributed under the terms of the 3-Clause BSD License.
SPDX-License-Identifier: BSD-3-Clause
Module contains: hardcoded data
"""

from pandas import StringDtype

#: dict: the columns & associated types required for the pandas DataFrame fed to ampycloud.
REQ_DATA_COLS = {'ceilo': StringDtype(), 'dt': float, 'alt': float, 'type': int}
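
As seen in the ``data.py`` diff above, classes take a deep copy of this
dictionary rather than referencing it directly, so that per-instance tweaks
cannot leak back into the package-wide default::

    import copy
    from ampycloud import hardcoded

    DATA_COLS = copy.deepcopy(hardcoded.REQ_DATA_COLS)
    assert DATA_COLS == hardcoded.REQ_DATA_COLS      # same content ...
    assert DATA_COLS is not hardcoded.REQ_DATA_COLS  # ... distinct object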
src/ampycloud/layer.py (4 changes: 2 additions & 2 deletions)
@@ -19,7 +19,7 @@
# Import from this module
from .errors import AmpycloudError, AmpycloudWarning
from .logger import log_func_call
from .scaler import minmax_scaling
from .scaler import minmax_scale
from .utils import utils

# Instantiate the module logger
@@ -195,7 +195,7 @@ def ncomp_from_gmm(vals : np.ndarray,

# Rescale the data if warranted
if rescale_0_to_x is not None:
vals = minmax_scaling(vals, min_range=0) * rescale_0_to_x
vals = minmax_scale(vals) * rescale_0_to_x

# I will only look for at most 3 layers.
ncomp = np.array([1, 2, 3])
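
A sketch of the renamed helper, assuming (as its name and the usage above
suggest) that ``minmax_scale`` maps its input onto the [0, 1] range::

    import numpy as np
    from ampycloud.scaler import minmax_scale

    vals = np.array([1200., 1500., 1800.])

    # Map onto [0, 1], then stretch onto [0, rescale_0_to_x]
    rescale_0_to_x = 10
    rescaled = minmax_scale(vals) * rescale_0_to_x  # -> [0., 5., 10.]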