diff --git a/CHANGELOG b/CHANGELOG index 2c30975..9678f89 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,15 +4,18 @@ The format is inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0 This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [v0.3.0] +## [v0.3.0.dev0] ### Added: + - [fpavogt, 2022-01-27] Add tests for the .plots.secondary module. ### Fixed: + - [fpavogt, 2022-01-27] Fix #66 and #67. + - [fpavogt, 2022-01-26] Fix #62, #63 and #64. ### Changed: + - [fpavogt, 2022-02-01] Implement review feedback, including name change for `scaled` to `apply_scaling`. ### Deprecated: ### Removed: ### Security: - ## [v0.2.1.dev0] ### Fixed: - [fpavogt, 2022-01-21] Fix #58. diff --git a/docs/source/running.rst b/docs/source/running.rst index 55cea6b..a9f33e7 100644 --- a/docs/source/running.rst +++ b/docs/source/running.rst @@ -5,6 +5,45 @@ Using ampycloud ================= +A no-words example for those that want to get started quickly +------------------------------------------------------------- + +:: + + from datetime import datetime + import ampycloud + from ampycloud.utils import mocker + from ampycloud.plots import diagnostic + + # Generate the canonical demo dataset for ampycloud + # Your data should have *exactly* this structure + mock_data = mocker.canonical_demo_data() + + # Run the ampycloud algorithm on it + chunk = ampycloud.run(mock_data, geoloc='Mock data', ref_dt=datetime.now()) + + # Get the resulting METAR message + print(chunk.metar_msg()) + + # Display the full information available for the layers found + print(chunk.layers) + + # And for the most motivated, plot the diagnostic diagram + diagnostic(chunk, upto='layers', show=True, save_stem='ampycloud_demo') + + +The input data +-------------- + +The ampycloud algorithm is meant to process cloud base *hits* from ceilometer observations. 
A given +set of hits to be processed by the ampycloud package must be stored inside a +:py:class:`pandas.DataFrame` with a specific set of characteristics outlined below. Users can use +the following utility function to check whether a given :py:class:`pandas.DataFrame` meets all the +requirements of ampycloud. + +.. autofunction:: ampycloud.utils.utils.check_data_consistency + :noindex: + .. _running: Running the algorithm diff --git a/src/ampycloud/core.py b/src/ampycloud/core.py index 57aa478..d142669 100644 --- a/src/ampycloud/core.py +++ b/src/ampycloud/core.py @@ -155,52 +155,27 @@ def reset_prms() -> None: dynamic.AMPYCLOUD_PRMS = dynamic.get_default_prms() @log_func_call(logger) -def run(data : pd.DataFrame, geoloc : str = None, ref_dt : str = None) -> CeiloChunk: +def run(data : pd.DataFrame, geoloc : str = None, + ref_dt : Union[str, datetime] = None) -> CeiloChunk: """ Runs the ampycloud algorithm on a given dataset. Args: - data (pd.DataFrame): the data to be processed, as a py:class:`pandas.DataFrame`. + data (pd.DataFrame): the data to be processed, as a :py:class:`pandas.DataFrame`. geoloc (str, optional): the name of the geographic location where the data was taken. Defaults to None. - ref_dt (str, optional): reference date and time of the observations, corresponding to - Delta t = 0. Defaults to None. + ref_dt (str|datetime.datetime, optional): reference date and time of the observations, + corresponding to Delta t = 0. Defaults to None. Note that if a datetime instance + is specified, it will be turned almost immediately to str via ``str(ref_dt)``. Returns: :py:class:`.data.CeiloChunk`: the data chunk with all the processing outcome bundled cleanly. - All that is required to run the ampycloud algorithm is a properly formatted dataset. At the - moment, specifying ``geoloc`` and ``ref_dt`` serves no purpose other than to enhance plots - (should they be created). 
There is no special requirements for ``geoloc`` and ``ref_dt``: as - long as they are strings, you can set them to whatever you please. - - The input ``data`` must be a :py:class:`pandas.DataFrame` with the following column names - (types): - :: - - 'ceilo' (str), 'dt' (float), 'alt' (float), 'type' (int) - - The ``ceilo`` columns contains the names/ids of the ceilometers as ``str``. - - The ``dt`` column contains time deltas, in s, between a given ceilometer observation and - ``ref_dt``. - - The ``alt`` column contains the cloud base hit altitudes reported by the ceilometers, in ft - above ground. - - The ``type`` column contains integer that correspond to the hit *sequence id*. E.g. if a given - ceilometer is reporting multiple hits for a given timestep (corresponding to a cloud level 1, - cloud level 2, cloud level 3, etc ...), the ``type`` of these measurements could be ``1``, - ``2``, ``3``, ... Any data point with a ``type`` of ``-1`` will be flagged in the ampycloud - plots as a vertical Visibility (VV) hits, **but it will not be treated any differently than any - other regular hit**. Type ``0`` corresponds to no (cloud) detection. - - It is possible to obtain an example of the required ``data`` format from the - :py:func:`.utils.mocker.canonical_demo_data` routine of the package, like so: - :: - - from ampycloud.utils import mocker - mock_data = mocker.canonical_demo_data() + All that is required to run the ampycloud algorithm is + `a properly formatted dataset `__. + At the moment, specifying ``geoloc`` and ``ref_dt`` serves no purpose other than to enhance + plots (should they be created). There is no special requirements for ``geoloc`` and ``ref_dt``: + as long as they are strings, you can set them to whatever you please. .. important :: ampycloud treats Vertical Visibility hits no differently than any other hit. 
Hence, it is up @@ -261,7 +236,7 @@ def run(data : pd.DataFrame, geoloc : str = None, ref_dt : str = None) -> CeiloC logger.info('Starting an ampycloud run at %s', starttime) # First, let's create an CeiloChunk instance ... - chunk = CeiloChunk(data, geoloc = geoloc, ref_dt = ref_dt) + chunk = CeiloChunk(data, geoloc = geoloc, ref_dt = str(ref_dt)) # Go through the ampycloud cascade: # Run the slicing ... diff --git a/src/ampycloud/data.py b/src/ampycloud/data.py index eacaff0..1cec10a 100644 --- a/src/ampycloud/data.py +++ b/src/ampycloud/data.py @@ -19,11 +19,10 @@ # Import from this package from .errors import AmpycloudError from .logger import log_func_call -from . import scaler -from . import cluster -from . import layer +from . import scaler, cluster, layer from . import wmo, icao -from . import dynamic +from . import dynamic, hardcoded +from .utils import utils # Instantiate the module logger logger = logging.getLogger(__name__) @@ -32,7 +31,7 @@ class AbstractChunk(ABC): """ Abstract parent class for data chunk classes.""" #: dict: required data columns - DATA_COLS = {'ceilo': str, 'dt': float, 'alt': float, 'type': int} + DATA_COLS = copy.deepcopy(hardcoded.REQ_DATA_COLS) @abstractmethod def __init__(self, data : pd.DataFrame, geoloc : str = None, ref_dt : str = None) -> None: @@ -44,8 +43,8 @@ def __init__(self, data : pd.DataFrame, geoloc : str = None, ref_dt : str = None self._msa = copy.deepcopy(dynamic.AMPYCLOUD_PRMS.MSA) self._msa_hit_buffer = copy.deepcopy(dynamic.AMPYCLOUD_PRMS.MSA_HIT_BUFFER) - # Chunk data and required column names - self._data = self._cleanup_pdf(data) + # Assign the data using **a deep copy** to avoid messing with the original one. 
+ self._data = self._cleanup_pdf(copy.deepcopy(data)) # Name of the geographic location of the observations self._geoloc = geoloc @@ -86,36 +85,10 @@ def _cleanup_pdf(self, data : pd.DataFrame) -> pd.DataFrame: """ - # First things first, make sure I was fed a pandas DataFrame - if not isinstance(data, pd.DataFrame): - raise AmpycloudError('Ouch ! I was expecting data as a pandas DataFrame,'+ - f' not: {type(data)}') - - # Make sure the dataframe is not empty. - # Note: an empty dataframe = no measurements. This is NOT the same as "measuring" clear sky - # conditions, which would result in NaNs. - # If I have no measurements, I cannot issue a METAR. It would make no sense. - if len(data) == 0: - raise AmpycloudError("Ouch ! len(data) is 0. I can't work with no data !") - - # Check that all the required columns are present in the data, with the correct format - for (col, type_req) in self.DATA_COLS.items(): - # If the requried column is missing, raise an Exception - if col not in data.columns: - raise AmpycloudError(f'Ouch ! Column {col} is missing from the input data.') - # If the column has the wrong data type, try to fix it on the fly. 
- if type_in := data[col].dtype != type_req: - logger.info('Adjusting the dtype of column %s from %s to %s', - col, type_in, type_req) - data[col] = data[col].astype(type_req) - - # Drop any columns that I do not need for processing - for key in data.columns: - if key not in self.DATA_COLS.keys(): - logger.info('Dropping the superfluous %s column from the input data.', key) - data.drop((key), axis=1, inplace=True) - - # Drop any hits that is too high + # Begin with a thorough inspection of the dataset + data = utils.check_data_consistency(data, req_cols=self.DATA_COLS) + + # Then also drop any hits that is too high if self.msa is not None: hit_alt_lim = self.msa + self.msa_hit_buffer logger.info('Cropping hits above MSA+buffer: %s ft', str(hit_alt_lim)) @@ -222,10 +195,10 @@ def data_rescaled(self, dt_mode : str = None, alt_mode : str = None, out = copy.deepcopy(self.data) # Deal with the dt first - out['dt'] = scaler.scaling(out['dt'], dt_mode, **dt_kwargs) + out['dt'] = scaler.apply_scaling(out['dt'], dt_mode, **dt_kwargs) # Then the altitudes - out['alt'] = scaler.scaling(out['alt'], alt_mode, **alt_kwargs) + out['alt'] = scaler.apply_scaling(out['alt'], alt_mode, **alt_kwargs) return out @@ -624,11 +597,11 @@ def find_layers(self) -> None: # 1) Layer density is large enough cond1 = self.groups.at[ind, 'okta'] < \ dynamic.AMPYCLOUD_PRMS.LAYERING_PRMS.min_okta_to_split - # 2) I have more than one valid point ! - cond2 = len(gro_alts[~np.isnan(gro_alts)]) == 1 + # 2) I have more than three valid points (since I will look for up to 3 components) + cond2 = len(gro_alts[~np.isnan(gro_alts)]) < 3 if cond1 or cond2: # Add some useful info to the log - logger.info('Skipping the layering because: MIN_OKTA [%s] | 1PT [%s]', + logger.info('Skipping the layering because: MIN_OKTA [%s] | 3PT [%s]', cond1, cond2) # Here, set ncomp to -1 to show clearly that I did NOT actually check it ... 
self.groups.at[ind, 'ncomp'] = -1 diff --git a/src/ampycloud/hardcoded.py b/src/ampycloud/hardcoded.py new file mode 100644 index 0000000..4c4f5f1 --- /dev/null +++ b/src/ampycloud/hardcoded.py @@ -0,0 +1,14 @@ +""" +Copyright (c) 2022 MeteoSwiss, contributors listed in AUTHORS. + +Distributed under the terms of the 3-Clause BSD License. + +SPDX-License-Identifier: BSD-3-Clause + +Module contains: hardcoded data +""" + +from pandas import StringDtype + +#: dict: the columns & associated types required for the pandas DataFrame fed to ampycloud. +REQ_DATA_COLS = {'ceilo': StringDtype(), 'dt': float, 'alt': float, 'type': int} diff --git a/src/ampycloud/layer.py b/src/ampycloud/layer.py index 35f29d5..a4bf515 100644 --- a/src/ampycloud/layer.py +++ b/src/ampycloud/layer.py @@ -19,7 +19,7 @@ # Import from this module from .errors import AmpycloudError, AmpycloudWarning from .logger import log_func_call -from .scaler import minmax_scaling +from .scaler import minmax_scale from .utils import utils # Instantiate the module logger @@ -195,7 +195,7 @@ def ncomp_from_gmm(vals : np.ndarray, # Rescale the data if warranted if rescale_0_to_x is not None: - vals = minmax_scaling(vals, min_range=0) * rescale_0_to_x + vals = minmax_scale(vals) * rescale_0_to_x # I will only look for at most 3 layers. ncomp = np.array([1, 2, 3]) diff --git a/src/ampycloud/plots/diagnostics.py b/src/ampycloud/plots/diagnostics.py index 62fa7a2..0f464f0 100644 --- a/src/ampycloud/plots/diagnostics.py +++ b/src/ampycloud/plots/diagnostics.py @@ -19,9 +19,9 @@ from matplotlib import rcParams # Import from this package -from ..scaler import scaling +from .. import scaler from .hardcoded import WIDTH_TWOCOL, MRKS -from .tools import texify +from .tools import texify, get_scaling_kwargs from .. import wmo from ..data import CeiloChunk from .. 
import dynamic @@ -197,8 +197,8 @@ def show_slices(self) -> None: overlap = dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.overlap # Get some fake data spanning the entire data range - misc = np.arange(np.nanmin(self._chunk.data['dt']), - np.nanmax(self._chunk.data['dt']), 1) + misc = np.linspace(np.nanmin(self._chunk.data['dt']), + np.nanmax(self._chunk.data['dt']), 3) self._axs[0].fill_between(misc, np.ones_like(misc) * (slice_mean - overlap * slice_std), np.ones_like(misc) * (slice_mean + overlap * slice_std), @@ -417,47 +417,41 @@ def format_primary_axes(self) -> None: def format_slice_axes(self) -> None: """ Format the duplicate axes related to the slicing part. - Todo: - Cleanup the code once #25 is fixed. - """ - # First, get the scaling parameters, and switch them over to a 'descale' mode ... - # TODO: Once issue #25 is fixed, maybe update these lines ... - de_dt_scale_kwargs = {} - de_dt_scale_kwargs.update(dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.dt_scale_kwargs) - de_dt_scale_kwargs['mode'] = 'descale' - de_alt_scale_kwargs = {} - de_alt_scale_kwargs.update(dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.alt_scale_kwargs) - de_alt_scale_kwargs['mode'] = 'descale' - # Here, only proceed if I have actually found some slices ! if self._chunk.n_slices > 0: - # Here, I need to add some vital information to the de_alt_scaling parameters - # Essentially, we need to set min_/max_val items so we can actually "undo" the scaling - # and generate the appropriate side-axis. 
- if dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.alt_scale_mode == 'minmax': - de_alt_scale_kwargs['min_val'] = np.nanmin(self._chunk.data['alt']) - de_alt_scale_kwargs['max_val'] = np.nanmax(self._chunk.data['alt']) - if dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.dt_scale_mode == 'minmax': - de_dt_scale_kwargs['min_val'] = np.nanmin(self._chunk.data['dt']) - de_dt_scale_kwargs['max_val'] = np.nanmax(self._chunk.data['dt']) + # In order to show secondary_axis, we need to feed the forward/reverse scaling function + # with the actual ones used in the code. Since these are dependant upon the data, + # we need to derive them by hand given what was requested by the user. + (dt_scale_kwargs, dt_descale_kwargs) = \ + get_scaling_kwargs(self._chunk.data['dt'].values, + dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.dt_scale_mode, + dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.dt_scale_kwargs) + (alt_scale_kwargs, alt_descale_kwargs) = \ + get_scaling_kwargs(self._chunk.data['alt'].values, + dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.alt_scale_mode, + dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.alt_scale_kwargs) # Then add the secondary axis, using partial function to define the back-and-forth # conversion functions. 
secax_x = self._axs[0].secondary_xaxis(1.06, - functions=(partial(scaling, fct=dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.dt_scale_mode, - **dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.dt_scale_kwargs), - partial(scaling, fct=dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.dt_scale_mode, - **de_dt_scale_kwargs))) + functions=(partial(scaler.apply_scaling, + fct=dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.dt_scale_mode, + **dt_scale_kwargs), + partial(scaler.apply_scaling, + fct=dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.dt_scale_mode, + **dt_descale_kwargs))) secax_y = self._axs[0].secondary_yaxis(1.03, - functions=(partial(scaling, fct=dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.alt_scale_mode, - **dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.alt_scale_kwargs), - partial(scaling, fct=dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.alt_scale_mode, - **de_alt_scale_kwargs))) + functions=(partial(scaler.apply_scaling, + fct=dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.alt_scale_mode, + **alt_scale_kwargs), + partial(scaler.apply_scaling, + fct=dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.alt_scale_mode, + **alt_descale_kwargs))) # Add the axis labels secax_x.set_xlabel(texify(r'\smaller Slicing $\Delta t$')) @@ -470,46 +464,41 @@ def format_slice_axes(self) -> None: def format_group_axes(self) -> None: """ Format the duplicate axes related to the grouping part. - Todo: - Cleanup the code once #25 is fixed. - """ - # First, get the scaling parameters, and switch them over to a 'descale' mode ... - # TODO: once issue #25 is fixed, maybe update these lines ... 
- de_dt_scale_kwargs = {} - de_dt_scale_kwargs.update(dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.dt_scale_kwargs) - de_dt_scale_kwargs['mode'] = 'descale' - de_alt_scale_kwargs = {} - de_alt_scale_kwargs.update(dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.alt_scale_kwargs) - de_alt_scale_kwargs['mode'] = 'descale' - - # Here, I need to add some vital information to the de_alt_scaling parameters - # Essentially, we need to set min_/max_val items so we can actually "undo" the scaling - # and generate the appropriate side-axis. - if dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.alt_scale_mode == 'minmax': - de_alt_scale_kwargs['min_val'] = np.nanmin(self._chunk.data['alt']) - de_alt_scale_kwargs['max_val'] = np.nanmax(self._chunk.data['alt']) - if dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.dt_scale_mode == 'minmax': - de_dt_scale_kwargs['min_val'] = np.nanmin(self._chunk.data['dt']) - de_dt_scale_kwargs['max_val'] = np.nanmax(self._chunk.data['dt']) - # Only proceed if I have found some clusters ... if self._chunk.n_groups > 0: + # In order to show secondary_axis, we need to feed the forward/reverse scaling function + # with the actual ones used in the code. Since these are dependant upon the data, + # we need to derive them by hand given what was requested by the user. + (dt_scale_kwargs, dt_descale_kwargs) = \ + get_scaling_kwargs(self._chunk.data['dt'].values, + dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.dt_scale_mode, + dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.dt_scale_kwargs) + + (alt_scale_kwargs, alt_descale_kwargs) = \ + get_scaling_kwargs(self._chunk.data['alt'].values, + dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.alt_scale_mode, + dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.alt_scale_kwargs) + # Then add the secondary axis, using partial function to define the back-and-forth # conversion functions. 
secax_x = self._axs[0].secondary_xaxis(1.25, - functions=(partial(scaling, fct=dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.dt_scale_mode, - **dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.dt_scale_kwargs), - partial(scaling, fct=dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.dt_scale_mode, - **de_dt_scale_kwargs))) + functions=(partial(scaler.apply_scaling, + fct=dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.dt_scale_mode, + **dt_scale_kwargs), + partial(scaler.apply_scaling, + fct=dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.dt_scale_mode, + **dt_descale_kwargs))) secax_y = self._axs[0].secondary_yaxis(1.14, - functions=(partial(scaling, fct=dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.alt_scale_mode, - **dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.alt_scale_kwargs), - partial(scaling, fct=dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.alt_scale_mode, - **de_alt_scale_kwargs))) + functions=(partial(scaler.apply_scaling, + fct=dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.alt_scale_mode, + **alt_scale_kwargs), + partial(scaler.apply_scaling, + fct=dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS.alt_scale_mode, + **alt_descale_kwargs))) # Add the axis labels secax_x.set_xlabel(texify(r'\smaller Grouping $\Delta t$')) diff --git a/src/ampycloud/plots/secondary.py b/src/ampycloud/plots/secondary.py index 2d17f0a..35698e7 100644 --- a/src/ampycloud/plots/secondary.py +++ b/src/ampycloud/plots/secondary.py @@ -9,6 +9,7 @@ """ # Import from Python +from typing import Union import logging import numpy as np import matplotlib.pyplot as plt @@ -16,7 +17,7 @@ # Import from this package from ..logger import log_func_call -from ..scaler import scaling +from ..scaler import apply_scaling from .. import dynamic from .hardcoded import WIDTH_TWOCOL from .tools import texify, set_mplstyle @@ -26,7 +27,8 @@ @set_mplstyle @log_func_call(logger) -def scaling_fcts(show: bool = True, save : bool = False) -> None: +def scaling_fcts(show: bool = True, + save_stem : str = None, save_fmts : Union[list, str] = None) -> None: """ Plots the different scaling functions. 
This is a small utility routine to rapidly see the different altitude scaling options used by @@ -36,18 +38,27 @@ def scaling_fcts(show: bool = True, save : bool = False) -> None: Args: show (bool, optional): show the plot, or not. Defaults to True. - save (bool, optional): save the plot to pdf, or not. Defaults to False. + save_stem (str, optional): if set, will save the plot with this stem (which can include a + path as well). Defaults to None. + save_fmts (list|str, optional): a list of file formats to export the plot to. Defaults to + None = ['png']. - """ - logger.info('Plotting style: %s', dynamic.AMPYCLOUD_PRMS.MPL_STYLE) + Example: + :: + + from ampycloud.plots.secondary import scaling_fcts + + scaling_fcts(show=True, save_stem='ampycloud_scaling_fcts', save_fmts=['pdf']) + + """ # Create the figure, with a suitable width. fig = plt.figure(figsize=(WIDTH_TWOCOL, 4.0)) # Use gridspec for a fine control of the figure area. fig_gs = gridspec.GridSpec(1, 3, height_ratios=[1], width_ratios=[1, 1, 1], - left=0.09, right=0.96, bottom=0.18, top=0.9, + left=0.07, right=0.96, bottom=0.18, top=0.9, wspace=0.15, hspace=0.05) ax1 = fig.add_subplot(fig_gs[0, 0]) @@ -58,25 +69,29 @@ def scaling_fcts(show: bool = True, save : bool = False) -> None: alts = np.arange(0, 25000, 10) # Plot the slicing scale - ax1.plot(alts, scaling(alts, fct='const', **{'scale':10}), c='k', lw=2) - ax1.set_title(texify(r'\smaller const')) + ax1.plot(alts, apply_scaling(alts, fct='shift-and-scale', **{'scale':1000}), c='k', lw=2) + ax1.set_title(texify(r'\smaller shift-and-scale')) - ax2.plot(alts, scaling(alts, fct='minmax'), c='k', lw=2) - ax2.set_title(texify(r'\smaller minmax')) + ax2.plot(alts, apply_scaling(alts, fct='minmax-scale'), c='k', lw=2) + ax2.set_title(texify(r'\smaller minmax-scale')) - ax3.plot(alts, scaling(alts, fct='step', + ax3.plot(alts, apply_scaling(alts, fct='step-scale', **dynamic.AMPYCLOUD_PRMS.GROUPING_PRMS['alt_scale_kwargs']), c='k', lw=2) - 
ax3.set_title(texify(r'\smaller step')) + ax3.set_title(texify(r'\smaller step-scale')) for ax in [ax1, ax2, ax3]: - ax.set_xlabel('Alt. [ft]') + ax.set_xlabel('x') + + ax1.set_ylabel('Scaled x') - ax1.set_ylabel('Scaled Atl.') + if save_fmts is None: + save_fmts = ['png'] - if save: - fn = 'ampycloud_scaling_fcts.pdf' - plt.savefig(fn) - logger.info('%s saved to disk.', fn) + if save_stem is not None: + for fmt in save_fmts: + out = f'{save_stem}.{fmt}' + plt.savefig(out) + logger.info('%s saved to disk.', out) if show: plt.show() diff --git a/src/ampycloud/plots/tools.py b/src/ampycloud/plots/tools.py index 66ac9f0..531cd05 100644 --- a/src/ampycloud/plots/tools.py +++ b/src/ampycloud/plots/tools.py @@ -13,6 +13,8 @@ from typing import Callable from functools import wraps from pathlib import Path +import copy +import numpy as np import matplotlib.pyplot as plt from matplotlib import rcParams import yaml @@ -20,7 +22,7 @@ # Import from ampycloud from ..errors import AmpycloudError from ..logger import log_func_call -from .. import dynamic +from .. import dynamic, scaler # Instantiate the module logger logger = logging.getLogger(__name__) @@ -145,3 +147,44 @@ def texify(msg : str) -> str: msg = msg.replace(r'\it', r'') return msg + +@log_func_call(logger) +def get_scaling_kwargs(data : np.ndarray, mode: str, kwargs : dict) -> tuple: + """ Utility function to extract the **actual, deterministic** parameters required to scale the + data, given a set of user-defined parameters. + + This is a utility function to aid in the drawing of secondary axis that require to derive the + "reverse scaling function". + + Args: + data (pd.Series): the data that was originally scaled by the user. + mode (str): the name of the scaling used by the user. Must be any mode supported by + :py:func:`ampycloud.scaler.convert_kwargs` (e.g. ``shift-and-scale``, ``minmax-scale``, + ``step-scale``). + kwargs (dict): the scaling parameter set by the user. 
+ + Return: + tuple: (scale_kwargs, descale_kwargs), the two dict with parameters for the forward/backward + scaling. + + Todo: + Cleanup the code once #25 is fixed. + + """ + + # TODO: Once issue #25 is fixed, maybe update these lines ... + # Let's create a storage dict, and fill it with what I got from the user. + scale_kwargs = {} + scale_kwargs.update(kwargs) + + # Then let's also be explicit about the scaling mode + scale_kwargs['mode'] = 'do' + + # Use the actual scaler routine used to convert "user params" into "scaling params" + scale_kwargs = scaler.convert_kwargs(data, mode, **scale_kwargs) + + # Now get a copy, change the mode to 'undo', and we're done ! + descale_kwargs = copy.deepcopy(scale_kwargs) + descale_kwargs['mode'] = 'undo' + + return (scale_kwargs, descale_kwargs) diff --git a/src/ampycloud/prms/ampycloud_default_prms.yml b/src/ampycloud/prms/ampycloud_default_prms.yml index e5d54fa..f6583f7 100644 --- a/src/ampycloud/prms/ampycloud_default_prms.yml +++ b/src/ampycloud/prms/ampycloud_default_prms.yml @@ -42,17 +42,15 @@ SLICING_PRMS: linkage: average affinity: manhattan # Time scaling mode - dt_scale_mode: const + dt_scale_mode: shift-and-scale # Time scaling parameters dt_scale_kwargs: scale: 100000 - mode: scale # Altitude scaling mode - alt_scale_mode: minmax + alt_scale_mode: minmax-scale # Altitude scaling parameters alt_scale_kwargs: min_range: 1000 - mode: scale # Clustering parameters GROUPING_PRMS: @@ -65,18 +63,16 @@ linkage: single affinity: manhattan # Time scaling mode - dt_scale_mode: const + dt_scale_mode: shift-and-scale # Time scaling parameters dt_scale_kwargs: scale: 180 - mode: scale # Altitude scaling mode - alt_scale_mode: step + alt_scale_mode: step-scale # Altitude scaling parameters alt_scale_kwargs: steps: [8000, 14000] scales: [100, 500, 1000] - mode: scale # Distance from the mean altitude (in std) below which two slices are considered "overlapping" overlap: 2.0 diff --git a/src/ampycloud/scaler.py
b/src/ampycloud/scaler.py index 3455c60..231d92a 100644 --- a/src/ampycloud/scaler.py +++ b/src/ampycloud/scaler.py @@ -21,100 +21,106 @@ logger = logging.getLogger(__name__) @log_func_call(logger) -def const_scaling(vals : np.ndarray, - scale : Union[int, float], mode : str = 'scale') -> np.ndarray: - """ Scale (or descale) values by a constant. +def shift_and_scale(vals : np.ndarray, shift : Union[int, float] = None, + scale : Union[int, float] = 1, mode : str = 'do') -> np.ndarray: + """ Shift (by a constant) and scale (by a constant) the data. Args: - vals (ndarray): values to (de-)scale. - scale (int|float): the scaling value. - mode (str, optional): whether to 'scale' or 'descale', i.e. undo the scaling. + vals (ndarray): values to (un-)shift-and-scale. + shift (int|float, optional): amount to shift the data by. If not specified, it will be set + to ``max(vals)``. + scale (int|float, optional): the scaling value. Defaults to 1. + mode (str, optional): whether to 'do' or 'undo' the shift-and-scale. Returns: - ndarray: vals/scale if mode=='scale', and val * scale if mode=='descale'. + np.ndarray: the (un-)shifted-and-scaled array. + + This function converts x to (x-shift)/scale if ``mode='do'``, and to x * scale + shift if + ``mode='undo'``. + """ - if mode == 'scale': - return vals/scale + # Let's deal with None shift ... this is important to ensure that the ampycloud algorithm + # always deals with the respective time separation between measurements, when doing the + # clustering. + if shift is None: + shift = np.nanmax(vals) - if mode == 'descale': - return vals * scale + # Implement the scaling routine + if mode == 'do': + return (vals-shift)/scale + if mode == 'undo': + return vals * scale + shift raise AmpycloudError(f' Ouch ! 
mode unknown: {mode}') @log_func_call(logger) -def minmax_scaling(vals : np.ndarray, - min_range : Union[float, int] = 0, - mode : str = 'scale', - min_val : Union[float, int] = None, - max_val : Union[float, int] = None) -> np.ndarray: - """ Rescale the data onto a [0, 1] interval, possibly using a minimum interval range. - - Note: - Descaling with this function is a little bit annoying, because the data itself does not - contain enough information to know what interval vals should be mapped onto. - Hence the min_val and max_val kwargs, so that the user can tell us ... +def minmax_scale(vals : np.ndarray, + min_val : Union[float, int] = None, + max_val : Union[float, int] = None, + mode : str = 'do') -> np.ndarray: + """ Rescale the data onto a [0, 1] interval, possibly forcing a specific and/or minimum + interval range. Args: - vals (ndarray): values to (de-)scale. - min_range (int|float, optional): will map the [0, 1] interval to - [val_mid-min_range/2, val_mid+min_range/2] rather than [val_min, val_max], - if val_max-val_min < min_range. Note: val_mid=(val_max+val_min)/2. Must be >=0. - Defaults to 0. + vals (ndarray): values to (un-)minmax-scale. mode (str, optional): whether to 'scale' or 'descale', i.e. undo the scaling. - min_val (int|float, optional): min (target) interval value, in case mode='descale'. + min_val (int|float, optional): value to be mapped to 0. If not set, will be ``min(vals)``. Defaults to None. - max_val (int|float, optional): max (target) interval value, in case mode='descale'. + max_val (int|float, optional): value to be mapped to 1. If not set, will be ``max(vals)``. Defaults to None. Returns: - ndarray: scaled/descaled values. + ndarray: The (un-)minmax-scaled array. """ - # Complain if min_range is smaller than 0. Else, I risk dividing by 0 in certai circumstances - # (i.e. when a single point is being fed). - if min_range < 0: - raise AmpycloudError(f'Ouch ! 
min_range must be >0, and not: {min_range}') - - if mode == 'scale': - # What is the range of the data - val_range = np.nanmax(vals) - np.nanmin(vals) - val_mid = (np.nanmax(vals) + np.nanmin(vals))/2 - - # Deal with the cases where all the values are the same by returning nans - if val_range == 0: - # If the user did not set a min_range, let's do it for them - if min_range == 0: - min_range = 1 - - # Else, deal with the values as usual ... - if val_range >= min_range: - mymax = np.nanmax(vals) - mymin = np.nanmin(vals) - # ... unless I have a range smaller than the min_range specified by the user. - else: - mymax = val_mid + min_range/2 - mymin = val_mid - min_range/2 + # Compute the "default" edges if warranted + if min_val is None: + min_val = np.nanmin(vals) + if max_val is None: + max_val = np.nanmax(vals) - return (vals-mymin)/(mymax-mymin) + if mode == 'do': + return (vals-min_val)/(max_val-min_val) - if mode == 'descale': - # Compute the original range and mid point - val_range = max_val - min_val - val_mid = (max_val + min_val)/2 + if mode == 'undo': + return vals * (max_val - min_val) + min_val - # Descale, properly handling the case where min_range matters. - if val_range >= min_range: - return vals * (max_val - min_val) + min_val + raise AmpycloudError(f' Ouch ! mode unknown: {mode}') - return vals * min_range + (val_mid - min_range/2) +@log_func_call(logger) +def minrange2minmax(vals : np.ndarray, min_range : Union[int, float] = 0) -> tuple: + """ Transform a minimum range into a pair of min/max values. + + Args: + vals (np.ndarray): values to assess. + min_range (int|float, optional): mininum range to meet. Defaults to 0. + + Returns: + tuple: the min and max values of the data range of at least min_range in size. + + + Essentially, if max(vals)-min(vals) >= min_range, this function returns + ``[min(vals), max(vals)]``. Else, it returns ``[val_mid-min_range/2, val_mid+min_range/2]``, + with ```val_mid=(max(vals)+min(vals))/2``. 
+ + """ + + val_range = np.nanmax(vals) - np.nanmin(vals) + if val_range >= min_range: + return (np.nanmin(vals), np.nanmax(vals)) + + # Compute the middle of the data + val_mid = (np.nanmax(vals) + np.nanmin(vals))/2 + + # Build a symetric range around it + return (val_mid - min_range/2, val_mid + min_range/2) - raise AmpycloudError(f' Ouch ! mode unknown: {mode}') @log_func_call(logger) -def step_scaling(vals : np.ndarray, - steps : list, scales : list, mode : str = 'scale') -> np.ndarray: +def step_scale(vals : np.ndarray, + steps : list, scales : list, mode : str = 'do') -> np.ndarray: """ Scales values step-wise, with different constants bewteen specific steps. Values are divided by scales[i] between steps[i-1:i]. @@ -128,10 +134,10 @@ def step_scaling(vals : np.ndarray, steps (list, optional): the step **edges**. E.g. [8000, 14000]. scales (list, optional): the scaling values (=dividers) for each step. E.g. [100, 500, 1000]. Must have len(scales) = len(steps)+1. - mode (str, optional): whether to 'scale' or 'descale', i.e. undo the scaling. + mode (str, optional): whether to 'do' or 'undo' the scaling. Returns: - ndarray: (de-)scaled values + ndarray: (un-)step-scaled values """ # Some sanity checks @@ -155,36 +161,119 @@ def step_scaling(vals : np.ndarray, out = np.full_like(vals, np.nan, dtype=float) # Start scaling things, one step after another - for (sid, scale) in enumerate(scales): + for (sid, sval) in enumerate(scales): # What is this specific step offset (to ensure continuity between steps) ? cont_corr = np.concatenate((np.array([steps[0]/scales[0]]), np.diff(steps)/scales[1:-1])) cont_corr = np.sum(cont_corr[:sid]) - if mode == 'scale': + if mode == 'do': # Which values belong to that step ? 
cond = (edges_in[sid] <= vals) * (vals < edges_in[sid+1]) # Apply the scaling - out[cond] = (vals[cond] - offsets[sid])/scale + cont_corr + out[cond] = (vals[cond] - offsets[sid])/sval + cont_corr - if mode == 'descale': + elif mode == 'undo': # Which value belongs to that step ? cond = (vals >= edges_out[sid]) * (vals < edges_out[sid+1]) # Apply the descaling - out[cond] = (vals[cond] - cont_corr) * scale + offsets[sid] + out[cond] = (vals[cond] - cont_corr) * sval + offsets[sid] + + else: + raise AmpycloudError(f'Ouch ! mode unknown: {mode}') return out @log_func_call(logger) -def scaling(vals : np.ndarray, fct : str = None, **kwargs : dict) -> np.ndarray: +def convert_kwargs(vals : np.ndarray, fct : str, **kwargs : dict) -> dict: + """ Converts the user-input keywords such that they can be fed to the underlying scaling + functions. + + Args: + vals (np.ndarray): the values to be processed. + fct (str): the scaling mode, e.g. 'shift-and-scale', etc .... + **kwargs: dict of keyword arguments to be converted, if warranted. + + Returns: + dict: the data-adjusted set of kwargs. + + Note: + This function was first introduced to accomodate the creation of a secondary axis on the + ampycloud diagnostic plots. It is a buffer that allows to separate "user" scaling + keywords from the "deterministic" scaling keywords required to get a specific scaling, no + matter the underlying dataset (as is required for plotting a secondary axis). + + Essentially, this function allows to feed either "user" or "deterministic" keywords to + :py:func:`.apply_scaling`, such that the former will be turned into the latter, and the latter will + remain untouched. + + """ + + if fct == 'shift-and-scale': + # In this case, the only data I may need to derive from the data is the shift. 
+ if 'shift' in kwargs.keys(): + # Already set - do nothing + return kwargs + if 'mode' in kwargs.keys(): + if kwargs['mode'] == 'do': + kwargs['shift'] = np.nanmax(vals) + elif kwargs['mode'] == 'undo': + raise AmpycloudError('Ouch ! I cannot get `shift` from the shift-and-scaled data !') + else: + raise AmpycloudError(f"Ouch ! mode unknown: {kwargs['mode']}") + return kwargs + + # 'mode' is not set -> it will be "do" by default. + kwargs['shift'] = np.nanmax(vals) + return kwargs + + if fct == 'minmax-scale': + # In this case, the challenge lies with identifying min_val and max_val, knowing that the + # user may specify a min_range value. + if 'min_val' in kwargs.keys() and 'max_val' in kwargs.keys(): + # Already specified ... do nothing + return kwargs + if 'mode' in kwargs.keys(): + if kwargs['mode'] == 'do': + if 'min_range' in kwargs.keys(): + min_range = kwargs['min_range'] + kwargs.pop('min_range', None) + else: + min_range = 0 + (kwargs['min_val'], kwargs['max_val']) = minrange2minmax(vals, min_range) + return kwargs + + if kwargs['mode']=='undo': + raise AmpycloudError('Ouch ! I cannot get `min_val` and `max_val` from' + + ' minmax-scaled data !') + + raise AmpycloudError(f"Ouch ! mode unknown: {kwargs['mode']}") + + # 'mode' not set -> will default to 'do' + if 'min_range' in kwargs.keys(): + min_range = kwargs['min_range'] + kwargs.pop('min_range', None) + else: + min_range = 0 + (kwargs['min_val'], kwargs['max_val']) = minrange2minmax(vals, min_range) + return kwargs + + if fct == 'step-scale': + # Nothing to be done here + return kwargs + + raise AmpycloudError(f'Ouch ! scaling fct unknown: {fct}') + +@log_func_call(logger) +def apply_scaling(vals : np.ndarray, fct : str = None, **kwargs : dict) -> np.ndarray: """ Umbrella scaling routine, that gathers all the individual ones under a single entry point. Args: vals (ndarray): values to scale. fct (str, optional): name of the scaling function to use. Can be one of - ['const', 'minmax', or 'step']. 
Defaults to None = do nothing. + ['shift-and-scale', 'minmax-scale', or 'step-scale']. Defaults to None = do nothing. **kwargs: keyword arguments that will be fed to the underlying scaling function. Returns: @@ -199,13 +288,16 @@ def scaling(vals : np.ndarray, fct : str = None, **kwargs : dict) -> np.ndarray: if np.all(np.isnan(vals)): return vals - if fct == 'const': - return const_scaling(vals, **kwargs) + # Process the user-supplied kwargs into kwargs I can feed the functions. + kwargs = convert_kwargs(vals, fct, **kwargs) + + if fct == 'shift-and-scale': + return shift_and_scale(vals, **kwargs) - if fct == 'minmax': - return minmax_scaling(vals, **kwargs) + if fct == 'minmax-scale': + return minmax_scale(vals, **kwargs) - if fct == 'step': - return step_scaling(vals, **kwargs) + if fct == 'step-scale': + return step_scale(vals, **kwargs) raise AmpycloudError(f'Ouch ! Scaling function name unknown: {fct}') diff --git a/src/ampycloud/utils/mocker.py b/src/ampycloud/utils/mocker.py index c20306d..bc6f8ac 100644 --- a/src/ampycloud/utils/mocker.py +++ b/src/ampycloud/utils/mocker.py @@ -18,6 +18,7 @@ from ..logger import log_func_call from ..errors import AmpycloudError from . import utils +from .. 
import hardcoded # Instantiate the module logger logger = logging.getLogger(__name__) @@ -156,6 +157,9 @@ def mock_layers(n_ceilos : int, lookback_time : float, hit_gap: float, if a>0 and np.isnan(alt): layers.drop(index=alts.index[a], inplace=True) + elif np.isnan(alt): + # A non-detection should be type 0 + layers.loc[alts.index[a], 'type'] = 0 else: layers.loc[alts.index[a], 'type'] = a+1 @@ -171,10 +175,8 @@ def mock_layers(n_ceilos : int, lookback_time : float, hit_gap: float, out = out.sort_values(['dt', 'alt']).reset_index(drop=True) # Fix the dtypes - out.loc[:, 'dt'] = out['dt'].astype(float) - out.loc[:, 'alt'] = out['alt'].astype(float) - out.loc[:, 'type'] = out['type'].astype(int) - out.loc[:, 'ceilo'] = out['ceilo'].astype(str) + for (col, tpe) in hardcoded.REQ_DATA_COLS.items(): + out.loc[:, col] = out[col].astype(tpe) return out diff --git a/src/ampycloud/utils/utils.py b/src/ampycloud/utils/utils.py index d64913a..1794910 100644 --- a/src/ampycloud/utils/utils.py +++ b/src/ampycloud/utils/utils.py @@ -10,12 +10,168 @@ # Import from Python import logging +import warnings import contextlib +import copy import numpy as np +import pandas as pd + +# Import from this package +from ..errors import AmpycloudError, AmpycloudWarning +from ..logger import log_func_call +from .. import hardcoded # Instantiate the module logger logger = logging.getLogger(__name__) +@log_func_call(logger) +def check_data_consistency(pdf : pd.DataFrame, + req_cols : dict = None) -> pd.DataFrame: + """ Assesses whether a given :py:class:`pandas.DataFrame` is compatible with the requirements + of ampycloud. + + Args: + pdf (pd.DataFrame): the data to check. + req_cols (dict): A dictionary in which keys correspond to the required columns, and their + value are the column type. Defaults to None = the ampycloud requirements. + + Returns: + pd.DataFrame: the data, possibly cleaned-up of superfluous columns, and with corrected + dtypes. 
+ + + This function will raise an :py:class:`ampycloud.errors.AmpycloudError` and/or + an :py:class:`ampycloud.errors.AmpycloudWarning` if it identifies very bad and/or very weird + things in ``pdf``. + + Specifically, the input ``pdf`` must be a :py:class:`pandas.DataFrame` with the following + column names/types (formally defined in :py:data:`ampycloud.hardcoded.REQ_DATA_COLS`): + :: + + 'ceilo'/pd.StringDtype(), 'dt'/float, 'alt'/float, 'type'/int + + The ``ceilo`` column contains the names/ids of the ceilometers as ``pd.StringDtype()``. + See `the pandas documentation `__ + for more info about this type. + + The ``dt`` column contains time deltas, in seconds, between a given ceilometer + observation and ``ref_dt`` (i.e. ``obs_time-ref_dt``). Ideally, ``ref_dt`` would be the issuing + time of the METAR message, such that ``dt`` values are negative, with the smallest one + corresponding to the oldest measurement. + + The ``alt`` column contains the cloud base hit altitudes reported by the ceilometers, in ft + above ground. + + The ``type`` column contains integers that correspond to the hit *sequence id*. If a given + ceilometer is reporting multiple hits for a given timestep (corresponding to a cloud level 1, + cloud level 2, cloud level 3, etc ...), the ``type`` of these measurements would be ``1``, + ``2``, ``3``, etc ... Any data point with a ``type`` of ``-1`` will be flagged in the ampycloud + plots as a vertical Visibility (VV) hits, **but it will not be treated any differently than any + other regular hit**. Type ``0`` corresponds to no (cloud) detection, in which case the + corresponding hit altitude should be a NaN. + + Important: + A **non-detection** corresponds to a valid measurement with a ``dt`` value, a ``type 0``, + and ``NaN`` as the altitude. It should not be confused with a **non-observation**, + when no data was acquired at all ! 
+
+    If it all sounds confusing, it is possible to obtain an example of the required data format
+    from the :py:func:`.utils.mocker.canonical_demo_data` routine of the package, like so::
+
+        from ampycloud.utils import mocker
+        mock_data = mocker.canonical_demo_data()
+
+    As mentioned above, it is also possible to verify if a given :py:class:`pandas.DataFrame` is
+    meeting the ampycloud requirements ahead of time via
+    :py:func:`ampycloud.utils.utils.check_data_consistency`::
+
+        from ampycloud.utils.utils import check_data_consistency
+        checked_pdf = check_data_consistency(pdf)
+
+    This will raise an :py:class:`ampycloud.errors.AmpycloudError` if:
+
+    * ``pdf`` is not a :py:class:`pandas.DataFrame`.
+    * ``pdf`` is missing a required column.
+    * ``pdf`` has a length of 0.
+
+    In addition, this will raise an :py:class:`ampycloud.errors.AmpycloudWarning` if:
+
+    * any of ``pdf`` column type is not as expected. Note that in this case, the code will try
+      to correct the type on the fly.
+    * ``pdf`` has any superfluous columns. In this case, the code will drop them automatically.
+    * Any hit altitude is negative.
+    * Any ``type 0`` hit has a non-NaN altitude.
+    * Any ``type 1`` hit has a NaN altitude.
+    * Any ``type 2`` hit does not have a coincident ``type 1`` hit.
+    * Any ``type 3`` hit does not have a coincident ``type 2`` hit.
+
+    """
+
+    # Begin by making a deep copy of the data to avoid messing with the user stuff
+    data = copy.deepcopy(pdf)
+
+    # Deal with the None default value for the columns
+    if req_cols is None:
+        req_cols = hardcoded.REQ_DATA_COLS
+
+    # First things first, make sure I was fed a pandas DataFrame
+    if not isinstance(data, pd.DataFrame):
+        raise AmpycloudError('Ouch ! I was expecting data as a pandas DataFrame,'+
+                             f' not: {type(data)}')
+
+    # Make sure the dataframe is not empty.
+    # Note: an empty dataframe = no measurements. This is NOT the same as "measuring" clear sky
+    # conditions, which would result in NaNs.
+ # If I have no measurements, I cannot issue a AutoMETAR. It would make no sense. + if len(data) == 0: + raise AmpycloudError("Ouch ! len(data) is 0. I can't work with no data !") + + # Check that all the required columns are present in the data, with the correct format + for (col, type_req) in req_cols.items(): + # If the required column is missing, raise an Exception. + if col not in data.columns: + raise AmpycloudError(f'Ouch ! Column {col} is missing from the input data.') + # If the column has the wrong data type, complain as well. + if (type_in := data[col].dtype) != type_req: + warnings.warn(f'Huh ! Column {col} has type {type_in} instead of {type_req}.', + AmpycloudWarning) + logger.warning('Adjusting the dtype of column %s from %s to %s', + col, type_in, type_req) + data[col] = data[col].astype(type_req) + + # Drop any columns that I do not need for processing + for key in data.columns: + if key not in req_cols.keys(): + warnings.warn(f'Huh ! Column {key} is not required by ampycloud.', + AmpycloudWarning) + logger.warning('Dropping the superfluous %s column from the input data.', key) + data.drop((key), axis=1, inplace=True) + + # A brief sanity check of the altitudes. We do not issue Errors, since the code can cope + # with those elements: we simply raise Warnings. + msgs = [] + if np.any(data.loc[:, 'alt'].values < 0): + msgs += ['Huh ! Some hit altitudes are negative ?!'] + if not np.all(np.isnan(data.loc[data.type == 0, 'alt'])): + msgs += ['Huh ! Some type=0 hits have non-NaNs altitude values ?!'] + if np.any(np.isnan(data.loc[data.type == 1, 'alt'])): + msgs += ['Huh ! Some type=1 hits have NaNs altitude values ?!'] + if not np.all(np.in1d(data.loc[data.type==2, 'dt'].values, + data.loc[data.type==1, 'dt'].values)): + msgs += ['Huh ! Some type=2 hits have no coincident type=1 hits ?!'] + if not np.all(np.in1d(data.loc[data.type==3, 'dt'].values, + data.loc[data.type==2, 'dt'].values)): + msgs += ['Huh ! 
Some type=3 hits have no coincident type=2 hits ?!'] + + # Now save all those messages to the log, and raise Warnings as well. + for msg in msgs: + warnings.warn(msg, AmpycloudWarning) + logger.warning(msg) + + # All done + return data + + @contextlib.contextmanager def tmp_seed(seed : int): """ Temporarily reset the :py:func:`numpy.random.seed` value. diff --git a/src/ampycloud/version.py b/src/ampycloud/version.py index 2aaea97..6525813 100644 --- a/src/ampycloud/version.py +++ b/src/ampycloud/version.py @@ -9,4 +9,4 @@ """ #:str: the one-and-only place where the ampycloud version is set. -VERSION = '0.2.1.dev0' +VERSION = '0.3.0.dev0' diff --git a/test/ampycloud/plots/test_core.py b/test/ampycloud/plots/test_core.py index 58f0cfc..4641d26 100644 --- a/test/ampycloud/plots/test_core.py +++ b/test/ampycloud/plots/test_core.py @@ -17,6 +17,41 @@ from ampycloud.core import demo from ampycloud.plots.core import diagnostic + +def test_large_dts(mpls): + """ Test that all goes well if the user sets very large dts. + + Args: + mpls: False, or the value of MPL_STYLE requested by the user. This is set automatically + by a fixture that fetches the corresponding command line argument. + See conftest.py for details. 
+
+    """
+
+    if mpls:
+        dynamic.AMPYCLOUD_PRMS.MPL_STYLE = mpls
+
+    # Let's create some data with very large dt values (no NaNs here)
+    data = pd.DataFrame([['1', 7e5, 1000, 1], ['1', 7e5+1, 1001, 1], ['1', 7e5+2, 1002, 1]],
+                        columns=['ceilo', 'dt', 'alt', 'type'])
+    data['ceilo'] = data['ceilo'].astype(pd.StringDtype())
+    data['dt'] = data['dt'].astype(float)
+    data['alt'] = data['alt'].astype(float)
+    data['type'] = data['type'].astype(int)
+
+    # Run ampycloud
+    chunk = run(data)
+
+    # Uncomment the line below once pytest 7.0 can be pip-installed
+    #with pytest.does_not_warn():
+    if True:
+        diagnostic(chunk, upto='layers', show_ceilos=False, show=False,
+                   save_stem='pytest_large_dts', save_fmts='png',
+                   ref_metar_origin='Large dts', ref_metar='OVC010')
+
+    assert Path('pytest_large_dts.png').exists
+
+
 def test_diagnostic(mpls):
     """ Test the raw_data plot.
@@ -43,7 +78,7 @@ def test_diagnostic(mpls):
 
     # Create the diagnsotic plots at the four different upto levels
     for sufx in sufxs:
         diagnostic(chunk, upto=sufx, show_ceilos=True, show=False,
-                   save_stem=base_name+sufx, save_fmts='png',
+                   save_stem=base_name+sufx, save_fmts=['pdf', 'png'],
                    ref_metar_origin='Mock data', ref_metar='FEW009 SCT018 BKN038')
 
     assert Path(base_name+sufx+'.pdf').exists
@@ -66,9 +101,9 @@ def test_empty_plot(mpls):
     dynamic.AMPYCLOUD_PRMS.MPL_STYLE = mpls
 
     # Let's create some data with only NaNs
-    data = pd.DataFrame([['1', -100, np.nan, 1], ['1', -99, np.nan, 1]],
+    data = pd.DataFrame([['1', -100, np.nan, 0], ['1', -99, np.nan, 0]],
                         columns=['ceilo', 'dt', 'alt', 'type'])
-    data['ceilo'] = data['ceilo'].astype(str)
+    data['ceilo'] = data['ceilo'].astype(pd.StringDtype())
    data['dt'] = data['dt'].astype(float)
    data['alt'] = data['alt'].astype(float)
    data['type'] = data['type'].astype(int)
diff --git a/test/ampycloud/plots/test_secondary.py b/test/ampycloud/plots/test_secondary.py
new file mode 100644
index 0000000..acede0e
--- /dev/null
+++ b/test/ampycloud/plots/test_secondary.py
@@ -0,0 +1,32 @@
+"""
+Copyright (c)
2022 MeteoSwiss, contributors listed in AUTHORS.
+
+Distributed under the terms of the 3-Clause BSD License.
+
+SPDX-License-Identifier: BSD-3-Clause
+
+Module content: tests for the plots.secondary module
+"""
+
+# Import from Python
+from pathlib import Path
+
+from ampycloud import dynamic
+from ampycloud.plots.secondary import scaling_fcts
+
+
+def test_scaling_fcts(mpls):
+    """ Test the scaling_fcts plot.
+
+    Args:
+        mpls: False, or the value of MPL_STYLE requested by the user. This is set automatically
+            by a fixture that fetches the corresponding command line argument.
+            See conftest.py for details.
+
+    """
+
+    if mpls:
+        dynamic.AMPYCLOUD_PRMS.MPL_STYLE = mpls
+
+    scaling_fcts(show=False, save_stem='pytest_scaling_fcts', save_fmts=['png'])
+    assert Path('pytest_scaling_fcts.png').exists
diff --git a/test/ampycloud/test_core.py b/test/ampycloud/test_core.py
index 00e5f94..e2b2073 100644
--- a/test/ampycloud/test_core.py
+++ b/test/ampycloud/test_core.py
@@ -12,11 +12,12 @@
 #Import from Python
 import os
 from pathlib import Path
+from datetime import datetime
 
 import numpy as np
 import pandas as pd
 
 # Import from ampycloud
-from ampycloud import dynamic, reset_prms
+from ampycloud import dynamic, hardcoded
 from ampycloud.utils import mocker
 from ampycloud.data import CeiloChunk
 from ampycloud.core import copy_prm_file, reset_prms, run, metar, demo
@@ -43,7 +44,7 @@ def test_reset_prms():
     ref_val = dynamic.AMPYCLOUD_PRMS.OKTA_LIM0
     dynamic.AMPYCLOUD_PRMS.OKTA_LIM0 = -1
     assert dynamic.AMPYCLOUD_PRMS.OKTA_LIM0 == -1
-    assert dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.dt_scale_mode == 'const'
+    assert dynamic.AMPYCLOUD_PRMS.SLICING_PRMS.dt_scale_mode == 'shift-and-scale'
 
     # Then try to reset it
     reset_prms()
@@ -81,12 +82,11 @@ def test_run_single_point():
     dynamic.OKTA_LIM0 = 0
 
     # Let's create some data with a single valid point
-    data = pd.DataFrame([['1', -100, 2000, 1], ['1', -99, np.nan, 1]],
+    data = pd.DataFrame([['1', -100, 2000, 1], ['1', -99, np.nan, 0]],
columns=['ceilo', 'dt', 'alt', 'type']) - data['ceilo'] = data['ceilo'].astype(str) - data['dt'] = data['dt'].astype(float) - data['alt'] = data['alt'].astype(float) - data['type'] = data['type'].astype(int) + # Set the proper column types + for (col, tpe) in hardcoded.REQ_DATA_COLS.items(): + data.loc[:, col] = data.loc[:, col].astype(tpe) # Run the code out = run(data) @@ -102,9 +102,20 @@ def test_run_single_point(): reset_prms() def test_demo(): - """ test the demo routine. """ + """ Test the demo routine. """ mock_data, chunk = demo() assert isinstance(chunk, CeiloChunk) assert isinstance(mock_data, pd.DataFrame) + +def test_datetime(): + """ Test the ability to feed datetime instances. """ + + # Get some random data + mock_data, _ = demo() + # Here, feed a datetime.datetime instance ... + out = run(mock_data, ref_dt=datetime(year=2022, month=1, day=26)) + + # and make sure it gets converted to str properly. + assert out.ref_dt == '2022-01-26 00:00:00' diff --git a/test/ampycloud/test_data.py b/test/ampycloud/test_data.py index 8380a1d..6d78797 100644 --- a/test/ampycloud/test_data.py +++ b/test/ampycloud/test_data.py @@ -10,13 +10,14 @@ # Import from Python import numpy as np +import pandas as pd from pytest import raises # Import from the module to test from ampycloud.errors import AmpycloudError from ampycloud.data import CeiloChunk from ampycloud.utils import mocker -from ampycloud import dynamic, reset_prms +from ampycloud import dynamic, reset_prms, hardcoded def test_ceilochunk_init(): """ Test the init method of the CeiloChunk class. """ @@ -167,3 +168,29 @@ def test_ceilochunk_2lay(): # Assert the final METAR code is correct assert chunk.metar_msg() == 'SCT009 SCT019' + +def test_layering_singlepts(): + """ Test the layering step when there is a single time steps. See #62 for the motivation. 
""" + + mock_data = pd.DataFrame(np.array([['dummy', -1, 2300, 1], + ['dummy', -1, 4000, 2], + ['dummy', -1, 4500, 3], + ['dummy', -1, np.nan, 0]]), + columns=['ceilo', 'dt', 'alt', 'type']) + + # Set the proper column types + for (col, tpe) in hardcoded.REQ_DATA_COLS.items(): + mock_data.loc[:, col] = mock_data.loc[:, col].astype(tpe) + + # Instantiate a CeiloChunk entity ... + chunk = CeiloChunk(mock_data) + + # Do the dance ... + chunk.find_slices() + chunk.find_groups() + chunk.find_layers() + + # Check that the GMM was never executed + assert np.all(chunk.groups.loc[:, 'ncomp'] == -1) + # Check that the resulting layers and groups are the same + assert np.all(chunk.groups.loc[:, 'code']==chunk.layers.loc[:, 'code']) diff --git a/test/ampycloud/test_scaler.py b/test/ampycloud/test_scaler.py index 38f3cfe..18c6d49 100644 --- a/test/ampycloud/test_scaler.py +++ b/test/ampycloud/test_scaler.py @@ -9,18 +9,31 @@ """ # Import from Python +from pytest import raises import numpy as np # Import from this package -from ampycloud.scaler import const_scaling, minmax_scaling, step_scaling, scaling - -def test_const_scaling(): - """ Test the const_scaling routine. """ - - assert const_scaling(np.ones(1), 10, mode='scale') == 0.1 - assert const_scaling(np.ones(1), 10, mode='descale') == 10 - -def test_minmax_scaling(): +from ampycloud.scaler import shift_and_scale, minmax_scale, minrange2minmax, step_scale +from ampycloud.scaler import convert_kwargs, apply_scaling +from ampycloud.errors import AmpycloudError + +def test_shift_and_scale(): + """ Test the shift_and_scale routine. 
""" + + assert shift_and_scale(np.ones(1), shift=0, scale=10, mode='do') == 0.1 + assert np.isnan(shift_and_scale(np.array([1, np.nan]), mode='do')[1]) + assert shift_and_scale(np.ones(1), scale=10, mode='do') == 0 + assert shift_and_scale(np.ones(1), shift=0, mode='do') == 1 + assert np.all(shift_and_scale(np.array([1, 2]), scale=10, mode='do') == np.array([-0.1, 0])) + assert shift_and_scale(np.zeros(1), shift=10, scale=10, mode='undo') == 10 + assert np.all(shift_and_scale(np.array([7e5,7e5+1,7e5+2]), scale=10, mode='do') == \ + np.array([-0.2, -0.1, 0])) + tmp = np.array([1, 12, 3]) + # Check that this is indeed circular + assert np.all(shift_and_scale(shift_and_scale(tmp, scale=11, mode='do'), + scale=11, mode='undo', shift=np.nanmax(tmp)) == tmp) + +def test_minmax_scale(): """ Test the minmax_scaling function, inculding the descaling mode. """ # Generate random values, making sure they span a large enough range @@ -29,9 +42,8 @@ def test_minmax_scaling(): # Add a nan for good measure vals[0] = np.nan - out = minmax_scaling(vals, min_range=1000, mode='scale') - deout = minmax_scaling(out, min_range=1000, mode='descale', min_val=np.nanmin(vals), - max_val=np.nanmax(vals)) + out = minmax_scale(vals, mode='do') + deout = minmax_scale(out, min_val=np.nanmin(vals), max_val=np.nanmax(vals), mode='undo') # Check conversion is between 0 and 1 assert np.nanmax(out) <= 1 @@ -41,65 +53,83 @@ def test_minmax_scaling(): # Check NaNs stay NaNs assert np.all(np.isnan(deout[np.isnan(vals)])) - # Repeat, this time with a min-range - vals = np.array([100, 200, 400]) - - out = minmax_scaling(vals, min_range=1000, mode='scale') - deout = minmax_scaling(out, min_range=1000, mode='descale', min_val=np.nanmin(vals), - max_val=np.nanmax(vals)) - - # Make sure we are between 0 and 1 - assert np.nanmax(out) <= 1 - assert np.nanmin(out) >= 0 - # Go further and make sure we are actually much smaller than that - assert np.round(np.nanmax(out)-np.nanmin(out), 1) == 0.3 - # Check that all 
gets properly descaled as well - assert np.all(np.round(deout[~np.isnan(vals)], 5) == np.round(vals[~np.isnan(vals)], 5)) - -def test_step_scaling(): - """ Test the step scaling function. """ +def test_step_scale(): + """ Test the step scale function. """ steps = [10, 20] scales = [10, 100, 1000] vals = np.array([-1, 5, 15, 25, 35]) - out = step_scaling(vals, steps=steps, scales=scales, mode='scale') + out = step_scale(vals, steps=steps, scales=scales, mode='do') assert np.all(out == np.array([-0.1, 0.5, 1.05, 1.105, 1.115])) - deout = step_scaling(out, steps=steps, scales=scales, mode='descale') + deout = step_scale(out, steps=steps, scales=scales, mode='undo') assert np.all(np.round(deout, 1) == vals) # Check the continuity condition vals = np.arange(0, 25000, 10) - out = step_scaling(vals, steps=[3000, 7000, 10000, 14000], scales=[100, 500, 50, 1000, 10], - mode='scale') + out = step_scale(vals, steps=[3000, 7000, 10000, 14000], scales=[100, 500, 50, 1000, 10], + mode='do') # Basic check to make sure we are always increasing assert np.all(np.diff(out) > 0) # Check for gaps by looking at the slopes # If a slope value were to appear only once, it would be a gap. 
assert np.all([np.count_nonzero(np.diff(out)==item) for item in np.unique(np.diff(out))]) # Check I can undo the scaling - deout = step_scaling(out, steps=[3000, 7000, 10000, 14000], scales=[100, 500, 50, 1000, 10], - mode='descale') + deout = step_scale(out, steps=[3000, 7000, 10000, 14000], scales=[100, 500, 50, 1000, 10], + mode='undo') assert np.all(np.round(deout, 1) == vals) -def test_scaling(): +def test_convert_kwargs(): + """ Test the convert_kwargs() function """ + + data = np.array([1, 2, 10]) + kwargs = {'min_range': 5} + + # 'min_range' is a "user" keyword + out = convert_kwargs(data, fct='minmax-scale', **kwargs) + assert 'min_range' not in out.keys() + assert 'min_val' in out.keys() + assert 'max_val' in out.keys() + + kwargs = {} + assert 'shift' in convert_kwargs(data, fct='shift-and-scale', **kwargs).keys() + + +def test_apply_scaling(): """ Test the umbrella scaling function, and its ability to correctly summon the different scaling functions. """ - out = scaling(np.ones(1), fct='const', scale=10, mode='scale') - assert out == 0.1 + out = apply_scaling(np.ones(1), fct='shift-and-scale', scale=10, mode='do') + assert out == 0 + + out = apply_scaling(np.array((10, 20)), fct='shift-and-scale', scale=10, shift=15, mode='do') + assert (out == np.array([-0.5, 0.5])).all() # Try to feed args in the wrong order ... 
- out = scaling(np.ones(1), fct='step', scales=[10, 100, 1000], - mode='scale', steps=[10, 20]) + out = apply_scaling(np.ones(1), fct='step-scale', scales=[10, 100, 1000], mode='do', + steps=[10, 20]) assert out == 0.1 # Try to feed only NaNs - out = scaling(np.ones(10)*np.nan, fct='minmax', mode='scale', min_range=1000) + out = apply_scaling(np.ones(10)*np.nan, fct='minmax-scale', mode='do', min_range=1000) assert np.all(np.isnan(out)) # Try when a single point is being fed - out = scaling(np.ones(1), fct='minmax', mode='scale', min_range=1000) + out = apply_scaling(np.ones(1), fct='minmax-scale', mode='do', min_range=1000) assert out == 0.5 + + # Try with a min-range + din = np.array([1,5,9]) + out = apply_scaling(din, fct='minmax-scale', min_range=10, mode='do') + assert np.all(out == np.array([0.1, 0.5, 0.9])) + + # Make sure undoing the minmax-scale without enough info crashes properly. + with raises(AmpycloudError): + apply_scaling(out, fct='minmax-scale', mode='undo') + + # Now actually try the undoing + (min_val, max_val) = minrange2minmax(din, min_range=10) + reout = apply_scaling(out, fct='minmax-scale', mode='undo', min_val=min_val, max_val=max_val) + assert np.all(reout == din) diff --git a/test/ampycloud/test_scientific_stability.py b/test/ampycloud/test_scientific_stability.py index b672ad0..6e9d20d 100644 --- a/test/ampycloud/test_scientific_stability.py +++ b/test/ampycloud/test_scientific_stability.py @@ -11,6 +11,7 @@ # Import from Python from pathlib import Path import pickle +import pandas as pd # Import from ampycloud import ampycloud @@ -50,6 +51,9 @@ def test_scientific_stability(mpls : str, do_sciplots: bool) -> None: with open(ref_data_file, 'rb') as f: data = pickle.load(f) + # Fix the dtype of the ceilo column + data.loc[:, 'ceilo'] = data.loc[:, 'ceilo'].astype(pd.StringDtype()) + # Get other useful info geoloc = ref_data_file.stem.split('_')[0] ref_dt = ref_data_file.stem.split('_')[1].replace('-', ' ') diff --git 
a/test/ampycloud/utils/test_utils.py b/test/ampycloud/utils/test_utils.py index c8db1c9..08e65ee 100644 --- a/test/ampycloud/utils/test_utils.py +++ b/test/ampycloud/utils/test_utils.py @@ -9,10 +9,85 @@ """ # Import form Python +from pytest import raises, warns import numpy as np +import pandas as pd # Import from ampycloud -from ampycloud.utils.utils import tmp_seed +from ampycloud.utils.utils import check_data_consistency, tmp_seed +from ampycloud.utils.mocker import canonical_demo_data +from ampycloud.errors import AmpycloudError, AmpycloudWarning +from ampycloud import hardcoded + +def test_check_data_consistency(): + """ This routine tests the check_data_consistency method. """ + + # Uncomment the line below once pytst 7.0 can be pip-installed + #with pytest.does_not_warn(): + if True: + out = check_data_consistency(canonical_demo_data()) + + # Make sure the canonical demo data is perfectly compatible with the ampycloud requirements + assert np.all(out == canonical_demo_data()) + + # Now, let's check specific elements that should raise errors or warnings + with raises(AmpycloudError): + # Empty DataFrame + data = pd.DataFrame(columns=['ceilo', 'dt', 'alt', 'type']) + check_data_consistency(data) + with raises(AmpycloudError): + # Missing column + data = pd.DataFrame(np.array([['a', 1, 1]]), columns=['ceilo', 'alt', 'type']) + for col in ['ceilo', 'alt', 'type']: + data.loc[:, col] = data.loc[:,col].astype(hardcoded.REQ_DATA_COLS[col]) + check_data_consistency(data) + with warns(AmpycloudWarning): + # Bad data type + data = pd.DataFrame(np.array([['a', 0, 1, 1]]), columns=['ceilo', 'dt', 'alt', 'type']) + check_data_consistency(data) + with warns(AmpycloudWarning): + # Extra key + data = pd.DataFrame(np.array([['a', 0, 1, 1, 99]]), + columns=['ceilo', 'dt', 'alt', 'type', 'extra']) + for (col, tpe) in hardcoded.REQ_DATA_COLS.items(): + data.loc[:, col] = data.loc[:, col].astype(tpe) + check_data_consistency(data) + with warns(AmpycloudWarning): + # Negative 
alts + data = pd.DataFrame(np.array([['a', 0, -1, 1]]), + columns=['ceilo', 'dt', 'alt', 'type']) + for (col, tpe) in hardcoded.REQ_DATA_COLS.items(): + data.loc[:, col] = data.loc[:, col].astype(tpe) + check_data_consistency(data) + with warns(AmpycloudWarning): + # Type 0 should be NaN + data = pd.DataFrame(np.array([['a', 0, 1, 0]]), + columns=['ceilo', 'dt', 'alt', 'type']) + for (col, tpe) in hardcoded.REQ_DATA_COLS.items(): + data.loc[:, col] = data.loc[:, col].astype(tpe) + check_data_consistency(data) + with warns(AmpycloudWarning): + # Type 1 should not be NaN + data = pd.DataFrame(np.array([['a', 0, np.nan, 1]]), + columns=['ceilo', 'dt', 'alt', 'type']) + for (col, tpe) in hardcoded.REQ_DATA_COLS.items(): + data.loc[:, col] = data.loc[:, col].astype(tpe) + check_data_consistency(data) + with warns(AmpycloudWarning): + # Missing type 1 pts + data = pd.DataFrame(np.array([['a', 0, 1, 2]]), + columns=['ceilo', 'dt', 'alt', 'type']) + for (col, tpe) in hardcoded.REQ_DATA_COLS.items(): + data.loc[:, col] = data.loc[:, col].astype(tpe) + check_data_consistency(data) + with warns(AmpycloudWarning): + # Missing type 2 pts + data = pd.DataFrame(np.array([['a', 0, 1, 3]]), + columns=['ceilo', 'dt', 'alt', 'type']) + for (col, tpe) in hardcoded.REQ_DATA_COLS.items(): + data.loc[:, col] = data.loc[:, col].astype(tpe) + check_data_consistency(data) + def test_tmp_seed(): """ This routine tests the tmp_seed. """