release 0.6.1 (#111)
* fix deprecation in nafc metqa parser
* Add-convert-input-table (#103)
* add input_table input to configuration
* add another onset timestamp format
* use ruff for linter testing (#106)
* update dependencies
* update action
* run all ruff tests and report any failure right after
* fix nafc ruff check issues
* fix seabird imports
* add input-path-list (#108)
* add os path separator compatibility to both cli and configuration
* add test_version module
* fix version to 0.6.0
* add button to readme
* Add onset.xlsx format parser (#110)
* move daylight saving check to a checks module, add ambiguous input and timezone to both xlsx and csv outputs
* match onset parsers ambiguous_timestamp
* add makefile
* improve daylight saving check and add onset.csv tests
* fix batch tests to ignore onset files with daylight saving issue
* fix nerc get_platform_vocabulary with the latest changes to nerc api
* add star_oddi.DAT test file and fix timestamp parsing
* move back test_parsers.py to tests/
* fix test_file_registry.csv to ignore files with daylight saving issue
* fix daylight_saving_issue test
* make pme wiper compatible with pme parser (#113)
* add some pme wiper test data
* add pressure attributes to pme parser
* deprecate pme.minidot_* parsers to pme.*
* capture all header metadata from both wiper and minidot data
* fix new warning with star_oddi parser dayfirst missing input
* specify dayfirst for star_oddi sensors format
JessyBarrette authored Sep 24, 2024
1 parent 3fce6b2 commit 1edf61d
Showing 22 changed files with 840 additions and 79 deletions.
21 changes: 21 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,27 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## `development`

### Added

- Add compatibility with the PME wiper txt format.

### Fixed

- Fixed the warning about the missing `dayfirst=True` input in the `star_oddi` parser.
- Renamed the pme parsers by dropping the `minidot_` prefix; the new functions are
  `pme.txt`, `pme.txts`, and `pme.cat`. The old names remain available as deprecated
  placeholders (see the sketch below).
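A minimal sketch of the rename, assuming a local miniDOT export at the hypothetical path `minidot_sample.txt`:

```python
from ocean_data_parser.parsers import pme

# New, shorter parser name
ds = pme.txt("minidot_sample.txt")

# Deprecated placeholder kept for backwards compatibility
ds_old = pme.minidot_txt("minidot_sample.txt")
```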

## `0.6.1` - 2024-08-30

### Added

- Add `onset.xlsx` parser.
- Make `onset.xlsx` and `onset.csv` raise a `pytz.exceptions.AmbiguousTimeError`
  when jumps associated with daylight saving time changes are detected (see the
  example after this list).
- Add `star_oddi.DAT` CTD test file and fix timestamp format handling.
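A hedged sketch of working with the new error, assuming an Onset export at the hypothetical path `hobo_export.csv`:

```python
import pytz

from ocean_data_parser.parsers import onset

try:
    ds = onset.csv("hobo_export.csv")
except pytz.exceptions.AmbiguousTimeError:
    # Provide an explicit local timezone and let pandas infer the
    # ambiguous hour instead of raising.
    ds = onset.csv(
        "hobo_export.csv",
        timezone="Canada/Pacific",
        ambiguous_timestamps="infer",
    )
```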

## `0.6.0` - 2024-08-20

### Added
8 changes: 7 additions & 1 deletion README.md
@@ -8,7 +8,11 @@

[![Build documentation](https://github.com/cioos-siooc/ocean-data-parser/actions/workflows/deploy-docs.yaml/badge.svg)](https://github.com/cioos-siooc/ocean-data-parser/actions/workflows/deploy-docs.yaml)

-`ocean-data-parser` - a Python package for parsing oceanographic proprietary data formats to [xarray Dataset](https://docs.xarray.dev/en/stable/). Documentation [here](https://cioos-siooc.github.io/ocean-data-parser/).
+`ocean-data-parser` - a Python package for parsing proprietary oceanographic data formats to xarray Datasets[^1].

<div align="center">
<a href='https://cioos-siooc.github.io/ocean-data-parser/'><kbd> <br> See Full Documentation Here <br> </kbd></a>
</div>

## Installation

@@ -70,3 +74,5 @@ The `ocean-data-parser` can then be used within either a Python package, script
All contributions are welcome!

Please create a new [discussion](https://github.com/cioos-siooc/ocean-data-parser/discussions) or [issue](https://github.com/cioos-siooc/ocean-data-parser/issues) within the GitHub repository for any questions, ideas and suggestions.

[^1]: [Xarray package documentation](https://docs.xarray.dev/en/stable/index.html)
1 change: 1 addition & 0 deletions docs/user_guide/parsers/index.md
@@ -48,6 +48,7 @@ Ocean Data Parser includes the following data format parsers:
[ONSET](onset.md)

- [onset.csv](onset.md#ocean_data_parser.parsers.onset.csv)
- [onset.xlsx](onset.md#ocean_data_parser.parsers.onset.xlsx)

[PME](pme.md)

4 changes: 4 additions & 0 deletions makefile
@@ -0,0 +1,4 @@
lint:
	ruff format .
	ruff check --fix --select I .
	ruff check --fix .
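With this target in place, `make lint` runs `ruff format`, fixes import ordering via the isort rules (`--select I`), and then applies the remaining autofixable checks.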
6 changes: 3 additions & 3 deletions ocean_data_parser/metadata/nerc.py
@@ -24,15 +24,15 @@ def get_vocabulary_term(vocab: str, id: str) -> dict:
def get_platform_vocabulary(id: str) -> dict:
    result = get_vocabulary_term("C17", id)
    # Parse the json data in the definition field
-    attrs = json.loads(result["definition"]["@value"])["node"]
+    attrs = json.loads(result["skos:definition"]["@value"])["node"]
    return {
-        "platform_name": result["prefLabel"]["@value"],
+        "platform_name": result["skos:prefLabel"]["@value"],
        "platform_type": attrs.get("platformclass"),
        "country_of_origin": attrs.get("country"),
        "platform_owner": attrs.get("title"),
        "platform_id": id,
        "ices_platform_code": id,
        "wmo_platform_code": attrs.get("IMO"),
        "call_sign": attrs.get("callsign"),
-        "sdn_platform_urn": result["identifier"],
+        "sdn_platform_urn": result["dc:identifier"],
    }
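A short usage sketch of the updated function; the C17 platform code below is only an example:

```python
from ocean_data_parser.metadata.nerc import get_platform_vocabulary

# "18HU" is an example ICES/C17 platform code
attrs = get_platform_vocabulary("18HU")
print(attrs["platform_name"], attrs["sdn_platform_urn"])
```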
51 changes: 51 additions & 0 deletions ocean_data_parser/parsers/checks.py
@@ -0,0 +1,51 @@
import pandas as pd
from loguru import logger
from pytz.exceptions import AmbiguousTimeError


def check_daylight_saving(time: pd.Series, ambiguous: str = "raise"):
    """Check whether a daylight-saving jump is present in the time series.

    Args:
        time (pd.Series): time series
        ambiguous (str, optional): Similar to pandas.Series.tz_localize.
            Options:
            - "raise": raise when ambiguous dates are encountered (default)
            - anything else: warn when ambiguous dates are encountered.

    Raises:
        AmbiguousTimeError: If ambiguous="raise" and a daylight-saving jump
            is detected.
    """
    # A daylight-saving jump shows up as a time step one hour shorter (fall)
    # or one hour longer (spring) than the median sampling interval.
    dt = time.diff()
    sampling_interval = dt.median()
    dst_fall = -pd.Timedelta("1h") + sampling_interval
    dst_spring = pd.Timedelta("1h") + sampling_interval

    error_message = []
    if any(dt == dst_fall):
        error_message += [
            f"Time gaps (={dst_fall}) for sampling interval of {sampling_interval} "
            "suggest a Fall daylight saving issue is present"
        ]
    if any(dt == dst_spring):
        error_message += [
            f"Time gaps (={dst_spring}) for sampling interval of {sampling_interval} "
            "suggest a Spring daylight saving issue is present"
        ]

    # Handle detected issues according to `ambiguous`
    if not error_message:
        return
    if ambiguous == "raise":
        error_message += [
            "To fix this issue, set ambiguous='warn' or provide "
            "a local timezone (e.g. 'Canada/Pacific')"
        ]
        raise AmbiguousTimeError("\n".join(error_message))
    logger.warning("\n".join(error_message))
144 changes: 111 additions & 33 deletions ocean_data_parser/parsers/onset.py
@@ -3,8 +3,8 @@
data loggers and sensors for environmental monitoring.
Their Hobo data loggers are widely used for monitoring water
quality parameters such as temperature, conductivity, and light
-intensity. The present module provides a parser for the CSV files
-generated by the HOBOware software.
+intensity. The present module provides parsers for the different
+data formats generated by the HOBOware and HOBOconnect software.
"""

import logging
@@ -15,9 +15,16 @@
import pandas as pd
import xarray

from ocean_data_parser.parsers.checks import check_daylight_saving
from ocean_data_parser.parsers.utils import standardize_dataset

GLOBAL_ATTRIBUTES = {"Convention": "CF-1.6"}
GLOBAL_ATTRIBUTES = {"instrument_manufacturer": "Onset", "Convention": "CF-1.6"}

TIMEZONE_MAPPING = {
    "GMT": "UTC",
    "PDT": "US/Pacific",
    "PST": "US/Pacific",
}

logger = logging.getLogger(__name__)
VARIABLE_NAME_MAPPING = {
@@ -89,7 +96,6 @@ def _get_time_format(time):
def _parse_onset_csv_header(header_lines):
    full_header = "\n".join(header_lines)
    header = {
-        "instrument_manufacturer": "Onset",
        "history": "",
        "timezone": re.search(r"GMT\s*([\-\+\d\:]*)", full_header),
        "plot_title": re.search(r"Plot Title\: (\w*),+", full_header),
@@ -155,16 +161,20 @@ def csv(
    standardize_variable_names: bool = True,
    encoding: str = "UTF-8",
    errors: str = "strict",
    timezone: str = None,
    ambiguous_timestamps: str = "raise",
) -> xarray.Dataset:
"""Parses the Onset CSV format generate by HOBOware into a xarray object
Inputs:
Args:
path: The path to the CSV file
convert_units_to_si: Whether to standardize data units to SI units
standardize_variable_names: Rename the variable names a standardize name
convention
convention
encoding: File encoding. Defaults to "utf-8"
errors: Error handling. Defaults to "strict"
timezone: Timezone to localize the time variable, overwrites the timezone in header
ambiguous_timestamps: How to handle ambiguous time stamps. Defaults to "raise"
Returns:
xarray.Dataset
"""
@@ -217,8 +227,10 @@ def csv(
df["Date Time"] = df["Date Time"].apply(
lambda x: pd.to_datetime(x, format=_get_time_format(x))
)

df["Date Time"] = df["Date Time"].dt.tz_localize(header["timezone"])
df["Date Time"] = df["Date Time"].dt.tz_localize(
timezone or header["timezone"], ambiguous=ambiguous_timestamps
)
check_daylight_saving(df["Date Time"], ambiguous_timestamps)

    # Convert to dataset
    ds = df.to_xarray()
@@ -257,31 +269,6 @@ def csv(
"Unit conversion is not supported if standardize_variable_names=False"
)

-    # Test daylight saving issue
-    # TODO move this daylight saving detection test elsewhere
-    dt = ds["time"].diff("index")
-    sampling_interval = dt.median().values
-    dst_fall = -pd.Timedelta("1h") + sampling_interval
-    dst_spring = pd.Timedelta("1h") + sampling_interval
-    if any(dt == dst_fall):
-        logger.warning(
-            (
-                "Time gaps (=%s) for sampling interval of %s "
-                "suggest a Fall daylight saving issue is present"
-            ),
-            dst_fall,
-            sampling_interval,
-        )
-    if any(dt == dst_spring):
-        logger.warning(
-            (
-                "Time gaps (=%s) for sampling interval of %s "
-                "suggest a Spring daylight saving issue is present"
-            ),
-            dst_fall,
-            sampling_interval,
-        )

    ds = standardize_dataset(ds)
    return ds

@@ -345,3 +332,94 @@ def _farenheit_to_celsius(farenheit):
        float: Temperature in Celsius
    """
    return (farenheit - 32.0) / 1.8000
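The helper implements the standard conversion °C = (°F - 32) / 1.8; a quick sanity check at the two reference points (not part of the module):

```python
assert (32.0 - 32.0) / 1.8 == 0.0               # freezing point: 32 °F -> 0 °C
assert round((212.0 - 32.0) / 1.8, 6) == 100.0  # boiling point: 212 °F -> 100 °C
```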


def xlsx(
    path: str, timezone: str = None, ambiguous_timestamps: str = "infer"
) -> xarray.Dataset:
    """Parses the Onset XLSX format generated by HOBOware into a xarray object

    Args:
        path: The path to the XLSX file
        timezone: Timezone to localize the time variable; overrides the
            timezone in the header
        ambiguous_timestamps: How to handle ambiguous timestamps. Defaults to "infer"

    Returns:
        xarray.Dataset
    """

    def _format_detail_key(key):
        """Format detail key to be more readable"""
        key = re.sub(r"\(.*\)", "", key)
        return (
            key.replace(" Info", "")
            .replace(" ", "_")
            .replace("-", "_")
            .lower()
            .replace("deployment_deployment", "deployment")
            .replace("device_device", "device")
            .replace("app_app", "app")
        )

    def _get_column_and_unit(column):
        """Split a column name from its unit given in parentheses"""
        column = column.split(" (")
        if len(column) == 1:
            return column[0], None
        return column[0], column[1].replace(")", "")

    # Read the different sheets from the xlsx file
    data = pd.read_excel(path, sheet_name="Data", engine="openpyxl")
    events = pd.read_excel(path, sheet_name="Events", engine="openpyxl")
    details = (
        pd.read_excel(
            path,
            sheet_name="Details",
            engine="openpyxl",
            names=["group", "subgroup", "parameter", "value"],
        )
        .ffill(axis=0)
        .dropna(subset=["parameter", "value"])
    )
    details_attrs = {
        _format_detail_key(f"{row['subgroup']}_{row['parameter']}"): row["value"]
        for _, row in details.iterrows()
        if row["group"] == "Devices"
    }

    variable_attributes = {}

    for var in data.columns:
        column, unit = _get_column_and_unit(var)
        column = _format_detail_key(column)
        if column == "#":
            column = "record_number"
        elif column == "date_time":
            column = "time"
        variable_attributes[column] = {
            "long_name": column,
            "units": unit,
            "original_name": var,
        }
    data.columns = variable_attributes.keys()

    if "time" not in data.columns:
        raise ValueError("Date Time column not found in header")
    file_timezone = variable_attributes["time"].pop("units", None)
    if file_timezone:
        file_timezone = TIMEZONE_MAPPING.get(file_timezone, file_timezone)

    # Convert to dataset
    data["time"] = (
        pd.to_datetime(data["time"], errors="coerce")
        .dt.tz_localize(timezone or file_timezone, ambiguous=ambiguous_timestamps)
        .dt.tz_convert("UTC")
    )
    check_daylight_saving(data["time"], ambiguous_timestamps)

    ds = data.to_xarray()
    for var in variable_attributes:
        ds[var].attrs = variable_attributes[var]
    ds.attrs = {**GLOBAL_ATTRIBUTES, "events": events.to_json(), **details_attrs}
    ds["instrument_type"] = _detect_instrument_type(ds)
    ds = standardize_dataset(ds)
    return ds
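And a usage sketch for the new parser, again with a hypothetical file path:

```python
from ocean_data_parser.parsers import onset

# "infer" (the default) lets pandas resolve DST-ambiguous local times
ds = onset.xlsx("hobo_export.xlsx", timezone="Canada/Pacific")
print(ds.attrs["instrument_manufacturer"])  # "Onset"
```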
