From 06cd592542fb76a1e33a3041490119da6ee3d479 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 18 Jan 2024 21:59:19 +0000 Subject: [PATCH 01/88] h5 --- cfdm/read_write/netcdf/netcdfread.py | 106 ++++++++++++++++++++++----- 1 file changed, 88 insertions(+), 18 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index fe43e867c..7ed24d1d5 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -490,10 +490,22 @@ def file_open(self, filename, flatten=True, verbose=None): >>> r.file_open('file.nc') """ + netCDF = False + HDF = False try: - nc = netCDF4.Dataset(filename, "r") - except RuntimeError as error: - raise RuntimeError(f"{error}: {filename}") + nc = h5netcdf.File(filename, "r") + except OSError: + # File is not HDF, so it's probably netCDF3. + try: + nc = netCDF4.Dataset(filename, "r") + except RuntimeError as error: + raise RuntimeError(f"{error}: {filename}") + else: + netCDF = True + except Exception as error: + raise Exception(f"{error}: {filename}") + else: + HDF = True # ------------------------------------------------------------ # If the file has a group structure then flatten it (CF>=1.8) @@ -501,6 +513,11 @@ def file_open(self, filename, flatten=True, verbose=None): g = self.read_vars if flatten and nc.groups: + if HDF: + raise ValueError( + "Can't yet access file with groups via h5netcdf" + ) + # Create a diskless, non-persistent container for the # flattened file flat_file = tempfile.NamedTemporaryFile( @@ -532,6 +549,8 @@ def file_open(self, filename, flatten=True, verbose=None): g["has_groups"] = True g["flat_files"].append(flat_file) + g["netCDF"] = netCDF + g["HDF"] = HDF g["nc"] = nc return nc @@ -999,9 +1018,9 @@ def read( # 'global_attributes' dictionary # ---------------------------------------------------------------- global_attributes = {} - for attr in map(str, nc.ncattrs()): +# for attr in map(str,nc.ncattrs()): + for attr, value in self._file_global_attributes().items(): try: - value = nc.getncattr(attr) if isinstance(value, str): try: global_attributes[attr] = str(value) @@ -1157,7 +1176,8 @@ def read( group_attr = x[-1] flattener_attributes.setdefault(tuple(groups), {})[ group_attr - ] = nc.getncattr(flat_attr) + ] = self._file_variable(flat_attr) +# ] = nc.getncattr(flat_attr) # Remove flattener attributes from the global attributes for attr in ( @@ -1167,13 +1187,14 @@ def read( ): g["global_attributes"].pop(attr, None) - for ncvar in nc.variables: + for ncvar in self._file_variables(): ncvar_basename = ncvar groups = () group_attributes = {} - variable = nc.variables[ncvar] - +# variable = nc.variables[ncvar] + variable = self._file_variable(ncvar) + # -------------------------------------------------------- # Specify the group structure for each variable (CF>=1.8) # TODO @@ -1239,7 +1260,8 @@ def read( except UnicodeDecodeError: pass - variable_dimensions[ncvar] = tuple(variable.dimensions) +# variable_dimensions[ncvar] = tuple(variable.dimensions) + variable_dimensions[ncvar] = tuple(self._file_variable_dimensions()) variable_dataset[ncvar] = nc variable_filename[ncvar] = g["filename"] variables[ncvar] = variable @@ -1250,7 +1272,8 @@ def read( # Populate dimensions_groups abd dimension_basename # dictionaries - for ncdim in nc.dimensions: +# for ncdim in nc.dimensions: + for ncdim in self._file_dimensions(): ncdim_org = ncdim ncdim_basename = ncdim groups = () @@ -1275,9 +1298,10 @@ def read( dimension_groups[ncdim] = groups dimension_basename[ncdim] = 
ncdim_basename - dimension_isunlimited[ncdim] = nc.dimensions[ - ncdim_org - ].isunlimited() +# dimension_isunlimited[ncdim] = nc.dimensions[ +# ncdim_org +# ].isunlimited() + dimension_isunlimited[ncdim] = self._file_dimension_isunlimited(ncdim_org) if has_groups: variable_dimensions = { @@ -1325,7 +1349,8 @@ def read( # The netCDF dimensions of the parent file internal_dimension_sizes = {} - for name, dimension in nc.dimensions.items(): +# for name, dimension in nc.dimensions.items(): + for name, dimension in self._file_dimensions().items(): if ( has_groups and dimension_isunlimited[flattener_dimensions[name]] @@ -1334,10 +1359,10 @@ def read( # size from the original grouped dataset, because # unlimited dimensions have size 0 in the flattened # dataset (because it contains no data) (v1.8.8.1) - group, ncdim = self._netCDF4_group( + group, ncdim = self._netCDF4_group( # h5netcdf g["nc_grouped"], flattener_dimensions[name] ) - internal_dimension_sizes[name] = group.dimensions[ncdim].size + internal_dimension_sizes[name] = group.dimensions[ncdim].size # h5netcdf else: internal_dimension_sizes[name] = dimension.size @@ -2479,7 +2504,8 @@ def _parse_geometry(self, parent_ncvar, attributes): # variable in this case. # -------------------------------------------------------- nodes_per_geometry = self.implementation.initialise_Count() - size = g["nc"].dimensions[node_dimension].size +# size = g["nc"].dimensions[node_dimension].size + size = self._file_dimension_size(node_dimension) ones = self.implementation.initialise_Data( array=np.ones((size,), dtype="int32"), copy=False ) @@ -9881,3 +9907,47 @@ def _ugrid_check_connectivity_variable( ok = False return ok + + def _file_global_attributes(self): + g = self.read_vars + nc = g['nc'] + if g['netCDF']: + # NetCDF + return {attr: nc.getncattr(attr) for attr in nc.ncattrs()} + + # HDF + return nc.attrs + + def _file_dimensions(self, var): + g = self.read_vars + return g['nc'].dimensions + + def _file_dimension(self, dim_name): + return self._file_dimensions()[dim_name] + + def _file_dimension_isunlimited(self, dim_name): + return self._file_dimensions()[dim_name].isunlimted() + + def _file_dimension_size(self, dim_name): + return self._file_dimensions()[dim_name].size + + def _file_variables(self): + g = self.read_vars + return g['nc'].variables + + def _file_variable(self, var_name): + return self._file_variables()[var_name] + + def _file_variable_attributes(self, var): + g = self.read_vars + if g['netCDF']: + # NetCDF + return {attr: var.getncattr(attr) for attr in var.ncattrs()} + + # HDF + return var.attrs + + def _file_variable_dimensions(self, var): + return var.dimensions + + From 20c3886a43f4e4d69816fcc30e8904fae758da6a Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 19 Jan 2024 12:16:12 +0000 Subject: [PATCH 02/88] h5 --- cfdm/__init__.py | 1 + cfdm/cfdmimplementation.py | 59 ++++++++++++++++ cfdm/data/__init__.py | 1 + cfdm/mixin/propertiesdata.py | 3 +- cfdm/read_write/netcdf/netcdfread.py | 102 ++++++++++++++++++--------- 5 files changed, 130 insertions(+), 36 deletions(-) diff --git a/cfdm/__init__.py b/cfdm/__init__.py index ca5d289d3..5b210d04f 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -139,6 +139,7 @@ CompressedArray, Data, GatheredArray, + HDFArray, NetCDFArray, NumpyArray, PointTopologyArray, diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index 6658b1761..041ce7433 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -30,6 +30,7 @@ CellConnectivityArray, Data, 
GatheredArray, + HDFArray, NetCDFArray, PointTopologyArray, RaggedContiguousArray, @@ -2353,6 +2354,62 @@ def initialise_NetCDFArray( missing_values=missing_values, ) + def initialise_HDFArray( + self, + filename=None, + address=None, + dtype=None, + shape=None, + mask=True, + units=False, + calendar=None, + missing_values=None, + ): + """Return a HDF array instance. + + :Parameters: + + filename: `str` + + address: `str` + + dytpe: `numpy.dtype` + + shape: sequence of `int`, optional + + mask: `bool`, optional + + units: `str` or `None` or False, optional + The units of the variable. Set to `None` to indicate + that there are no units. If False (the default) then + the units are considered unset. + + calendar: `str` or `None`, optional + The calendar of the variable. By default, or if set to + `None`, then the CF default calendar is assumed, if + applicable. + + missing_values: `dict`, optional + The missing value indicators defined by the variable + attributes. + + :Returns: + + `HDFArray` + + """ + cls = self.get_class("HDFArray") + return cls( + filename=filename, + address=address, + dtype=dtype, + shape=shape, + mask=mask, + units=units, + calendar=calendar, + missing_values=missing_values, + ) + def initialise_BoundsFromNodesArray(self, **kwargs): """Return a node bounds array. @@ -3707,6 +3764,7 @@ def squeeze(self, construct, axes=None): Data=Data, BoundsFromNodesArray=BoundsFromNodesArray, GatheredArray=GatheredArray, + HDFArray=HDFArray, NetCDFArray=NetCDFArray, PointTopologyArray=PointTopologyArray, RaggedContiguousArray=RaggedContiguousArray, @@ -3750,6 +3808,7 @@ def implementation(): 'Datum': , 'Data': , 'GatheredArray': , + 'HDFArray': , 'NetCDFArray': , 'PointTopologyArray': , 'RaggedContiguousArray': , diff --git a/cfdm/data/__init__.py b/cfdm/data/__init__.py index 23641aa37..98e0b0a2e 100644 --- a/cfdm/data/__init__.py +++ b/cfdm/data/__init__.py @@ -18,6 +18,7 @@ from .boundsfromnodesarray import BoundsFromNodesArray from .cellconnectivityarray import CellConnectivityArray from .gatheredarray import GatheredArray +from .hdfarray import HDFArray from .netcdfarray import NetCDFArray from .numpyarray import NumpyArray from .pointtopologyarray import PointTopologyArray diff --git a/cfdm/mixin/propertiesdata.py b/cfdm/mixin/propertiesdata.py index 625c7d7f8..83a7913e5 100644 --- a/cfdm/mixin/propertiesdata.py +++ b/cfdm/mixin/propertiesdata.py @@ -100,7 +100,8 @@ def __str__(self): if units is None: isreftime = bool(self.get_property("calendar", False)) else: - isreftime = "since" in units + print (type(units), repr(units)) + isreftime = "since" in str(units) if isreftime: units += " " + self.get_property("calendar", "") diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 7ed24d1d5..327dff374 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -9,12 +9,13 @@ from copy import deepcopy from dataclasses import dataclass, field from functools import reduce -from math import nan +from math import nan, prod from typing import Any from urllib.parse import urlparse from uuid import uuid4 import netCDF4 +import h5netcdf import netcdf_flattener import numpy as np from packaging.version import Version @@ -494,18 +495,17 @@ def file_open(self, filename, flatten=True, verbose=None): HDF = False try: nc = h5netcdf.File(filename, "r") - except OSError: + HDF = True + except OSError: # File is not HDF, so it's probably netCDF3. 
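+            # (Assumed behaviour: h5netcdf/h5py raise OSError when
+            # the file is not HDF5, and netCDF4 can also open
+            # netCDF-3 classic files, so it is used as the fallback
+            # reader.)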
try: + print (1/0) nc = netCDF4.Dataset(filename, "r") + netCDF = True except RuntimeError as error: raise RuntimeError(f"{error}: {filename}") - else: - netCDF = True except Exception as error: raise Exception(f"{error}: {filename}") - else: - HDF = True # ------------------------------------------------------------ # If the file has a group structure then flatten it (CF>=1.8) @@ -514,10 +514,19 @@ def file_open(self, filename, flatten=True, verbose=None): if flatten and nc.groups: if HDF: - raise ValueError( - "Can't yet access file with groups via h5netcdf" - ) - + # TODOHDF: Can't yet use HDF access to process groups + logger.warning( + "WARNING: Using netCDF4 (rather than h5netcdf) " + f"to access file {filename} containing groups" + ) # pragma: no cover + nc.close() + HDF = False + try: + nc = netCDF4.Dataset(filename, "r") + netCDF = True + except RuntimeError as error: + raise RuntimeError(f"{error}: {filename}") + # Create a diskless, non-persistent container for the # flattened file flat_file = tempfile.NamedTemporaryFile( @@ -549,6 +558,12 @@ def file_open(self, filename, flatten=True, verbose=None): g["has_groups"] = True g["flat_files"].append(flat_file) + + if HDF: + print ("Opened with h5netcdf") + else: + print ("Opened with netCDF4") + g["netCDF"] = netCDF g["HDF"] = HDF g["nc"] = nc @@ -1176,7 +1191,7 @@ def read( group_attr = x[-1] flattener_attributes.setdefault(tuple(groups), {})[ group_attr - ] = self._file_variable(flat_attr) + ] = self._file_global_attribute(flat_attr) # ] = nc.getncattr(flat_attr) # Remove flattener attributes from the global attributes @@ -1243,25 +1258,21 @@ def read( variable_grouped_dataset[ncvar] = g["nc_grouped"] variable_attributes[ncvar] = {} - for attr in map(str, variable.ncattrs()): +# for attr in map(str, variable.ncattrs()): + for attr, value in self._file_variable_attributes(variable).items(): try: - variable_attributes[ncvar][attr] = variable.getncattr(attr) - if isinstance(variable_attributes[ncvar][attr], str): + if isinstance(value, str): try: - variable_attributes[ncvar][attr] = str( - variable_attributes[ncvar][attr] - ) + value = str(value) except UnicodeEncodeError: - variable_attributes[ncvar][ - attr - ] = variable_attributes[ncvar][attr].encode( - errors="ignore" - ) + value = value.encode(errors="ignore") except UnicodeDecodeError: pass + variable_attributes[ncvar][attr] = value + # variable_dimensions[ncvar] = tuple(variable.dimensions) - variable_dimensions[ncvar] = tuple(self._file_variable_dimensions()) + variable_dimensions[ncvar] = tuple(self._file_variable_dimensions(variable)) variable_dataset[ncvar] = nc variable_filename[ncvar] = g["filename"] variables[ncvar] = variable @@ -3484,7 +3495,6 @@ def _create_field_or_domain( ) # Set unlimited status of axis - # if nc.dimensions[ncdim].isunlimited(): if g["dimension_isunlimited"][ncdim]: self.implementation.nc_set_unlimited_axis(f, axis) @@ -3510,7 +3520,6 @@ def _create_field_or_domain( # Set unlimited status of axis try: - # if nc.dimensions[ncdim].isunlimited(): if g["dimension_isunlimited"][ncdim]: self.implementation.nc_set_unlimited_axis(f, axis) except KeyError: @@ -5975,7 +5984,7 @@ def _create_netcdfarray( group, name = self._netCDF4_group( g["variable_grouped_dataset"][ncvar], ncvar ) - variable = group.variables.get(name) + variable = group.variables.get(name) # h5netcdf else: variable = g["variables"].get(ncvar) @@ -5994,7 +6003,8 @@ def _create_netcdfarray( ndim = variable.ndim shape = variable.shape - size = variable.size +# size = variable.size + size = 
self._file_variable_size(variable) if size < 2: size = int(size) @@ -6058,7 +6068,11 @@ def _create_netcdfarray( if return_kwargs_only: return kwargs - array = self.implementation.initialise_NetCDFArray(**kwargs) + if g['netCDF']: + array = self.implementation.initialise_NetCDFArray(**kwargs) + else: + # HDF + array = self.implementation.initialise_HDFArray(**kwargs) return array, kwargs @@ -9918,7 +9932,7 @@ def _file_global_attributes(self): # HDF return nc.attrs - def _file_dimensions(self, var): + def _file_dimensions(self): g = self.read_vars return g['nc'].dimensions @@ -9926,28 +9940,46 @@ def _file_dimension(self, dim_name): return self._file_dimensions()[dim_name] def _file_dimension_isunlimited(self, dim_name): - return self._file_dimensions()[dim_name].isunlimted() + return self._file_dimension(dim_name).isunlimited() def _file_dimension_size(self, dim_name): - return self._file_dimensions()[dim_name].size + return self._file_dimensions(dim_name).size def _file_variables(self): + """ """ g = self.read_vars return g['nc'].variables def _file_variable(self, var_name): return self._file_variables()[var_name] - def _file_variable_attributes(self, var): + def _file_variable_attributes(self, var, names_only=False): g = self.read_vars + if not names_only: + if g['netCDF']: + # NetCDF + return {attr: var.getncattr(attr) for attr in var.ncattrs()} + + # HDF + return var.attrs + if g['netCDF']: # NetCDF - return {attr: var.getncattr(attr) for attr in var.ncattrs()} - + return var.ncattrs() + # HDF - return var.attrs + return list(var.attrs) def _file_variable_dimensions(self, var): return var.dimensions + def _file_variable_size(self, var): + g = self.read_vars + if g['netCDF']: + # NetCDF + return var.size + + # HDF + return prod(var.shape) + From 41e75ceec0f361aabb9a82c43e6947a7ff197063 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 19 Jan 2024 12:16:26 +0000 Subject: [PATCH 03/88] h5 --- cfdm/data/hdfarray.py | 810 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 810 insertions(+) create mode 100644 cfdm/data/hdfarray.py diff --git a/cfdm/data/hdfarray.py b/cfdm/data/hdfarray.py new file mode 100644 index 000000000..0a3ae4b5b --- /dev/null +++ b/cfdm/data/hdfarray.py @@ -0,0 +1,810 @@ +import h5netcdf +import netCDF4 +import numpy as np + +from . import abstract +from .mixin import FileArrayMixin +from .numpyarray import NumpyArray + +_safecast = netCDF4.utils._safecast + +class HDFArray(FileArrayMixin, abstract.Array): + """An underlying array stored in an HDF file. + + .. versionadded:: (cfdm) TODOHDF + + """ + + def __init__( + self, + filename=None, + address=None, + dtype=None, + shape=None, + mask=True, + units=False, + calendar=False, + missing_values=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of) `str`, optional + The name of the file(s) containing the array. + + address: (sequence of) `str` or `int`, optional + The identity of the variable in each file defined by + *filename*. Either a netCDF variable name or an + integer HDF variable ID. + + dtype: `numpy.dtype` + The data type of the array in the file. May be `None` + if the numpy data-type is not known (which can be the + case for string types, for example). + + shape: `tuple` + The array dimension sizes in the file. + + size: `int` + Number of elements in the array in the file. + + ndim: `int` + The number of array dimensions in the file. + + mask: `bool` + If True (the default) then mask by convention when + reading data from disk. 
+ + A netCDF array is masked depending on the values of any of + the netCDF variable attributes ``valid_min``, + ``valid_max``, ``valid_range``, ``_FillValue`` and + ``missing_value``. + + units: `str` or `None`, optional + The units of the variable. Set to `None` to indicate + that there are no units. If unset then the units will + be set during the first `__getitem__` call. + + calendar: `str` or `None`, optional + The calendar of the variable. By default, or if set to + `None`, then the CF default calendar is assumed, if + applicable. If unset then the calendar will be set + during the first `__getitem__` call. + + missing_values: `dict`, optional + The missing value indicators defined by the variable + attributes. See `get_missing_values` for details. + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__(source=source, copy=copy) + + if source is not None: + try: + shape = source._get_component("shape", None) + except AttributeError: + shape = None + + try: + filename = source._get_component("filename", None) + except AttributeError: + filename = None + + try: + address = source._get_component("address", None) + except AttributeError: + address = None + + try: + dtype = source._get_component("dtype", None) + except AttributeError: + dtype = None + + try: + mask = source._get_component("mask", True) + except AttributeError: + mask = True + + try: + units = source._get_component("units", False) + except AttributeError: + units = False + + try: + calendar = source._get_component("calendar", False) + except AttributeError: + calendar = False + + try: + missing_values = source._get_component("missing_values", None) + except AttributeError: + missing_values = None + + if shape is not None: + self._set_component("shape", shape, copy=False) + + if filename is not None: + if isinstance(filename, str): + filename = (filename,) + else: + filename = tuple(filename) + + self._set_component("filename", filename, copy=False) + + if address is not None: + if isinstance(address, (str, int)): + address = (address,) + else: + address = tuple(address) + + self._set_component("address", address, copy=False) + + if missing_values is not None: + self._set_component( + "missing_values", missing_values.copy(), copy=False + ) + + self._set_component("dtype", dtype, copy=False) + self._set_component("mask", mask, copy=False) + self._set_component("units", units, copy=False) + self._set_component("calendar", calendar, copy=False) + + # By default, close the file after data array access + self._set_component("close", True, copy=False) + + def __getitem__(self, indices): + """Returns a subspace of the array as a numpy array. + + x.__getitem__(indices) <==> x[indices] + + The indices that define the subspace must be either `Ellipsis` or + a sequence that contains an index for each dimension. In the + latter case, each dimension's index must either be a `slice` + object or a sequence of two or more integers. + + Indexing is similar to numpy indexing. The only difference to + numpy indexing (given the restrictions on the type of indices + allowed) is: + + * When two or more dimension's indices are sequences of integers + then these indices work independently along each dimension + (similar to the way vector subscripts work in Fortran). + + .. versionadded:: (cfdm) 1.7.0 + + """ + dataset, address = self.open() + dataset0 = dataset + + mask = self.get_mask() + groups, address = self.get_groups(address) + + if groups: + # Traverse the group structure, if there is one (CF>=1.8). 
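            # For example, an address of '/data/model/tas' yields
            # groups=['data', 'model'] and address='tas' (see
            # `get_groups`), so we descend into
            # dataset.groups['data'].groups['model'] before looking up
            # the variable.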
+ for g in groups[:-1]: + dataset = dataset.groups[g] # h5netcdf + + dataset = dataset.groups[groups[-1]]# h5netcdf + + if isinstance(address, str): + # Get the variable by netCDF name + variable = dataset.variables[address] + self.variable = variable +# variable.set_auto_mask(mask) # h5netcdf + array = variable[indices] +# array = self.mask_unpack(variable, array) + else: + # Get the variable by netCDF integer ID + for variable in dataset.variables.values(): + if variable._varid == address: + variable.set_auto_mask(mask) + array = variable[indices] + break + + # Set the units, if they haven't been set already. + self._set_units(variable) + + del self.variable + self.close(dataset0) + del dataset, dataset0 + + string_type = isinstance(array, str) + if string_type: + # -------------------------------------------------------- + # A netCDF string type scalar variable comes out as Python + # str object, so convert it to a numpy array. + # -------------------------------------------------------- + array = np.array(array, dtype=f"U{len(array)}") + + if not self.ndim: + # Hmm netCDF4 has a thing for making scalar size 1, 1d + array = array.squeeze() + + kind = array.dtype.kind + if not string_type and kind in "SU": + # == 'S' and array.ndim > (self.ndim - + # getattr(self, 'gathered', 0) - + # getattr(self, 'ragged', 0)): + # -------------------------------------------------------- + # Collapse (by concatenation) the outermost (fastest + # varying) dimension of char array into + # memory. E.g. [['a','b','c']] becomes ['abc'] + # -------------------------------------------------------- + if kind == "U": + array = array.astype("S", copy=False) + + array = netCDF4.chartostring(array) + shape = array.shape + array = np.array([x.rstrip() for x in array.flat], dtype="U") + array = np.reshape(array, shape) + array = np.ma.masked_where(array == "", array) + elif not string_type and kind == "O": + # -------------------------------------------------------- + # A netCDF string type N-d (N>=1) variable comes out as a + # numpy object array, so convert it to numpy string array. + # -------------------------------------------------------- + array = array.astype("U", copy=False) + + # -------------------------------------------------------- + # netCDF4 does not auto-mask VLEN variable, so do it here. + # -------------------------------------------------------- + array = np.ma.where(array == "", np.ma.masked, array) + + return array + + def __repr__(self): + """Called by the `repr` built-in function. + + x.__repr__() <==> repr(x) + + """ + return f"<{self.__class__.__name__}{self.shape}: {self}>" + + def __str__(self): + """Called by the `str` built-in function. + + x.__str__() <==> str(x) + + """ + return f"{self.get_filename(None)}, {self.get_address()}" + + def _set_units(self, var): + """The units and calendar properties. + + These are set from the netCDF variable attributes, but only if + they have already not been defined, either during {{class}} + instantiation or by a previous call to `_set_units`. + + .. versionadded:: (cfdm) 1.10.0.1 + + :Parameters: + + var: `netCDF4.Variable` + The variable containing the units and calendar + definitions. + + :Returns: + + `tuple` + The units and calendar values, either of which may be + `None`. + + """ + # Note: Can't use None as the default since it is a valid + # `units` or 'calendar' value that indicates that the + # attribute has not been set in the dataset. 
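        # (False means "not yet read from the file"; None means "the
        # file defines no such attribute".)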
+ units = self._get_component("units", False) + if units is False: + try: + units = var.getncattr("units") + except AttributeError: + units = None + + self._set_component("units", units, copy=False) + + calendar = self._get_component("calendar", False) + if calendar is False: + try: + calendar = var.getncattr("calendar") + except AttributeError: + calendar = None + + self._set_component("calendar", calendar, copy=False) + + return units, calendar + + @property + def array(self): + """Return an independent numpy array containing the data. + + .. versionadded:: (cfdm) 1.7.0 + + :Returns: + + `numpy.ndarray` + An independent numpy array of the data. + + **Examples** + + >>> n = numpy.asanyarray(a) + >>> isinstance(n, numpy.ndarray) + True + + """ + return self[...] + + def get_format(self): + """The format of the files. + + .. versionadded:: (cfdm) 1.10.1.0 + + .. seealso:: `get_address`, `get_filename`, `get_formats` + + :Returns: + + `str` + The file format. Always ``'nc'``, signifying netCDF. + + **Examples** + + >>> a.get_format() + 'nc' + + """ + return "nc" + + def get_groups(self, address): + """The netCDF4 group structure of a netCDF variable. + + .. versionadded:: (cfdm) 1.8.6.0 + + :Parameters: + + address: `str` or `int` + The netCDF variable name, or integer varid, from which + to get the groups. + + .. versionadded:: (cfdm) 1.10.1.0 + + :Returns: + + (`list`, `str`) or (`list`, `int`) + The group structure and the name within the group. If + *address* is a varid then an empty list and the varid + are returned. + + **Examples** + + >>> n.get_groups('tas') + ([], 'tas') + + >>> n.get_groups('/tas') + ([], 'tas') + + >>> n.get_groups('/data/model/tas') + (['data', 'model'], 'tas') + + >>> n.get_groups(9) + ([], 9) + + """ + try: + if "/" not in address: + return [], address + except TypeError: + return [], address + + out = address.split("/")[1:] + return out[:-1], out[-1] + + def get_mask(self): + """Whether or not to automatically mask the data. + + .. versionadded:: (cfdm) 1.8.2 + + **Examples** + + >>> b = a.get_mask() + + """ + return self._get_component("mask") + + def get_missing_values(self): + """The missing value indicators from the netCDF variable. + + .. versionadded:: (cfdm) 1.10.0.3 + + :Returns: + + `dict` or `None` + The missing value indicators from the netCDF variable, + keyed by their netCDF attribute names. An empty + dictionary signifies that no missing values are given + in the file. `None` signifies that the missing values + have not been set. + + **Examples** + + >>> a.get_missing_values() + None + + >>> b.get_missing_values() + {} + + >>> c.get_missing_values() + {'missing_value': 1e20, 'valid_range': (-10, 20)} + + >>> d.get_missing_values() + {'valid_min': -999} + + """ + out = self._get_component("missing_values", None) + if out is None: + return + + return out.copy() + + def close(self, dataset): + """Close the dataset containing the data. + + .. versionadded:: (cfdm) 1.7.0 + + :Parameters: + + dataset: `netCDF4.Dataset` + The netCDF dataset to be be closed. + + :Returns: + + `None` + + """ + if self._get_component("close"): + dataset.close() + + def open(self, **kwargs): + """Return an open file object containing the data array. + + When multiple files have been provided an attempt is made to + open each one, in the order stored, and an open file object is + returned from the first file that exists. + + :Returns: + + (`netCDF4.Dataset`, `str`) + The open file object, and the address of the data + within the file. 
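
        **Examples**

        (Illustrative only: assumes an instance ``a`` created for a
        file containing a netCDF variable called ``tas``.)

        >>> dataset, address = a.open()
        >>> address
        'tas'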
+ + """ + return super().open(h5netcdf.File, mode="r", **kwargs) + + def _check_safecast(self, attname): + # check to see that variable attribute exists + # can can be safely cast to variable data type. + msg="""WARNING: %s not used since it cannot be safely cast to variable data type""" % attname + attrs = variable.attrs + if attname in attrs: + attvalue = self.variable.attrs[attname] + att = np.array(attvalue) + setattr(self, attname, attvalue) + else: + return False + try: + atta = np.array(att, self.dtype) + except ValueError: + is_safe = False + warnings.warn(msg) + return is_safe + is_safe = _safecast(att,atta) + if not is_safe: + warnings.warn(msg) + return is_safe + +# def mask_and_scale(self, data): +# """TODOHDF""" +# self.scale = True # h5netcdf +# attrs = self.variable.attrs +# self._Unsigned = attrs.get('_Unsigned', 'false') +# +# # if attribute _Unsigned is "true", and variable has signed integer +# # dtype, return view with corresponding unsigned dtype (issues #656, +# # #794) +# # _Unsigned attribute must be "true" or "True" (string). Issue #1232. +# is_unsigned = getattr(self, '_Unsigned', False) in ["True","true"] +# is_unsigned_int = is_unsigned and data.dtype.kind == 'i' +# if self.scale and is_unsigned_int: # only do this if autoscale option is on. +# dtype_unsigned_int='%su%s' % (data.dtype.byteorder,data.dtype.itemsize) +# data = data.view(dtype_unsigned_int) +# +# # private function for creating a masked array, masking missing_values +# # and/or _FillValues. +# totalmask = np.zeros(data.shape, np.bool_) +# fill_value = None +# safe_missval = self._check_safecast('missing_value') +# if safe_missval: +# mval = numpy.array(self.missing_value, self.dtype) +# if self.scale and is_unsigned_int: +# mval = mval.view(dtype_unsigned_int) +# +# # create mask from missing values. +# mvalmask = np.zeros(data.shape, numpy.bool_) +# if mval.shape == (): # mval a scalar. +# mval = [mval] # make into iterable. +# +# for m in mval: +# # is scalar missing value a NaN? +# try: +# mvalisnan = numpy.isnan(m) +# except TypeError: # isnan fails on some dtypes (issue 206) +# mvalisnan = False +# +# if mvalisnan: +# mvalmask += numpy.isnan(data) +# else: +# mvalmask += data==m +# +# if mvalmask.any(): +# # set fill_value for masked array +# # to missing_value (or 1st element +# # if missing_value is a vector). +# fill_value = mval[0] +# totalmask += mvalmask +# +# # set mask=True for data == fill value +# safe_fillval = self._check_safecast('_FillValue') +# if safe_fillval: +# fval = np.array(self._FillValue, self.dtype) +# if self.scale and is_unsigned_int: +# fval = fval.view(dtype_unsigned_int) +# +# # is _FillValue a NaN? +# try: +# fvalisnan = np.isnan(fval) +# except: # isnan fails on some dtypes (issue 202) +# fvalisnan = False +# +# if fvalisnan: +# mask = np.isnan(data) +# elif (data == fval).any(): +# mask = data==fval +# else: +# mask = None +# +# if mask is not None: +# if fill_value is None: +# fill_value = fval +# +# totalmask += mask +# # issue 209: don't return masked array if variable filling +# # is disabled. +# else: +# if __netcdf4libversion__ < '4.5.1' and\ +# self._grp.file_format.startswith('NETCDF3'): +# # issue #908: no_fill not correct for NETCDF3 files before 4.5.1 +# # before 4.5.1 there was no way to turn off filling on a +# # per-variable basis for classic files. 
+# no_fill=0 +# else: +# with nogil: +# ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) +# _ensure_nc_success(ierr) +# # if no_fill is not 1, and not a byte variable, then use default fill value. +# # from http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c/Fill-Values.html#Fill-Values +# # "If you need a fill value for a byte variable, it is recommended +# # that you explicitly define an appropriate _FillValue attribute, as +# # generic utilities such as ncdump will not assume a default fill +# # value for byte variables." +# # Explained here too: +# # http://www.unidata.ucar.edu/software/netcdf/docs/known_problems.html#ncdump_ubyte_fill +# # "There should be no default fill values when reading any byte +# # type, signed or unsigned, because the byte ranges are too +# # small to assume one of the values should appear as a missing +# # value unless a _FillValue attribute is set explicitly." +# # (do this only for non-vlens, since vlens don't have a default _FillValue) +# if not self._isvlen and (no_fill != 1 or self.dtype.str[1:] not in ['u1','i1']): +# fillval = np.array(default_fillvals[self.dtype.str[1:]],self.dtype) +# has_fillval = data == fillval +# # if data is an array scalar, has_fillval will be a boolean. +# # in that case convert to an array. +# if type(has_fillval) == bool: +# has_fillval = np.asarray(has_fillval) +# +# if has_fillval.any(): +# if fill_value is None: +# fill_value = fillval +# +# mask = data == fillval +# totalmask += mask +# # set mask=True for data outside valid_min,valid_max. +# # (issue #576) +# validmin = None; +# validmax = None +# # if valid_range exists use that, otherwise +# # look for valid_min, valid_max. No special +# # treatment of byte data as described at +# # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). +# safe_validrange = self._check_safecast('valid_range') +# safe_validmin = self._check_safecast('valid_min') +# safe_validmax = self._check_safecast('valid_max') +# if safe_validrange and self.valid_range.size == 2: +# validmin = np.array(self.valid_range[0], self.dtype) +# validmax = np.array(self.valid_range[1], self.dtype) +# else: +# if safe_validmin: +# validmin = np.array(self.valid_min, self.dtype) +# +# if safe_validmax: +# validmax = numpy.array(self.valid_max, self.dtype) +# if validmin is not None and self.scale and is_unsigned_int: +# validmin = validmin.view(dtype_unsigned_int) +# +# if validmax is not None and self.scale and is_unsigned_int: +# validmax = validmax.view(dtype_unsigned_int) +# # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). +# # "If the data type is byte and _FillValue +# # is not explicitly defined, +# # then the valid range should include all possible values. +# # Otherwise, the valid range should exclude the _FillValue +# # (whether defined explicitly or by default) as follows. +# # If the _FillValue is positive then it defines a valid maximum, +# # otherwise it defines a valid minimum." +# byte_type = self.dtype.str[1:] in ['u1','i1'] +# if safe_fillval: +# fval = np.array(self._FillValue, self.dtype) +# else: +# fval = np.array(default_fillvals[self.dtype.str[1:]],self.dtype) +# if byte_type: +# fval = None +# +# if self.dtype.kind != 'S': # don't set mask for character data +# # issues #761 and #748: setting valid_min/valid_max to the +# # _FillVaue is too surprising for many users (despite the +# # netcdf docs attribute best practices suggesting clients +# # should do this). 
+# #if validmin is None and (fval is not None and fval <= 0): +# # validmin = fval +# #if validmax is None and (fval is not None and fval > 0): +# # validmax = fval +# if validmin is not None: +# totalmask += data < validmin +# if validmax is not None: +# totalmask += data > validmax +# +# if fill_value is None and fval is not None: +# fill_value = fval +# # if all else fails, use default _FillValue as fill_value +# # for masked array. +# +# if fill_value is None: +# fill_value = default_fillvals[self.dtype.str[1:]] +# +# # create masked array with computed mask +# masked_values = bool(totalmask.any()) +# if masked_values: +# data = np.ma.masked_array(data, mask=totalmask,fill_value=fill_value) +# else: +# # issue #785: always return masked array, if no values masked +# data = np.ma.masked_array(data) +# +# # issue 515 scalar array with mask=True should be converted +# # to numpy.ma.MaskedConstant to be consistent with slicing +# # behavior of masked arrays. +# if data.shape == () and data.mask.all(): +# # return a scalar numpy masked constant not a 0-d masked array, +# # so that data == numpy.ma.masked. +# data = data[()] # changed from [...] (issue #662) +# +# elif not self.always_mask and not masked_values: +# # issue #809: return a regular numpy array if requested +# # and there are no missing values +# data = np.array(data, copy=False) +# +# # --------------------------- +# # Now scale +# # --------------------------- +# if self.scale and\ +# (self._isprimitive or (self._isvlen and self.dtype != str)) and\ +# valid_scaleoffset: +# # if variable has scale_factor and add_offset attributes, apply +# # them. +# if hasattr(self, 'scale_factor') and hasattr(self, 'add_offset'): +# if self.add_offset != 0.0 or self.scale_factor != 1.0: +# data = data*self.scale_factor + self.add_offset +# else: +# data = data.astype(self.scale_factor.dtype) # issue 913 +# # else if variable has only scale_factor attribute, rescale. +# elif hasattr(self, 'scale_factor') and self.scale_factor != 1.0: +# data = data*self.scale_factor +# # else if variable has only add_offset attribute, add offset. +# elif hasattr(self, 'add_offset') and self.add_offset != 0.0: +# data = data +# +# return data + + def mask_unpack(self, variable, array): + """TODOHDF""" + mu = _Mask_Unpack(variable) + array = mu.mask(array) + array = mu.unpack(array) + return data + + def to_memory(self): + """Bring data on disk into memory. + + .. versionadded:: (cfdm) 1.7.0 + + :Returns: + + `NumpyArray` + The new with all of its data in memory. + + """ + return NumpyArray(self[...]) + + +default_fillvals = netCDF4.default_fillvals +_private_atts = [] + +class _Mask_Unpack(netCDF4.Variable): + """TODOHDF""" + def __init__(self, variable): + self.__dict__['_isprimitive'] = True # h5netcdf + self.__dict__['_isvlen'] = False # h5netcdf + self.__dict__['_isenum'] = False # h5netcdf + self.__dict__['dtype'] = variable.dtype + + attrs = variable.attrs + print (attrs) + self.__dict__['attrs'] = attrs + self.__dict__['_Unsigned'] = 'false' + for attr in ('_FillValue', 'add_offset', 'scale', '_Unsigned', 'valid_max', 'valid_min', 'valid_range', ): + if attr in attrs: + self.__dict__[attr] = attrs[attr] + + def getncattr(self, name, encoding='utf-8'): + """Retrieve a netCDF4 attribute.""" + return self.attrs[name] + + def mask(self, data): + """TODOHDF""" + return self._toma(data) + + def unpack(self, data): + """Unpack non-masked values using scale_factor and add_offset. 
+ + """ + if not scale: + return data + + try: # check to see if scale_factor and add_offset is valid (issue 176). + if hasattr(self,'scale_factor'): float(self.scale_factor) + if hasattr(self,'add_offset'): float(self.add_offset) + valid_scaleoffset = True + except: + valid_scaleoffset = False + if self.scale: + msg = 'invalid scale_factor or add_offset attribute, no unpacking done...' + # warnings.warn(msg) # h5netcdf + + if self.scale and\ + (self._isprimitive or (self._isvlen and self.dtype != str)) and\ + valid_scaleoffset: + # if variable has scale_factor and add_offset attributes, apply + # them. + if hasattr(self, 'scale_factor') and hasattr(self, 'add_offset'): + if self.add_offset != 0.0 or self.scale_factor != 1.0: + data = data*self.scale_factor + self.add_offset + else: + data = data.astype(self.scale_factor.dtype) # issue 913 + # else if variable has only scale_factor attribute, rescale. + elif hasattr(self, 'scale_factor') and self.scale_factor != 1.0: + data = data*self.scale_factor + # else if variable has only add_offset attribute, add offset. + elif hasattr(self, 'add_offset') and self.add_offset != 0.0: + data = data + self.add_offset From 581ad89526e7d94d64495a6459a6a3dc627423a8 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 19 Jan 2024 17:34:22 +0000 Subject: [PATCH 04/88] h5 --- cfdm/data/hdfarray.py | 464 +++++++++++++++++++++--------------------- 1 file changed, 237 insertions(+), 227 deletions(-) diff --git a/cfdm/data/hdfarray.py b/cfdm/data/hdfarray.py index 0a3ae4b5b..bf0c7cbcb 100644 --- a/cfdm/data/hdfarray.py +++ b/cfdm/data/hdfarray.py @@ -485,9 +485,7 @@ def open(self, **kwargs): return super().open(h5netcdf.File, mode="r", **kwargs) def _check_safecast(self, attname): - # check to see that variable attribute exists - # can can be safely cast to variable data type. - msg="""WARNING: %s not used since it cannot be safely cast to variable data type""" % attname + """Check to see that variable attribute exists can can be safely cast to variable data type.""" attrs = variable.attrs if attname in attrs: attvalue = self.variable.attrs[attname] @@ -495,237 +493,249 @@ def _check_safecast(self, attname): setattr(self, attname, attvalue) else: return False + + is_safe = True try: atta = np.array(att, self.dtype) except ValueError: is_safe = False - warnings.warn(msg) - return is_safe - is_safe = _safecast(att,atta) + else: + is_safe = _safecast(att, atta) + if not is_safe: - warnings.warn(msg) + logger.warn( + f"WARNING: {attname} not used since it cannot " + "be safely cast to variable data type" + ) # pragma: no cover + return is_safe -# def mask_and_scale(self, data): -# """TODOHDF""" -# self.scale = True # h5netcdf -# attrs = self.variable.attrs -# self._Unsigned = attrs.get('_Unsigned', 'false') -# -# # if attribute _Unsigned is "true", and variable has signed integer -# # dtype, return view with corresponding unsigned dtype (issues #656, -# # #794) -# # _Unsigned attribute must be "true" or "True" (string). Issue #1232. -# is_unsigned = getattr(self, '_Unsigned', False) in ["True","true"] -# is_unsigned_int = is_unsigned and data.dtype.kind == 'i' -# if self.scale and is_unsigned_int: # only do this if autoscale option is on. -# dtype_unsigned_int='%su%s' % (data.dtype.byteorder,data.dtype.itemsize) -# data = data.view(dtype_unsigned_int) -# -# # private function for creating a masked array, masking missing_values -# # and/or _FillValues. 
-# totalmask = np.zeros(data.shape, np.bool_) -# fill_value = None -# safe_missval = self._check_safecast('missing_value') -# if safe_missval: -# mval = numpy.array(self.missing_value, self.dtype) -# if self.scale and is_unsigned_int: -# mval = mval.view(dtype_unsigned_int) -# -# # create mask from missing values. -# mvalmask = np.zeros(data.shape, numpy.bool_) -# if mval.shape == (): # mval a scalar. -# mval = [mval] # make into iterable. -# -# for m in mval: -# # is scalar missing value a NaN? -# try: -# mvalisnan = numpy.isnan(m) -# except TypeError: # isnan fails on some dtypes (issue 206) -# mvalisnan = False -# -# if mvalisnan: -# mvalmask += numpy.isnan(data) -# else: -# mvalmask += data==m -# -# if mvalmask.any(): -# # set fill_value for masked array -# # to missing_value (or 1st element -# # if missing_value is a vector). -# fill_value = mval[0] -# totalmask += mvalmask -# -# # set mask=True for data == fill value -# safe_fillval = self._check_safecast('_FillValue') -# if safe_fillval: -# fval = np.array(self._FillValue, self.dtype) -# if self.scale and is_unsigned_int: -# fval = fval.view(dtype_unsigned_int) -# -# # is _FillValue a NaN? -# try: -# fvalisnan = np.isnan(fval) -# except: # isnan fails on some dtypes (issue 202) -# fvalisnan = False -# -# if fvalisnan: -# mask = np.isnan(data) -# elif (data == fval).any(): -# mask = data==fval -# else: -# mask = None -# -# if mask is not None: -# if fill_value is None: -# fill_value = fval -# -# totalmask += mask -# # issue 209: don't return masked array if variable filling -# # is disabled. -# else: -# if __netcdf4libversion__ < '4.5.1' and\ -# self._grp.file_format.startswith('NETCDF3'): -# # issue #908: no_fill not correct for NETCDF3 files before 4.5.1 -# # before 4.5.1 there was no way to turn off filling on a -# # per-variable basis for classic files. -# no_fill=0 -# else: -# with nogil: -# ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) -# _ensure_nc_success(ierr) -# # if no_fill is not 1, and not a byte variable, then use default fill value. -# # from http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c/Fill-Values.html#Fill-Values -# # "If you need a fill value for a byte variable, it is recommended -# # that you explicitly define an appropriate _FillValue attribute, as -# # generic utilities such as ncdump will not assume a default fill -# # value for byte variables." -# # Explained here too: -# # http://www.unidata.ucar.edu/software/netcdf/docs/known_problems.html#ncdump_ubyte_fill -# # "There should be no default fill values when reading any byte -# # type, signed or unsigned, because the byte ranges are too -# # small to assume one of the values should appear as a missing -# # value unless a _FillValue attribute is set explicitly." -# # (do this only for non-vlens, since vlens don't have a default _FillValue) -# if not self._isvlen and (no_fill != 1 or self.dtype.str[1:] not in ['u1','i1']): -# fillval = np.array(default_fillvals[self.dtype.str[1:]],self.dtype) -# has_fillval = data == fillval -# # if data is an array scalar, has_fillval will be a boolean. -# # in that case convert to an array. -# if type(has_fillval) == bool: -# has_fillval = np.asarray(has_fillval) -# -# if has_fillval.any(): -# if fill_value is None: -# fill_value = fillval -# -# mask = data == fillval -# totalmask += mask -# # set mask=True for data outside valid_min,valid_max. -# # (issue #576) -# validmin = None; -# validmax = None -# # if valid_range exists use that, otherwise -# # look for valid_min, valid_max. 
No special -# # treatment of byte data as described at -# # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). -# safe_validrange = self._check_safecast('valid_range') -# safe_validmin = self._check_safecast('valid_min') -# safe_validmax = self._check_safecast('valid_max') -# if safe_validrange and self.valid_range.size == 2: -# validmin = np.array(self.valid_range[0], self.dtype) -# validmax = np.array(self.valid_range[1], self.dtype) -# else: -# if safe_validmin: -# validmin = np.array(self.valid_min, self.dtype) -# -# if safe_validmax: -# validmax = numpy.array(self.valid_max, self.dtype) -# if validmin is not None and self.scale and is_unsigned_int: -# validmin = validmin.view(dtype_unsigned_int) -# -# if validmax is not None and self.scale and is_unsigned_int: -# validmax = validmax.view(dtype_unsigned_int) -# # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). -# # "If the data type is byte and _FillValue -# # is not explicitly defined, -# # then the valid range should include all possible values. -# # Otherwise, the valid range should exclude the _FillValue -# # (whether defined explicitly or by default) as follows. -# # If the _FillValue is positive then it defines a valid maximum, -# # otherwise it defines a valid minimum." -# byte_type = self.dtype.str[1:] in ['u1','i1'] -# if safe_fillval: -# fval = np.array(self._FillValue, self.dtype) -# else: -# fval = np.array(default_fillvals[self.dtype.str[1:]],self.dtype) -# if byte_type: -# fval = None -# -# if self.dtype.kind != 'S': # don't set mask for character data -# # issues #761 and #748: setting valid_min/valid_max to the -# # _FillVaue is too surprising for many users (despite the -# # netcdf docs attribute best practices suggesting clients -# # should do this). -# #if validmin is None and (fval is not None and fval <= 0): -# # validmin = fval -# #if validmax is None and (fval is not None and fval > 0): -# # validmax = fval -# if validmin is not None: -# totalmask += data < validmin -# if validmax is not None: -# totalmask += data > validmax -# -# if fill_value is None and fval is not None: -# fill_value = fval -# # if all else fails, use default _FillValue as fill_value -# # for masked array. -# -# if fill_value is None: -# fill_value = default_fillvals[self.dtype.str[1:]] -# -# # create masked array with computed mask -# masked_values = bool(totalmask.any()) -# if masked_values: -# data = np.ma.masked_array(data, mask=totalmask,fill_value=fill_value) -# else: -# # issue #785: always return masked array, if no values masked -# data = np.ma.masked_array(data) -# -# # issue 515 scalar array with mask=True should be converted -# # to numpy.ma.MaskedConstant to be consistent with slicing -# # behavior of masked arrays. -# if data.shape == () and data.mask.all(): -# # return a scalar numpy masked constant not a 0-d masked array, -# # so that data == numpy.ma.masked. -# data = data[()] # changed from [...] (issue #662) -# -# elif not self.always_mask and not masked_values: -# # issue #809: return a regular numpy array if requested -# # and there are no missing values -# data = np.array(data, copy=False) -# -# # --------------------------- -# # Now scale -# # --------------------------- -# if self.scale and\ -# (self._isprimitive or (self._isvlen and self.dtype != str)) and\ -# valid_scaleoffset: -# # if variable has scale_factor and add_offset attributes, apply -# # them. 
-# if hasattr(self, 'scale_factor') and hasattr(self, 'add_offset'): -# if self.add_offset != 0.0 or self.scale_factor != 1.0: -# data = data*self.scale_factor + self.add_offset -# else: -# data = data.astype(self.scale_factor.dtype) # issue 913 -# # else if variable has only scale_factor attribute, rescale. -# elif hasattr(self, 'scale_factor') and self.scale_factor != 1.0: -# data = data*self.scale_factor -# # else if variable has only add_offset attribute, add offset. -# elif hasattr(self, 'add_offset') and self.add_offset != 0.0: -# data = data -# -# return data + def mask_and_scale(self, data): + """TODOHDF""" + self.scale = True # h5netcdf + attrs = self.variable.attrs + self._Unsigned = attrs.get('_Unsigned', 'false') + + # if attribute _Unsigned is "true", and variable has signed integer + # dtype, return view with corresponding unsigned dtype (issues #656, + # #794) + # _Unsigned attribute must be "true" or "True" (string). Issue #1232. + is_unsigned = getattr(self, '_Unsigned', False) in ["True","true"] + is_unsigned_int = is_unsigned and data.dtype.kind == 'i' + if self.scale and is_unsigned_int: + # only do this if autoscale option is on. + dtype_unsigned_int = f"{data.dtype.byteorder}u{data.dtype.itemsize}" + data = data.view(dtype_unsigned_int) + + # private function for creating a masked array, masking missing_values + # and/or _FillValues. + totalmask = np.zeros(data.shape, np.bool_) + fill_value = None + safe_missval = self._check_safecast('missing_value') + if safe_missval: + mval = np.array(self.missing_value, self.dtype) + if self.scale and is_unsigned_int: + mval = mval.view(dtype_unsigned_int) + + # create mask from missing values. + mvalmask = np.zeros(data.shape, np.bool_) + if mval.shape == (): # mval a scalar. + mval = [mval] # make into iterable. + + for m in mval: + # is scalar missing value a NaN? + try: + mvalisnan = np.isnan(m) + except TypeError: + # isnan fails on some dtypes + mvalisnan = False + + if mvalisnan: + mvalmask += numpy.isnan(data) + else: + mvalmask += data==m + + if mvalmask.any(): + # set fill_value for masked array to missing_value (or + # 1st element if missing_value is a vector). + fill_value = mval[0] + totalmask += mvalmask + + # set mask=True for data == fill value + safe_fillval = self._check_safecast('_FillValue') + if safe_fillval: + fval = np.array(self._FillValue, self.dtype) + if self.scale and is_unsigned_int: + fval = fval.view(dtype_unsigned_int) + + # is _FillValue a NaN? + try: + fvalisnan = np.isnan(fval) + except Exception: + # isnan fails on some dtypes + fvalisnan = False + + if fvalisnan: + mask = np.isnan(data) + elif (data == fval).any(): + mask = data==fval + else: + mask = None + + if mask is not None: + if fill_value is None: + fill_value = fval + + totalmask += mask + # issue 209: don't return masked array if variable filling + # is disabled. + else: + if __netcdf4libversion__ < '4.5.1' and\ + self._grp.file_format.startswith('NETCDF3'): + # issue #908: no_fill not correct for NETCDF3 files before 4.5.1 + # before 4.5.1 there was no way to turn off filling on a + # per-variable basis for classic files. + no_fill=0 + else: + with nogil: + ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) + _ensure_nc_success(ierr) + # if no_fill is not 1, and not a byte variable, then use + # default fill value. 
from + # http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c/Fill-Values.html#Fill-Values + # "If you need a fill value for a byte variable, it is + # recommended that you explicitly define an appropriate + # _FillValue attribute, as generic utilities such as + # ncdump will not assume a default fill value for byte + # variables." Explained here too: + # http://www.unidata.ucar.edu/software/netcdf/docs/known_problems.html#ncdump_ubyte_fill + # "There should be no default fill values when reading any + # byte type, signed or unsigned, because the byte ranges + # are too small to assume one of the values should appear + # as a missing value unless a _FillValue attribute is set + # explicitly." (do this only for non-vlens, since vlens + # don't have a default _FillValue) + if not self._isvlen and (no_fill != 1 or self.dtype.str[1:] not in ['u1','i1']): + fillval = np.array(default_fillvals[self.dtype.str[1:]],self.dtype) + has_fillval = data == fillval + # if data is an array scalar, has_fillval will be a boolean. + # in that case convert to an array. + if type(has_fillval) == bool: + has_fillval = np.asarray(has_fillval) + + if has_fillval.any(): + if fill_value is None: + fill_value = fillval + + mask = data == fillval + totalmask += mask + + # Set mask=True for data outside valid_min, valid_max. + validmin = None + validmax = None + # If valid_range exists use that, otherwise look for + # valid_min, valid_max. No special treatment of byte data as + # described at + # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). + safe_validrange = self._check_safecast('valid_range') + safe_validmin = self._check_safecast('valid_min') + safe_validmax = self._check_safecast('valid_max') + if safe_validrange and self.valid_range.size == 2: + validmin = np.array(self.valid_range[0], self.dtype) + validmax = np.array(self.valid_range[1], self.dtype) + else: + if safe_validmin: + validmin = np.array(self.valid_min, self.dtype) + + if safe_validmax: + validmax = numpy.array(self.valid_max, self.dtype) + + if validmin is not None and self.scale and is_unsigned_int: + validmin = validmin.view(dtype_unsigned_int) + + if validmax is not None and self.scale and is_unsigned_int: + validmax = validmax.view(dtype_unsigned_int) + + # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). + # "If the data type is byte and _FillValue is not explicitly + # defined, then the valid range should include all possible + # values. Otherwise, the valid range should exclude the + # _FillValue (whether defined explicitly or by default) as + # follows. If the _FillValue is positive then it defines a + # valid maximum, otherwise it defines a valid minimum." + byte_type = self.dtype.str[1:] in ['u1','i1'] + if safe_fillval: + fval = np.array(self._FillValue, self.dtype) + else: + fval = np.array(default_fillvals[self.dtype.str[1:]],self.dtype) + if byte_type: + fval = None + + if self.dtype.kind != 'S': + # Don't set mask for character data + + # Setting valid_min/valid_max to the _FillVaue is too + # surprising for many users (despite the netcdf docs + # attribute best practices suggesting clients should do + # this). + if validmin is not None: + totalmask += data < validmin + if validmax is not None: + totalmask += data > validmax + + if fill_value is None and fval is not None: + fill_value = fval + + # If all else fails, use default _FillValue as fill_value for + # masked array. 
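        # (For example, default_fillvals['f8'] is
        # 9.969209968386869e+36, netCDF's NC_FILL_DOUBLE value.)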
        if fill_value is None:
            fill_value = default_fillvals[self.dtype.str[1:]]

        # Create masked array with computed mask
        masked_values = bool(totalmask.any())
        if masked_values:
            data = np.ma.masked_array(data, mask=totalmask, fill_value=fill_value)
        else:
            # Always return masked array, if no values masked.
            data = np.ma.masked_array(data)

        # Scalar array with mask=True should be converted to
        # numpy.ma.MaskedConstant to be consistent with slicing
        # behavior of masked arrays.
        if data.shape == () and data.mask.all():
            # Return a scalar numpy masked constant not a 0-d masked
            # array, so that data == numpy.ma.masked.
            data = data[()]

        elif not self.always_mask and not masked_values:
            # Return a regular numpy array if requested and there are
            # no missing values
            data = np.array(data, copy=False)

        # ---------------------------
        # Now scale
        # ---------------------------
        # Check that any scale_factor and add_offset attributes are
        # usable, mirroring the equivalent check in `unpack`
        try:
            if hasattr(self, 'scale_factor'):
                float(self.scale_factor)

            if hasattr(self, 'add_offset'):
                float(self.add_offset)

            valid_scaleoffset = True
        except Exception:
            valid_scaleoffset = False

        if (valid_scaleoffset
            and self.scale
            and (self._isprimitive or (self._isvlen and self.dtype != str))
        ):
            # If variable has scale_factor and add_offset attributes,
            # apply them.
            if hasattr(self, 'scale_factor') and hasattr(self, 'add_offset'):
                if self.add_offset != 0.0 or self.scale_factor != 1.0:
                    data = data * self.scale_factor + self.add_offset
                else:
                    data = data.astype(self.scale_factor.dtype)
            elif hasattr(self, 'scale_factor') and self.scale_factor != 1.0:
                # If variable has only scale_factor attribute,
                # rescale.
                data = data * self.scale_factor
            elif hasattr(self, 'add_offset') and self.add_offset != 0.0:
                # If variable has only add_offset attribute, add
                # offset.
                data = data + self.add_offset

        return data

From 6d2f5f5dbed86ebae55edb48c8f03bf6bdb98e9d Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Mon, 22 Jan 2024 14:19:26 +0000
Subject: [PATCH 05/88] h5

---
 cfdm/read_write/netcdf/netcdfwrite.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py
index 81190d6ae..eb6196582 100644
--- a/cfdm/read_write/netcdf/netcdfwrite.py
+++ b/cfdm/read_write/netcdf/netcdfwrite.py
@@ -4459,6 +4459,7 @@ def file_open(self, filename, mode, fmt, fields):
             os.remove(filename)
 
         try:
+            netCDF4.set_chunk_cache(16 * 1024 * 1024)  # 16 MiB chunk cache
             nc = netCDF4.Dataset(filename, mode, format=fmt)
         except RuntimeError as error:
             raise RuntimeError(f"{error}: {filename}")

From 8492d21961dbc6effa0530938a9edce655ac61a8 Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Mon, 22 Jan 2024 23:12:20 +0000
Subject: [PATCH 06/88] dev

---
 cfdm/mixin/propertiesdata.py         |  1 -
 cfdm/read_write/netcdf/netcdfread.py | 25 +++++++++++--------------
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/cfdm/mixin/propertiesdata.py b/cfdm/mixin/propertiesdata.py
index 83a7913e5..dabf3914d 100644
--- a/cfdm/mixin/propertiesdata.py
+++ b/cfdm/mixin/propertiesdata.py
@@ -100,7 +100,6 @@ def __str__(self):
         if units is None:
             isreftime = bool(self.get_property("calendar", False))
         else:
-            print (type(units), repr(units))
             isreftime = "since" in str(units)
 
         if isreftime:

diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py
index 327dff374..b4743d2b0 100644
--- a/cfdm/read_write/netcdf/netcdfread.py
+++ b/cfdm/read_write/netcdf/netcdfread.py
@@ -559,10 +559,10 @@ def file_open(self, filename, flatten=True, verbose=None):
 
         g["flat_files"].append(flat_file)
 
-        if HDF:
-            print ("Opened with h5netcdf")
-        else:
-            print ("Opened with netCDF4")
+#        if HDF:
+#            print ("Opened with 
h5netcdf") +# else: +# print ("Opened with netCDF4") g["netCDF"] = netCDF g["HDF"] = HDF @@ -1035,6 +1035,7 @@ def read( global_attributes = {} # for attr in map(str,nc.ncattrs()): for attr, value in self._file_global_attributes().items(): + attr = str(attr) try: if isinstance(value, str): try: @@ -1260,17 +1261,13 @@ def read( variable_attributes[ncvar] = {} # for attr in map(str, variable.ncattrs()): for attr, value in self._file_variable_attributes(variable).items(): - try: - if isinstance(value, str): - try: - value = str(value) - except UnicodeEncodeError: - value = value.encode(errors="ignore") - except UnicodeDecodeError: - pass - + attr = str(attr) + if isinstance(value, bytes): + value = value.decode(errors="ignore") + variable_attributes[ncvar][attr] = value - +# print (attr, value, type(value)) + # variable_dimensions[ncvar] = tuple(variable.dimensions) variable_dimensions[ncvar] = tuple(self._file_variable_dimensions(variable)) variable_dataset[ncvar] = nc From 75a4c1c38b823f7b75f6d48e8306a16d98008267 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jan 2024 16:20:31 +0000 Subject: [PATCH 07/88] dev --- cfdm/data/hdfarray.py | 543 ++++++++------------------ cfdm/data/mixin/__init__.py | 1 + cfdm/data/mixin/filearraymixin.py | 13 +- cfdm/data/mixin/xxxmixin.py | 141 +++++++ cfdm/data/netcdfarray.py | 64 +-- cfdm/read_write/netcdf/netcdfread.py | 16 +- cfdm/read_write/netcdf/netcdfwrite.py | 2 +- requirements.txt | 2 + 8 files changed, 362 insertions(+), 420 deletions(-) create mode 100644 cfdm/data/mixin/xxxmixin.py diff --git a/cfdm/data/hdfarray.py b/cfdm/data/hdfarray.py index bf0c7cbcb..fbc151ead 100644 --- a/cfdm/data/hdfarray.py +++ b/cfdm/data/hdfarray.py @@ -1,14 +1,16 @@ import h5netcdf import netCDF4 + import numpy as np from . import abstract -from .mixin import FileArrayMixin +from .mixin import FileArrayMixin, XXXMixin from .numpyarray import NumpyArray _safecast = netCDF4.utils._safecast +default_fillvals = netCDF4.default_fillvals -class HDFArray(FileArrayMixin, abstract.Array): +class HDFArray(XXXMixin, FileArrayMixin, abstract.Array): """An underlying array stored in an HDF file. .. versionadded:: (cfdm) TODOHDF @@ -196,24 +198,30 @@ def __getitem__(self, indices): # Get the variable by netCDF name variable = dataset.variables[address] self.variable = variable -# variable.set_auto_mask(mask) # h5netcdf array = variable[indices] -# array = self.mask_unpack(variable, array) else: # Get the variable by netCDF integer ID for variable in dataset.variables.values(): if variable._varid == address: - variable.set_auto_mask(mask) array = variable[indices] break # Set the units, if they haven't been set already. 
self._set_units(variable) - del self.variable + if mask: + self.scale = True + self.always_mask = False + self._isvlen = variable.dtype == np.dtype('O') + print('V', self._isvlen) + if not self._isvlen: + array = self._mask(array) + array = self._scale(array) + self.close(dataset0) del dataset, dataset0 - + del self.variable + string_type = isinstance(array, str) if string_type: # -------------------------------------------------------- @@ -226,36 +234,7 @@ def __getitem__(self, indices): # Hmm netCDF4 has a thing for making scalar size 1, 1d array = array.squeeze() - kind = array.dtype.kind - if not string_type and kind in "SU": - # == 'S' and array.ndim > (self.ndim - - # getattr(self, 'gathered', 0) - - # getattr(self, 'ragged', 0)): - # -------------------------------------------------------- - # Collapse (by concatenation) the outermost (fastest - # varying) dimension of char array into - # memory. E.g. [['a','b','c']] becomes ['abc'] - # -------------------------------------------------------- - if kind == "U": - array = array.astype("S", copy=False) - - array = netCDF4.chartostring(array) - shape = array.shape - array = np.array([x.rstrip() for x in array.flat], dtype="U") - array = np.reshape(array, shape) - array = np.ma.masked_where(array == "", array) - elif not string_type and kind == "O": - # -------------------------------------------------------- - # A netCDF string type N-d (N>=1) variable comes out as a - # numpy object array, so convert it to numpy string array. - # -------------------------------------------------------- - array = array.astype("U", copy=False) - - # -------------------------------------------------------- - # netCDF4 does not auto-mask VLEN variable, so do it here. - # -------------------------------------------------------- - array = np.ma.where(array == "", np.ma.masked, array) - + array = self._process_string_and_char(array) return array def __repr__(self): @@ -274,221 +253,11 @@ def __str__(self): """ return f"{self.get_filename(None)}, {self.get_address()}" - def _set_units(self, var): - """The units and calendar properties. - - These are set from the netCDF variable attributes, but only if - they have already not been defined, either during {{class}} - instantiation or by a previous call to `_set_units`. - - .. versionadded:: (cfdm) 1.10.0.1 - - :Parameters: - - var: `netCDF4.Variable` - The variable containing the units and calendar - definitions. - - :Returns: - - `tuple` - The units and calendar values, either of which may be - `None`. - - """ - # Note: Can't use None as the default since it is a valid - # `units` or 'calendar' value that indicates that the - # attribute has not been set in the dataset. - units = self._get_component("units", False) - if units is False: - try: - units = var.getncattr("units") - except AttributeError: - units = None - - self._set_component("units", units, copy=False) - - calendar = self._get_component("calendar", False) - if calendar is False: - try: - calendar = var.getncattr("calendar") - except AttributeError: - calendar = None - - self._set_component("calendar", calendar, copy=False) - - return units, calendar - - @property - def array(self): - """Return an independent numpy array containing the data. - - .. versionadded:: (cfdm) 1.7.0 - - :Returns: - - `numpy.ndarray` - An independent numpy array of the data. - - **Examples** - - >>> n = numpy.asanyarray(a) - >>> isinstance(n, numpy.ndarray) - True - - """ - return self[...] - - def get_format(self): - """The format of the files. - - .. 
versionadded:: (cfdm) 1.10.1.0 - - .. seealso:: `get_address`, `get_filename`, `get_formats` - - :Returns: - - `str` - The file format. Always ``'nc'``, signifying netCDF. - - **Examples** - - >>> a.get_format() - 'nc' - - """ - return "nc" - - def get_groups(self, address): - """The netCDF4 group structure of a netCDF variable. - - .. versionadded:: (cfdm) 1.8.6.0 - - :Parameters: - - address: `str` or `int` - The netCDF variable name, or integer varid, from which - to get the groups. - - .. versionadded:: (cfdm) 1.10.1.0 - - :Returns: - - (`list`, `str`) or (`list`, `int`) - The group structure and the name within the group. If - *address* is a varid then an empty list and the varid - are returned. - - **Examples** - - >>> n.get_groups('tas') - ([], 'tas') - - >>> n.get_groups('/tas') - ([], 'tas') - - >>> n.get_groups('/data/model/tas') - (['data', 'model'], 'tas') - - >>> n.get_groups(9) - ([], 9) - - """ - try: - if "/" not in address: - return [], address - except TypeError: - return [], address - - out = address.split("/")[1:] - return out[:-1], out[-1] - - def get_mask(self): - """Whether or not to automatically mask the data. - - .. versionadded:: (cfdm) 1.8.2 - - **Examples** - - >>> b = a.get_mask() - - """ - return self._get_component("mask") - - def get_missing_values(self): - """The missing value indicators from the netCDF variable. - - .. versionadded:: (cfdm) 1.10.0.3 - - :Returns: - - `dict` or `None` - The missing value indicators from the netCDF variable, - keyed by their netCDF attribute names. An empty - dictionary signifies that no missing values are given - in the file. `None` signifies that the missing values - have not been set. - - **Examples** - - >>> a.get_missing_values() - None - - >>> b.get_missing_values() - {} - - >>> c.get_missing_values() - {'missing_value': 1e20, 'valid_range': (-10, 20)} - - >>> d.get_missing_values() - {'valid_min': -999} - - """ - out = self._get_component("missing_values", None) - if out is None: - return - - return out.copy() - - def close(self, dataset): - """Close the dataset containing the data. - - .. versionadded:: (cfdm) 1.7.0 - - :Parameters: - - dataset: `netCDF4.Dataset` - The netCDF dataset to be be closed. - - :Returns: - - `None` - - """ - if self._get_component("close"): - dataset.close() - - def open(self, **kwargs): - """Return an open file object containing the data array. - - When multiple files have been provided an attempt is made to - open each one, in the order stored, and an open file object is - returned from the first file that exists. - - :Returns: - - (`netCDF4.Dataset`, `str`) - The open file object, and the address of the data - within the file. - - """ - return super().open(h5netcdf.File, mode="r", **kwargs) - def _check_safecast(self, attname): """Check to see that variable attribute exists can can be safely cast to variable data type.""" - attrs = variable.attrs + attrs = self.variable.attrs if attname in attrs: - attvalue = self.variable.attrs[attname] + attvalue = attrs[attname] att = np.array(attvalue) setattr(self, attname, attvalue) else: @@ -510,21 +279,16 @@ def _check_safecast(self, attname): return is_safe - def mask_and_scale(self, data): + def _mask(self, data): """TODOHDF""" - self.scale = True # h5netcdf attrs = self.variable.attrs - self._Unsigned = attrs.get('_Unsigned', 'false') - - # if attribute _Unsigned is "true", and variable has signed integer - # dtype, return view with corresponding unsigned dtype (issues #656, - # #794) - # _Unsigned attribute must be "true" or "True" (string). 
Issue #1232. - is_unsigned = getattr(self, '_Unsigned', False) in ["True","true"] + is_unsigned = attrs.get('_Unsigned', False) in ("true", "True") is_unsigned_int = is_unsigned and data.dtype.kind == 'i' + + dtype = data.dtype if self.scale and is_unsigned_int: # only do this if autoscale option is on. - dtype_unsigned_int = f"{data.dtype.byteorder}u{data.dtype.itemsize}" + dtype_unsigned_int = f"{dtype.byteorder}u{dtype.itemsize}" data = data.view(dtype_unsigned_int) # private function for creating a masked array, masking missing_values @@ -540,7 +304,7 @@ def mask_and_scale(self, data): # create mask from missing values. mvalmask = np.zeros(data.shape, np.bool_) if mval.shape == (): # mval a scalar. - mval = [mval] # make into iterable. + mval = (mval,) # make into iterable. for m in mval: # is scalar missing value a NaN? @@ -551,9 +315,9 @@ def mask_and_scale(self, data): mvalisnan = False if mvalisnan: - mvalmask += numpy.isnan(data) + mvalmask += np.isnan(data) else: - mvalmask += data==m + mvalmask += data == m if mvalmask.any(): # set fill_value for masked array to missing_value (or @@ -590,16 +354,10 @@ def mask_and_scale(self, data): # issue 209: don't return masked array if variable filling # is disabled. else: - if __netcdf4libversion__ < '4.5.1' and\ - self._grp.file_format.startswith('NETCDF3'): - # issue #908: no_fill not correct for NETCDF3 files before 4.5.1 - # before 4.5.1 there was no way to turn off filling on a - # per-variable basis for classic files. - no_fill=0 - else: - with nogil: - ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) - _ensure_nc_success(ierr) + no_fill = 0 +# with nogil: +# ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) +# _ensure_nc_success(ierr) # if no_fill is not 1, and not a byte variable, then use # default fill value. from # http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c/Fill-Values.html#Fill-Values @@ -615,8 +373,8 @@ def mask_and_scale(self, data): # as a missing value unless a _FillValue attribute is set # explicitly." (do this only for non-vlens, since vlens # don't have a default _FillValue) - if not self._isvlen and (no_fill != 1 or self.dtype.str[1:] not in ['u1','i1']): - fillval = np.array(default_fillvals[self.dtype.str[1:]],self.dtype) + if not self._isvlen and (no_fill != 1 or dtype.str[1:] not in ('u1','i1')): + fillval = np.array(default_fillvals[dtype.str[1:]], dtype) has_fillval = data == fillval # if data is an array scalar, has_fillval will be a boolean. # in that case convert to an array. @@ -663,13 +421,16 @@ def mask_and_scale(self, data): # _FillValue (whether defined explicitly or by default) as # follows. If the _FillValue is positive then it defines a # valid maximum, otherwise it defines a valid minimum." - byte_type = self.dtype.str[1:] in ['u1','i1'] if safe_fillval: - fval = np.array(self._FillValue, self.dtype) + fval = np.array(self._FillValue, dtype) else: - fval = np.array(default_fillvals[self.dtype.str[1:]],self.dtype) - if byte_type: + k = dtype.str[1:] + print (k, default_fillvals, self._isvlen) + if k in ('u1','i1'): fval = None + else: + fval = np.array(default_fillvals[k], dtype) + if self.dtype.kind != 'S': # Don't set mask for character data @@ -680,6 +441,7 @@ def mask_and_scale(self, data): # this). if validmin is not None: totalmask += data < validmin + if validmax is not None: totalmask += data > validmax @@ -689,12 +451,12 @@ def mask_and_scale(self, data): # If all else fails, use default _FillValue as fill_value for # masked array. 
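The library defaults referred to here can be inspected directly; a small sketch of the lookup pattern used below, assuming a float64 array:

    import netCDF4
    import numpy as np

    data = np.array([1.0, 9.969209968386869e+36])

    # default_fillvals is keyed by dtype strings such as 'f8' or 'i4'
    fill = netCDF4.default_fillvals[data.dtype.str[1:]]

    # Values equal to the default fill value are treated as missing
    masked = np.ma.masked_equal(data, fill)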
if fill_value is None: - fill_value = default_fillvals[self.dtype.str[1:]] + fill_value = default_fillvals[dtype.str[1:]] # Create masked array with computed mask masked_values = bool(totalmask.any()) if masked_values: - data = np.ma.masked_array(data, mask=totalmask,fill_value=fill_value) + data = np.ma.masked_array(data, mask=totalmask, fill_value=fill_value) else: # Always return masked array, if no values masked. data = np.ma.masked_array(data) @@ -712,109 +474,142 @@ def mask_and_scale(self, data): # no missing values data = np.array(data, copy=False) - # --------------------------- - # Now scale - # --------------------------- - if (valid_scaleoffset - and self.scale - and (self._isprimitive or (self._isvlen and self.dtype != str)) - ): - # If variable has scale_factor and add_offset attributes, - # apply them. - if hasattr(self, 'scale_factor') and hasattr(self, 'add_offset'): - if self.add_offset != 0.0 or self.scale_factor != 1.0: - data = data * self.scale_factor + self.add_offset - else: - data = data.astype(self.scale_factor.dtype) - elif hasattr(self, 'scale_factor') and self.scale_factor != 1.0: - # If variable has only scale_factor attribute, - # rescale. - data = data * self.scale_factor - elif hasattr(self, 'add_offset') and self.add_offset != 0.0: - # If variable has only add_offset attribute, add - # offset. - data = data - return data - - def mask_unpack(self, variable, array): - """TODOHDF""" - mu = _Mask_Unpack(variable) - array = mu.mask(array) - array = mu.unpack(array) + + def _scale(self, data): + # If variable has scale_factor and add_offset attributes, + # apply them. + attrs = self.variable.attrs + scale_factor = attrs.get('scale_factor') + add_offset = attrs.get('add_offset') + try: + if scale_factor is not None: + float(scale_factor) + + if add_offset is not None: + float(add_offset) + except: + logging.warn( + "invalid scale_factor or add_offset attribute, " + "no unpacking done..." + ) + return data + + if scale_factor is not None and add_offset is not None: + if add_offset != 0.0 or scale_factor != 1.0: + data = data * scale_factor + add_offset + else: + data = data.astype(scale_factor.dtype) + elif scale_factor is not None and scale_factor != 1.0: + # If variable has only scale_factor attribute, rescale. + data = data * scale_factor + elif add_offset is not None and add_offset != 0.0: + # If variable has only add_offset attribute, add offset. + data = data + add_offset + return data - def to_memory(self): - """Bring data on disk into memory. + def _set_units(self, var): + """The units and calendar properties. - .. versionadded:: (cfdm) 1.7.0 + These are set from the netCDF variable attributes, but only if + they have already not been defined, either during {{class}} + instantiation or by a previous call to `_set_units`. + + .. versionadded:: (cfdm) 1.10.0.1 + + :Parameters: + + var: `netCDF4.Variable` + The variable containing the units and calendar + definitions. :Returns: - `NumpyArray` - The new with all of its data in memory. + `tuple` + The units and calendar values, either of which may be + `None`. 
""" - return NumpyArray(self[...]) - - -default_fillvals = netCDF4.default_fillvals -_private_atts = [] - -class _Mask_Unpack(netCDF4.Variable): - """TODOHDF""" - def __init__(self, variable): - self.__dict__['_isprimitive'] = True # h5netcdf - self.__dict__['_isvlen'] = False # h5netcdf - self.__dict__['_isenum'] = False # h5netcdf - self.__dict__['dtype'] = variable.dtype - - attrs = variable.attrs - print (attrs) - self.__dict__['attrs'] = attrs - self.__dict__['_Unsigned'] = 'false' - for attr in ('_FillValue', 'add_offset', 'scale', '_Unsigned', 'valid_max', 'valid_min', 'valid_range', ): - if attr in attrs: - self.__dict__[attr] = attrs[attr] - - def getncattr(self, name, encoding='utf-8'): - """Retrieve a netCDF4 attribute.""" - return self.attrs[name] + # Note: Can't use None as the default since it is a valid + # `units` or 'calendar' value that indicates that the + # attribute has not been set in the dataset. + units = self._get_component("units", False) + if units is False: + try: + units = var.getncattr("units") + except AttributeError: + units = None - def mask(self, data): - """TODOHDF""" - return self._toma(data) - - def unpack(self, data): - """Unpack non-masked values using scale_factor and add_offset. + self._set_component("units", units, copy=False) + + calendar = self._get_component("calendar", False) + if calendar is False: + try: + calendar = var.getncattr("calendar") + except AttributeError: + calendar = None + + self._set_component("calendar", calendar, copy=False) + + return units, calendar + + def get_groups(self, address): + """The netCDF4 group structure of a netCDF variable. + + .. versionadded:: (cfdm) 1.8.6.0 + + :Parameters: + + address: `str` or `int` + The netCDF variable name, or integer varid, from which + to get the groups. + + .. versionadded:: (cfdm) 1.10.1.0 + + :Returns: + + (`list`, `str`) or (`list`, `int`) + The group structure and the name within the group. If + *address* is a varid then an empty list and the varid + are returned. + + **Examples** + + >>> n.get_groups('tas') + ([], 'tas') + + >>> n.get_groups('/tas') + ([], 'tas') + + >>> n.get_groups('/data/model/tas') + (['data', 'model'], 'tas') + + >>> n.get_groups(9) + ([], 9) """ - if not scale: - return data - - try: # check to see if scale_factor and add_offset is valid (issue 176). - if hasattr(self,'scale_factor'): float(self.scale_factor) - if hasattr(self,'add_offset'): float(self.add_offset) - valid_scaleoffset = True - except: - valid_scaleoffset = False - if self.scale: - msg = 'invalid scale_factor or add_offset attribute, no unpacking done...' - # warnings.warn(msg) # h5netcdf - - if self.scale and\ - (self._isprimitive or (self._isvlen and self.dtype != str)) and\ - valid_scaleoffset: - # if variable has scale_factor and add_offset attributes, apply - # them. - if hasattr(self, 'scale_factor') and hasattr(self, 'add_offset'): - if self.add_offset != 0.0 or self.scale_factor != 1.0: - data = data*self.scale_factor + self.add_offset - else: - data = data.astype(self.scale_factor.dtype) # issue 913 - # else if variable has only scale_factor attribute, rescale. - elif hasattr(self, 'scale_factor') and self.scale_factor != 1.0: - data = data*self.scale_factor - # else if variable has only add_offset attribute, add offset. 
- elif hasattr(self, 'add_offset') and self.add_offset != 0.0: - data = data + self.add_offset + try: + if "/" not in address: + return [], address + except TypeError: + return [], address + + out = address.split("/")[1:] + return out[:-1], out[-1] + + def open(self, **kwargs): + """Return a file object for the dataset and the variable address. + + When multiple files have been provided an attempt is made to + open each one, in the order stored, and a file object is + returned from the first file that exists. + + :Returns: + + (file object, `str`) + The file object for the dataset, and the address of + the data within the file. + + """ + return super().open(h5netcdf.File, mode="r", **kwargs) diff --git a/cfdm/data/mixin/__init__.py b/cfdm/data/mixin/__init__.py index e5dba5957..3a5d3cb47 100644 --- a/cfdm/data/mixin/__init__.py +++ b/cfdm/data/mixin/__init__.py @@ -1,2 +1,3 @@ from .arraymixin import ArrayMixin from .filearraymixin import FileArrayMixin +from .xxxmixin import XXXMixin diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index 165516165..b7f38e7e9 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -177,10 +177,10 @@ def get_formats(self): return (self.get_format(),) * len(self.get_filenames()) def open(self, func, *args, **kwargs): - """Return an open file object containing the data array. + """rn a file object for the dataset and the variable address. When multiple files have been provided an attempt is made to - open each one, in the order stored, and an open file object is + open each one, in the order stored, and a file object is returned from the first file that exists. .. versionadded:: (cfdm) 1.10.1.0 @@ -196,8 +196,8 @@ def open(self, func, *args, **kwargs): :Returns: `tuple` - The open file object, and the address of the data - within the file. + The file object for the dataset, and the address of + the data within the file. """ # Loop round the files, returning as soon as we find one that @@ -216,9 +216,10 @@ def open(self, func, *args, **kwargs): except RuntimeError as error: raise RuntimeError(f"{error}: {filename}") + # Successfully opend a dataset, so return. return nc, address if len(filenames) == 1: - raise FileNotFoundError(f"No such netCDF file: {filenames[0]}") + raise FileNotFoundError(f"No such file: {filenames.pop()}") - raise FileNotFoundError(f"No such netCDF files: {filenames}") + raise FileNotFoundError(f"No such files: {filenames}") diff --git a/cfdm/data/mixin/xxxmixin.py b/cfdm/data/mixin/xxxmixin.py new file mode 100644 index 000000000..a532684b9 --- /dev/null +++ b/cfdm/data/mixin/xxxmixin.py @@ -0,0 +1,141 @@ +import netCDF4 +import numpy as np + +class XXXMixin: + """Mixin class TODOHDF + + .. versionadded:: (cfdm) HDFVER + + """ + + def _process_string_and_char(self, array): + """TODOHDF""" + string_type = isinstance(array, str) + kind = array.dtype.kind + if not string_type and kind in "SU": + # Collapse by concatenation the outermost (fastest + # varying) dimension of char array into + # memory. E.g. [['a','b','c']] becomes ['abc'] + if kind == "U": + array = array.astype("S", copy=False) + + array = netCDF4.chartostring(array) + shape = array.shape + array = np.array([x.rstrip() for x in array.flat], dtype="U") + array = np.reshape(array, shape) + array = np.ma.masked_where(array == "", array) + elif not string_type and kind == "O": + # An N-d (N>=1) string variable comes out as a numpy + # object array, so convert it to numpy string array. 
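A minimal illustration of the two string representations being normalised here (fixed-width character arrays versus variable-length strings, which arrive as object arrays):

    import netCDF4
    import numpy as np

    # Char array: collapse the trailing character dimension
    chars = np.array([[b"a", b"b", b"c"]], dtype="S1")
    strings = netCDF4.chartostring(chars)  # array(['abc'])

    # VLEN string variable: arrives as an object array
    obj = np.array(["abc", ""], dtype=object)
    out = np.ma.where(obj == "", np.ma.masked, obj.astype("U"))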
+ array = array.astype("U", copy=False) + + # Mask the VLEN variable + array = np.ma.where(array == "", np.ma.masked, array) + + return array + + + @property + def array(self): + """Return an independent numpy array containing the data. + + .. versionadded:: (cfdm) 1.7.0 + + :Returns: + + `numpy.ndarray` + An independent numpy array of the data. + + **Examples** + + >>> n = numpy.asanyarray(a) + >>> isinstance(n, numpy.ndarray) + True + + """ + return self[...] + + def close(self, dataset): + """Close the dataset containing the data. + + .. versionadded:: (cfdm) 1.7.0 + + :Parameters: + + dataset: + The dataset to be be closed. + + :Returns: + + `None` + + """ + if self._get_component("close"): + dataset.close() + + def get_format(self): + """The format of the files. + + .. versionadded:: (cfdm) 1.10.1.0 + + .. seealso:: `get_address`, `get_filename`, `get_formats` + + :Returns: + + `str` + The file format. Always ``'nc'``, signifying netCDF. + + **Examples** + + >>> a.get_format() + 'nc' + + """ + return "nc" + + def get_mask(self): + """Whether or not to automatically mask the data. + + .. versionadded:: (cfdm) 1.8.2 + + **Examples** + + >>> b = a.get_mask() + + """ + return self._get_component("mask") + + def get_missing_values(self): + """The missing value indicators from the netCDF variable. + + .. versionadded:: (cfdm) 1.10.0.3 + + :Returns: + + `dict` or `None` + The missing value indicators from the netCDF variable, + keyed by their netCDF attribute names. An empty + dictionary signifies that no missing values are given + in the file. `None` signifies that the missing values + have not been set. + + **Examples** + + >>> a.get_missing_values() + None + + >>> b.get_missing_values() + {} + + >>> c.get_missing_values() + {'missing_value': 1e20, 'valid_range': (-10, 20)} + + >>> d.get_missing_values() + {'valid_min': -999} + + """ + out = self._get_component("missing_values", None) + if out is None: + return + + return out.copy() diff --git a/cfdm/data/netcdfarray.py b/cfdm/data/netcdfarray.py index 247c2747c..65ad2f8e3 100644 --- a/cfdm/data/netcdfarray.py +++ b/cfdm/data/netcdfarray.py @@ -2,11 +2,11 @@ import numpy as np from . import abstract -from .mixin import FileArrayMixin +from .mixin import FileArrayMixin, XXXMixin from .numpyarray import NumpyArray -class NetCDFArray(FileArrayMixin, abstract.Array): +class NetCDFArray(XXXMixin, FileArrayMixin, abstract.Array): """An underlying array stored in a netCDF file. .. versionadded:: (cfdm) 1.7.0 @@ -245,35 +245,37 @@ def __getitem__(self, indices): # Hmm netCDF4 has a thing for making scalar size 1, 1d array = array.squeeze() - kind = array.dtype.kind - if not string_type and kind in "SU": - # == 'S' and array.ndim > (self.ndim - - # getattr(self, 'gathered', 0) - - # getattr(self, 'ragged', 0)): - # -------------------------------------------------------- - # Collapse (by concatenation) the outermost (fastest - # varying) dimension of char array into - # memory. E.g. 
[['a','b','c']] becomes ['abc'] - # -------------------------------------------------------- - if kind == "U": - array = array.astype("S", copy=False) - - array = netCDF4.chartostring(array) - shape = array.shape - array = np.array([x.rstrip() for x in array.flat], dtype="U") - array = np.reshape(array, shape) - array = np.ma.masked_where(array == "", array) - elif not string_type and kind == "O": - # -------------------------------------------------------- - # A netCDF string type N-d (N>=1) variable comes out as a - # numpy object array, so convert it to numpy string array. - # -------------------------------------------------------- - array = array.astype("U", copy=False) - - # -------------------------------------------------------- - # netCDF4 does not auto-mask VLEN variable, so do it here. - # -------------------------------------------------------- - array = np.ma.where(array == "", np.ma.masked, array) + array = self._process_string_and_char(array) +# +# kind = array.dtype.kind +# if not string_type and kind in "SU": +# # == 'S' and array.ndim > (self.ndim - +# # getattr(self, 'gathered', 0) - +# # getattr(self, 'ragged', 0)): +# # -------------------------------------------------------- +# # Collapse (by concatenation) the outermost (fastest +# # varying) dimension of char array into +# # memory. E.g. [['a','b','c']] becomes ['abc'] +# # -------------------------------------------------------- +# if kind == "U": +# array = array.astype("S", copy=False) +# +# array = netCDF4.chartostring(array) +# shape = array.shape +# array = np.array([x.rstrip() for x in array.flat], dtype="U") +# array = np.reshape(array, shape) +# array = np.ma.masked_where(array == "", array) +# elif not string_type and kind == "O": +# # -------------------------------------------------------- +# # A netCDF string type N-d (N>=1) variable comes out as a +# # numpy object array, so convert it to numpy string array. +# # -------------------------------------------------------- +# array = array.astype("U", copy=False) +# +# # -------------------------------------------------------- +# # netCDF4 does not auto-mask VLEN variable, so do it here. +# # -------------------------------------------------------- +# array = np.ma.where(array == "", np.ma.masked, array) return array diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index b4743d2b0..24a19b8f6 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -494,12 +494,12 @@ def file_open(self, filename, flatten=True, verbose=None): netCDF = False HDF = False try: - nc = h5netcdf.File(filename, "r") + nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) HDF = True except OSError: - # File is not HDF, so it's probably netCDF3. + # File is not HDF. Assume instead that it's netCDF3 and + # open it with netCDF4. 
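The open-with-fallback logic used here reduces to the following sketch (illustrative only; `filename` is a placeholder, and the boolean flags the backend that succeeded):

    import h5netcdf
    import netCDF4

    def open_dataset(filename):
        # Try the HDF5 backend first; non-HDF (e.g. netCDF3 classic)
        # files raise OSError, so fall back to the netCDF4 library
        try:
            return h5netcdf.File(filename, "r", decode_vlen_strings=True), True
        except OSError:
            return netCDF4.Dataset(filename, "r"), False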
try: - print (1/0) nc = netCDF4.Dataset(filename, "r") netCDF = True except RuntimeError as error: @@ -559,11 +559,11 @@ def file_open(self, filename, flatten=True, verbose=None): g["flat_files"].append(flat_file) -# if HDF: -# print ("Opened with h5netcdf") -# else: -# print ("Opened with netCDF4") - + if HDF: + print ("Opened with h5netcdf") + else: + print ("Opened with netCDF4") + g["netCDF"] = netCDF g["HDF"] = HDF g["nc"] = nc diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index eb6196582..f6eb7024b 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -4459,7 +4459,7 @@ def file_open(self, filename, mode, fmt, fields): os.remove(filename) try: - nc.set_chunk_cache(16*1024*1024) # 16MiB chunkcache +# nc.set_chunk_cache(16*1024*1024) # 16MiB chunkcache nc = netCDF4.Dataset(filename, mode, format=fmt) except RuntimeError as error: raise RuntimeError(f"{error}: {filename}") diff --git a/requirements.txt b/requirements.txt index c5f4955b2..90e0ee8ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ netCDF4>=1.5.4 +h5py>=3.0.0 +h5netcdf>=1.3.0 cftime>=1.6.0 numpy>=1.15 netcdf-flattener>=1.2.0 From b54d37e6a813ddb9101cab91ec90bd97e6f871fd Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jan 2024 23:32:16 +0000 Subject: [PATCH 08/88] dev --- cfdm/data/hdfarray.py | 131 +-- cfdm/data/mixin/__init__.py | 2 +- cfdm/data/mixin/filearraymixin.py | 2 +- .../mixin/{xxxmixin.py => netcdffilemixin.py} | 93 +- cfdm/data/netcdfarray.py | 191 +--- cfdm/flatten.py | 828 ++++++++++++++++++ cfdm/read_write/netcdf/netcdfread.py | 31 +- cfdm/test/test_groups.py | 5 +- 8 files changed, 999 insertions(+), 284 deletions(-) rename cfdm/data/mixin/{xxxmixin.py => netcdffilemixin.py} (57%) create mode 100644 cfdm/flatten.py diff --git a/cfdm/data/hdfarray.py b/cfdm/data/hdfarray.py index fbc151ead..21a5da654 100644 --- a/cfdm/data/hdfarray.py +++ b/cfdm/data/hdfarray.py @@ -4,13 +4,13 @@ import numpy as np from . import abstract -from .mixin import FileArrayMixin, XXXMixin +from .mixin import FileArrayMixin, NetCDFFileMixin from .numpyarray import NumpyArray _safecast = netCDF4.utils._safecast default_fillvals = netCDF4.default_fillvals -class HDFArray(XXXMixin, FileArrayMixin, abstract.Array): +class HDFArray(NetCDFFileMixin, FileArrayMixin, abstract.Array): """An underlying array stored in an HDF file. .. versionadded:: (cfdm) TODOHDF @@ -37,10 +37,9 @@ def __init__( filename: (sequence of) `str`, optional The name of the file(s) containing the array. - address: (sequence of) `str` or `int`, optional + address: (sequence of) `str`, optional The identity of the variable in each file defined by - *filename*. Either a netCDF variable name or an - integer HDF variable ID. + *filename*. Must be a netCDF variable name. dtype: `numpy.dtype` The data type of the array in the file. May be `None` @@ -188,36 +187,24 @@ def __getitem__(self, indices): groups, address = self.get_groups(address) if groups: - # Traverse the group structure, if there is one (CF>=1.8). 
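Both backends expose a `groups` mapping, so the traversal being factored out here can be sketched independently of the library (the address '/forecast/model/tas' is an invented example):

    def traverse_groups(dataset, groups):
        # Walk down the group hierarchy of a netCDF4 or h5netcdf
        # dataset; each level exposes a .groups mapping
        for g in groups:
            dataset = dataset.groups[g]
        return dataset

    # e.g. for the address '/forecast/model/tas':
    # var = traverse_groups(nc, ["forecast", "model"]).variables["tas"]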
- for g in groups[:-1]: - dataset = dataset.groups[g] # h5netcdf - - dataset = dataset.groups[groups[-1]]# h5netcdf - - if isinstance(address, str): - # Get the variable by netCDF name - variable = dataset.variables[address] - self.variable = variable - array = variable[indices] - else: - # Get the variable by netCDF integer ID - for variable in dataset.variables.values(): - if variable._varid == address: - array = variable[indices] - break - - # Set the units, if they haven't been set already. - self._set_units(variable) + dataset = self._uuu(dataset, groups) + + # Get the variable by netCDF name + variable = dataset.variables[address] + self.variable = variable + array = variable[indices] if mask: self.scale = True self.always_mask = False self._isvlen = variable.dtype == np.dtype('O') - print('V', self._isvlen) if not self._isvlen: array = self._mask(array) array = self._scale(array) + # Set the units, if they haven't been set already. + self._set_units(variable) + self.close(dataset0) del dataset, dataset0 del self.variable @@ -236,23 +223,7 @@ def __getitem__(self, indices): array = self._process_string_and_char(array) return array - - def __repr__(self): - """Called by the `repr` built-in function. - - x.__repr__() <==> repr(x) - - """ - return f"<{self.__class__.__name__}{self.shape}: {self}>" - - def __str__(self): - """Called by the `str` built-in function. - - x.__str__() <==> str(x) - - """ - return f"{self.get_filename(None)}, {self.get_address()}" - + def _check_safecast(self, attname): """Check to see that variable attribute exists can can be safely cast to variable data type.""" attrs = self.variable.attrs @@ -281,18 +252,19 @@ def _check_safecast(self, attname): def _mask(self, data): """TODOHDF""" + # Private function for creating a masked array, masking + # missing_values and/or _FillValues. + attrs = self.variable.attrs is_unsigned = attrs.get('_Unsigned', False) in ("true", "True") is_unsigned_int = is_unsigned and data.dtype.kind == 'i' dtype = data.dtype if self.scale and is_unsigned_int: - # only do this if autoscale option is on. + # Only do this if autoscale option is on. dtype_unsigned_int = f"{dtype.byteorder}u{dtype.itemsize}" data = data.view(dtype_unsigned_int) - # private function for creating a masked array, masking missing_values - # and/or _FillValues. totalmask = np.zeros(data.shape, np.bool_) fill_value = None safe_missval = self._check_safecast('missing_value') @@ -320,7 +292,7 @@ def _mask(self, data): mvalmask += data == m if mvalmask.any(): - # set fill_value for masked array to missing_value (or + # Set fill_value for masked array to missing_value (or # 1st element if missing_value is a vector). fill_value = mval[0] totalmask += mvalmask @@ -351,13 +323,13 @@ def _mask(self, data): fill_value = fval totalmask += mask - # issue 209: don't return masked array if variable filling - # is disabled. else: + # Don't return masked array if variable filling is disabled. no_fill = 0 # with nogil: # ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) # _ensure_nc_success(ierr) + # if no_fill is not 1, and not a byte variable, then use # default fill value. from # http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c/Fill-Values.html#Fill-Values @@ -376,8 +348,8 @@ def _mask(self, data): if not self._isvlen and (no_fill != 1 or dtype.str[1:] not in ('u1','i1')): fillval = np.array(default_fillvals[dtype.str[1:]], dtype) has_fillval = data == fillval - # if data is an array scalar, has_fillval will be a boolean. 
- # in that case convert to an array. + # if data is an array scalar, has_fillval will be a + # boolean. in that case convert to an array. if type(has_fillval) == bool: has_fillval = np.asarray(has_fillval) @@ -425,7 +397,6 @@ def _mask(self, data): fval = np.array(self._FillValue, dtype) else: k = dtype.str[1:] - print (k, default_fillvals, self._isvlen) if k in ('u1','i1'): fval = None else: @@ -509,50 +480,34 @@ def _scale(self, data): return data - def _set_units(self, var): - """The units and calendar properties. + def _get_attr(self, var, attr): + """TODOHDF - These are set from the netCDF variable attributes, but only if - they have already not been defined, either during {{class}} - instantiation or by a previous call to `_set_units`. - - .. versionadded:: (cfdm) 1.10.0.1 + .. versionadded:: (cfdm) HDFVER :Parameters: - var: `netCDF4.Variable` - The variable containing the units and calendar - definitions. + """ - :Returns: + return var.attrs[attr] - `tuple` - The units and calendar values, either of which may be - `None`. + def close(self, dataset): + """Close the dataset containing the data. - """ - # Note: Can't use None as the default since it is a valid - # `units` or 'calendar' value that indicates that the - # attribute has not been set in the dataset. - units = self._get_component("units", False) - if units is False: - try: - units = var.getncattr("units") - except AttributeError: - units = None + .. versionadded:: (cfdm) HDFVER - self._set_component("units", units, copy=False) + :Parameters: - calendar = self._get_component("calendar", False) - if calendar is False: - try: - calendar = var.getncattr("calendar") - except AttributeError: - calendar = None + dataset: `h5netcdf.File` + The netCDF dataset to be be closed. + + :Returns: - self._set_component("calendar", calendar, copy=False) + `None` - return units, calendar + """ + if self._get_component("close"): + dataset.close() def get_groups(self, address): """The netCDF4 group structure of a netCDF variable. @@ -607,9 +562,9 @@ def open(self, **kwargs): :Returns: - (file object, `str`) - The file object for the dataset, and the address of - the data within the file. + (`h5netcdf.File`, `str`) + The open file object, and the address of the data + within the file. """ return super().open(h5netcdf.File, mode="r", **kwargs) diff --git a/cfdm/data/mixin/__init__.py b/cfdm/data/mixin/__init__.py index 3a5d3cb47..fffb784f0 100644 --- a/cfdm/data/mixin/__init__.py +++ b/cfdm/data/mixin/__init__.py @@ -1,3 +1,3 @@ from .arraymixin import ArrayMixin from .filearraymixin import FileArrayMixin -from .xxxmixin import XXXMixin +from .netcdffilemixin import NetCDFFileMixin diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index b7f38e7e9..c4156ad94 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -177,7 +177,7 @@ def get_formats(self): return (self.get_format(),) * len(self.get_filenames()) def open(self, func, *args, **kwargs): - """rn a file object for the dataset and the variable address. + """Return a file object for the dataset and the variable address. 
When multiple files have been provided an attempt is made to open each one, in the order stored, and a file object is diff --git a/cfdm/data/mixin/xxxmixin.py b/cfdm/data/mixin/netcdffilemixin.py similarity index 57% rename from cfdm/data/mixin/xxxmixin.py rename to cfdm/data/mixin/netcdffilemixin.py index a532684b9..6f6e74238 100644 --- a/cfdm/data/mixin/xxxmixin.py +++ b/cfdm/data/mixin/netcdffilemixin.py @@ -1,13 +1,41 @@ import netCDF4 import numpy as np -class XXXMixin: +class NetCDFFileMixin: """Mixin class TODOHDF .. versionadded:: (cfdm) HDFVER """ + def __repr__(self): + """Called by the `repr` built-in function. + + x.__repr__() <==> repr(x) + + """ + return f"<{self.__class__.__name__}{self.shape}: {self}>" + + def __str__(self): + """Called by the `str` built-in function. + + x.__str__() <==> str(x) + + """ + return f"{self.get_filename(None)}, {self.get_address()}" + + def _get_attr(self, var, attr): + """TODOHDF + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + """ + raise NotImplementedError( + "Must implement {self.__class__.__name__}._get_attr" + ) # pragma: no cover + def _process_string_and_char(self, array): """TODOHDF""" string_type = isinstance(array, str) @@ -34,6 +62,56 @@ def _process_string_and_char(self, array): return array + def _set_units(self, var): + """The units and calendar properties. + + These are set from the netCDF variable attributes, but only if + they have already not been defined, either during {{class}} + instantiation or by a previous call to `_set_units`. + + .. versionadded:: (cfdm) 1.10.0.1 + + :Parameters: + + var: `netCDF4.Variable` + The variable containing the units and calendar + definitions. + + :Returns: + + `tuple` + The units and calendar values, either of which may be + `None`. + + """ + # Note: Can't use None as the default since it is a valid + # `units` or 'calendar' value that indicates that the + # attribute has not been set in the dataset. + units = self._get_component("units", False) + if units is False: + try: + units = self._get_attr(var, "units") + except AttributeError: + units = None + + self._set_component("units", units, copy=False) + + calendar = self._get_component("calendar", False) + if calendar is False: + try: + calendar = self._get_attr(var, "calendar") + except AttributeError: + calendar = None + + self._set_component("calendar", calendar, copy=False) + + return units, calendar + + def _uuu(self, dataset, groups): + for g in groups: #[:-1]: + dataset = dataset.groups[g] + + return dataset #dataset = dataset.groups[groups[-1]] @property def array(self): @@ -139,3 +217,16 @@ def get_missing_values(self): return return out.copy() + + def to_memory(self): + """Bring data on disk into memory. + + .. versionadded:: (cfdm) 1.7.0 + + :Returns: + + `NumpyArray` + The new with all of its data in memory. + + """ + return NumpyArray(self[...]) diff --git a/cfdm/data/netcdfarray.py b/cfdm/data/netcdfarray.py index 65ad2f8e3..062c03d0d 100644 --- a/cfdm/data/netcdfarray.py +++ b/cfdm/data/netcdfarray.py @@ -2,11 +2,11 @@ import numpy as np from . import abstract -from .mixin import FileArrayMixin, XXXMixin +from .mixin import FileArrayMixin, NetCDFFileMixin from .numpyarray import NumpyArray -class NetCDFArray(XXXMixin, FileArrayMixin, abstract.Array): +class NetCDFArray(NetCDFFileMixin, FileArrayMixin, abstract.Array): """An underlying array stored in a netCDF file. .. versionadded:: (cfdm) 1.7.0 @@ -209,10 +209,11 @@ def __getitem__(self, indices): if groups: # Traverse the group structure, if there is one (CF>=1.8). 
- for g in groups[:-1]: - netcdf = netcdf.groups[g] - - netcdf = netcdf.groups[groups[-1]] + netcdf = self._uuu(netcdf, groups) +# for g in groups[:-1]: +# netcdf = netcdf.groups[g] +# +# netcdf = netcdf.groups[groups[-1]] if isinstance(address, str): # Get the variable by netCDF name @@ -246,37 +247,6 @@ def __getitem__(self, indices): array = array.squeeze() array = self._process_string_and_char(array) -# -# kind = array.dtype.kind -# if not string_type and kind in "SU": -# # == 'S' and array.ndim > (self.ndim - -# # getattr(self, 'gathered', 0) - -# # getattr(self, 'ragged', 0)): -# # -------------------------------------------------------- -# # Collapse (by concatenation) the outermost (fastest -# # varying) dimension of char array into -# # memory. E.g. [['a','b','c']] becomes ['abc'] -# # -------------------------------------------------------- -# if kind == "U": -# array = array.astype("S", copy=False) -# -# array = netCDF4.chartostring(array) -# shape = array.shape -# array = np.array([x.rstrip() for x in array.flat], dtype="U") -# array = np.reshape(array, shape) -# array = np.ma.masked_where(array == "", array) -# elif not string_type and kind == "O": -# # -------------------------------------------------------- -# # A netCDF string type N-d (N>=1) variable comes out as a -# # numpy object array, so convert it to numpy string array. -# # -------------------------------------------------------- -# array = array.astype("U", copy=False) -# -# # -------------------------------------------------------- -# # netCDF4 does not auto-mask VLEN variable, so do it here. -# # -------------------------------------------------------- -# array = np.ma.where(array == "", np.ma.masked, array) - return array def __repr__(self): @@ -295,90 +265,15 @@ def __str__(self): """ return f"{self.get_filename(None)}, {self.get_address()}" - def _set_units(self, var): - """The units and calendar properties. + def _get_attr(self, var, attr): + """TODOHDF - These are set from the netCDF variable attributes, but only if - they have already not been defined, either during {{class}} - instantiation or by a previous call to `_set_units`. - - .. versionadded:: (cfdm) 1.10.0.1 + .. versionadded:: (cfdm) HDFVER :Parameters: - var: `netCDF4.Variable` - The variable containing the units and calendar - definitions. - - :Returns: - - `tuple` - The units and calendar values, either of which may be - `None`. - """ - # Note: Can't use None as the default since it is a valid - # `units` or 'calendar' value that indicates that the - # attribute has not been set in the dataset. - units = self._get_component("units", False) - if units is False: - try: - units = var.getncattr("units") - except AttributeError: - units = None - - self._set_component("units", units, copy=False) - - calendar = self._get_component("calendar", False) - if calendar is False: - try: - calendar = var.getncattr("calendar") - except AttributeError: - calendar = None - - self._set_component("calendar", calendar, copy=False) - - return units, calendar - - @property - def array(self): - """Return an independent numpy array containing the data. - - .. versionadded:: (cfdm) 1.7.0 - - :Returns: - - `numpy.ndarray` - An independent numpy array of the data. - - **Examples** - - >>> n = numpy.asanyarray(a) - >>> isinstance(n, numpy.ndarray) - True - - """ - return self[...] - - def get_format(self): - """The format of the files. - - .. versionadded:: (cfdm) 1.10.1.0 - - .. 
seealso:: `get_address`, `get_filename`, `get_formats` - - :Returns: - - `str` - The file format. Always ``'nc'``, signifying netCDF. - - **Examples** - - >>> a.get_format() - 'nc' - - """ - return "nc" + return var.getncattr(attr) def get_groups(self, address): """The netCDF4 group structure of a netCDF variable. @@ -424,53 +319,6 @@ def get_groups(self, address): out = address.split("/")[1:] return out[:-1], out[-1] - def get_mask(self): - """Whether or not to automatically mask the data. - - .. versionadded:: (cfdm) 1.8.2 - - **Examples** - - >>> b = a.get_mask() - - """ - return self._get_component("mask") - - def get_missing_values(self): - """The missing value indicators from the netCDF variable. - - .. versionadded:: (cfdm) 1.10.0.3 - - :Returns: - - `dict` or `None` - The missing value indicators from the netCDF variable, - keyed by their netCDF attribute names. An empty - dictionary signifies that no missing values are given - in the file. `None` signifies that the missing values - have not been set. - - **Examples** - - >>> a.get_missing_values() - None - - >>> b.get_missing_values() - {} - - >>> c.get_missing_values() - {'missing_value': 1e20, 'valid_range': (-10, 20)} - - >>> d.get_missing_values() - {'valid_min': -999} - - """ - out = self._get_component("missing_values", None) - if out is None: - return - - return out.copy() - def close(self, dataset): """Close the dataset containing the data. @@ -490,10 +338,10 @@ def close(self, dataset): dataset.close() def open(self): - """Return an open file object containing the data array. + """Return a file object for the dataset and the variable address. When multiple files have been provided an attempt is made to - open each one, in the order stored, and an open file object is + open each one, in the order stored, and a file object is returned from the first file that exists. :Returns: @@ -504,16 +352,3 @@ def open(self): """ return super().open(netCDF4.Dataset, mode="r") - - def to_memory(self): - """Bring data on disk into memory. - - .. versionadded:: (cfdm) 1.7.0 - - :Returns: - - `NumpyArray` - The new with all of its data in memory. - - """ - return NumpyArray(self[...]) diff --git a/cfdm/flatten.py b/cfdm/flatten.py new file mode 100644 index 000000000..2246d793d --- /dev/null +++ b/cfdm/flatten.py @@ -0,0 +1,828 @@ +"""Project: NetCDF Flattener +Copyright (c) 2020 EUMETSAT +License: Apache License 2.0 + +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +""" + +import collections +import hashlib +import logging +import os +import re +import warnings +from enum import Enum + +#from netCDF4 import Dataset +from h5netcdf import File as Dataset + + + +def flatten(input_ds, output_ds, lax_mode=False, _copy_data=True, copy_slices=None): + """Flatten an input NetCDF dataset and write the result in an output NetCDF dataset. 
+
+    For variables that are too big to fit in memory, the optional "copy_slices" input allows some or all of the
+    variables to be copied in slices.
+
+    :param input_ds: input netcdf4 dataset
+    :param output_ds: output netcdf4 dataset
+    :param lax_mode: if false (default), not resolving a reference halts the execution. If true, continue with warning.
+    :param _copy_data: if true (default), then all data arrays are copied from the input to the output dataset.
+        If false, then this does not happen.
+        Use this option *only* if the data arrays of the flattened dataset are never to be accessed.
+        If false then consider setting the fill mode for the output netcdf4 dataset to "off" for improved performance.
+    :param copy_slices: dictionary containing variable_name: shape pairs, where variable_name is the path to the
+        variable name in the original Dataset (for instance /group1/group2/my_variable), and shape is either None for
+        using the default slice value, or a custom slicing shape in the form of a tuple of the same dimension as the variable
+        (for instance (1000,2000,1500,) for a 3-dimensional variable). If a variable from the Dataset is not contained
+        in the dict, it will not be sliced and copied normally.
+    """
+    _Flattener(input_ds, lax_mode, _copy_data=_copy_data, copy_slices=copy_slices).flatten(output_ds)
+
+
+def parse_var_attr(input_str):
+    """Parse variable attribute of any form into a dict:
+
+    * 'time' -> OrderedDict([('time', None)])
+    * 'lat lon' -> OrderedDict([('lat', None), ('lon', None)])
+    * 'area: time volume: lat lon' -> OrderedDict([('area', ['time']), ('volume', ['lat', 'lon'])])
+
+    :param input_str: string to parse
+    :return: parsed string in an OrderedDict
+    """
+
+    def subst(s):
+        """Substitute tokens for WORD and SEP (space or end of string)."""
+        return s.replace('WORD', r'[A-Za-z0-9_#/.\(\)]+').replace(
+            'SEP', r'(\s+|$)')
+
+    # Regex for 'dict form': "k1: v1 v2 k2: v3"
+    pat_value = subst('(?P<value>WORD)SEP')
+    pat_values = '({})*'.format(pat_value)
+    pat_mapping = (subst('(?P<mapping_name>WORD):SEP(?P<values>{})'.format(pat_values)))
+    pat_mapping_list = '({})+'.format(pat_mapping)
+
+    # Regex for 'list form': "v1 v2 v3" (including single-item form)
+    pat_list_item = (subst('(?P<list_item>WORD)SEP'))
+    pat_list = '({})+'.format(pat_list_item)
+
+    # Regex for any form:
+    pat_all = (subst('((?P<list>{})|(?P<mapping_list>{}))$'.format(pat_list, pat_mapping_list)))
+
+    m = re.match(pat_all, input_str)
+
+    # Output is always a dict. If the input is in list form, the dict values are set to None
+    out = collections.OrderedDict()
+
+    if m is not None:
+        list_match = m.group('list')
+        # Parse as a list
+        if list_match:
+            for mapping in re.finditer(pat_list_item, list_match):
+                item = mapping.group('list_item')
+                out[item] = None
+        # Parse as a dict:
+        else:
+            mapping_list = m.group('mapping_list')
+            for mapping in re.finditer(pat_mapping, mapping_list):
+                term = mapping.group('mapping_name')
+                values = [value.group('value') for value in re.finditer(pat_value, mapping.group('values'))]
+                out[term] = values
+    else:
+        raise ReferenceException("Error while parsing attribute value: '{}'".format(input_str))
+
+    return out
+
+
+def generate_var_attr_str(d):
+    """Re-generate the attribute string from a dictionary.
+
+    :param d: dictionary
+    :return: valid attribute string
+    """
+    parsed_list = []
+    for k, v in d.items():
+        if v is None:
+            parsed_list.append(k)
+        elif not v:
+            parsed_list.append("{}:".format(k))
+        else:
+            parsed_list.append(k + ': ' + (' '.join(v)))
+    return ' '.join(parsed_list)
+
+
+class _AttributeProperties(Enum):
+    """Utility class containing the properties for each type of variable attribute, defining how contained references
+    to dimensions and variables should be parsed and processed."""
+    ancillary_variables = (0, (False, True, True, False, False, False, False))
+    bounds = (1, (False, True, True, False, False, False, False))
+    cell_measures = (2, (False, True, False, True, False, False, False))
+    climatology = (3, (False, True, True, False, False, False, False))
+    coordinates = (4, (False, True, True, False, True, False, False))
+    formula_terms = (5, (False, True, False, True, False, False, False))
+    geometry = (6, (False, True, True, False, False, False, False))
+    grid_mapping = (7, (False, True, True, True, False, False, False))
+    interior_ring = (8, (False, True, True, False, False, False, False))
+    node_coordinates = (9, (False, True, True, False, False, False, False))
+    node_count = (10, (False, True, True, False, False, False, False))
+    nodes = (11, (False, True, True, False, False, False, False))
+    part_node_count = (12, (False, True, True, False, False, False, False))
+    compress = (13, (True, False, True, False, False, False, False))
+    instance_dimension = (14, (True, False, True, False, False, False, False))
+    sample_dimension = (15, (True, False, True, False, False, False, False))
+    cell_methods = (16, (2, 1, True, False, False, True, True))
+
+    def __init__(self, n, props):
+        """_AttributeProperties enum constructor.
+
+        :param n: enum id
+        :param props: a tuple containing the attribute's properties (ref_to_dim, ref_to_var, resolve_key, resolve_value,
+        stop_at_local_apex, accept_standard_names, limit_to_scalar_coordinates):
+        * ref_to_dim: True or integer if the attribute contains references to dimensions (highest int has priority)
+        * ref_to_var: True or integer if the attribute contains references to variables (highest int has priority)
+        * resolve_key: True if 'keys' have to be resolved in 'key1: value1 key2: value2 value3' or 'key1 key2'
+        * resolve_value: True if 'values' have to be resolved in 'key1: value1 key2: value2 value3'
+        * stop_at_local_apex: True if the upward search in the hierarchy has to stop at the local apex
+        * accept_standard_names: True if any standard name is valid in place of references (in which case no
+        exception is raised if a reference cannot be resolved, and the standard name is used in place)
+        * limit_to_scalar_coordinates: True if references to variables are only resolved if present as well in
+        the 'coordinates' attribute of the variable, and they are scalar.
+        """
+        self.id = n
+        self.ref_to_dim = props[0]
+        self.ref_to_var = props[1]
+        self.resolve_key = props[2]
+        self.resolve_value = props[3]
+        self.stop_at_local_apex = props[4]
+        self.accept_standard_names = props[5]
+        self.limit_to_scalar_coordinates = props[6]
+
+
+class _Flattener:
+    """Utility class containing the input file, the output file being flattened, and all the logic of the flattening
+    process.
+ """ + __max_name_len = 256 + __default_separator = '/' + __new_separator = '__' + __pathname_format = "{}/{}" + __mapping_str_format = "{}: {}" + __ref_not_found_error = "REF_NOT_FOUND" + __default_copy_slice_size = 200000000 + + # name of the attributes used to store the mapping between original and flattened names + __attr_map_name = "__flattener_name_mapping_attributes" + __dim_map_name = "__flattener_name_mapping_dimensions" + __var_map_name = "__flattener_name_mapping_variables" + + def __init__(self, input_ds, lax_mode, _copy_data=True, copy_slices=None): + """Constructor. Initializes the Flattener class given the input file. + + :param input_ds: input netcdf dataset + :param lax_mode: if false (default), not resolving a reference halts the execution. If true, continue with warning. + :param _copy_data: if true (default), then all data arrays are copied from the input to the output dataset + If false, then this does not happen. + Use this option *only* if the data arrays of the flattened dataset are never to be accessed. + :param copy_slices: dictionary containing variable_name: shape pairs, where variable_name is the path to the + variable name in the original Dataset (for instance /group1/group2/my_variable), and shape is either None + for using default slice value, or a custom slicing shape in the form of a tuple of the same dimension as the + variable (for instance (1000,2000,1500,) for a 3-dimensional variable). If a variable from the Dataset is + not contained in the dict, it will not be sliced and copied normally. + """ + + self.__attr_map_value = [] + self.__dim_map_value = [] + self.__var_map_value = [] + + self.__dim_map = dict() + self.__var_map = dict() + + self.__lax_mode = lax_mode + + self.__copy_data = _copy_data + self.__copy_slices = copy_slices + + self.__input_file = input_ds + self.__output_file = None + + def filepath(self, ds): + try: + # netCDF4 + return ds.filepath() + except AttributeError: + # h5netcdf + return ds.filename + + def data_model(self, ds): + try: + # netCDF4 + return ds.data_model + except AttributeError: + # h5netcdf + return 'NETCDF4' + + def path(self, group): + try: + # netCDF4 + return group.path + except AttributeError: + # h5netcdf + return group.name + + def ncattrs(self, yyy): + try: + # netCDF4 + return yyy.ncattrs() + except AttributeError: + # h5netcdf + return tuple(yyy.attrs) + + def getncattr(self, yyy, attr): + try: + # netCDF4 + return getattr(yyy, attr) + except AttributeError: + # h5netcdf + return yyy.attrs[attr] + + def flatten(self, output_ds): + """Flattens and write to output file + + :param output_ds: The dataset in which to store the flattened result. + """ +# or output_ds.filepath() == self.__input_file.filepath() \ +# or output_ds.data_model != 'NETCDF4': + if output_ds == self.__input_file \ + or self.filepath(output_ds) == self.filepath(self.__input_file) \ + or self.data_model(output_ds) != 'NETCDF4': + raise ValueError("Invalid inputs. 
Input and output datasets should be different, and output should be of " + "the 'NETCDF4' format.") + + self.__output_file = output_ds + + # Flatten product + self.process_group(self.__input_file) + + # Add name mapping attributes + self.__output_file.setncattr(self.__attr_map_name, self.__attr_map_value) + self.__output_file.setncattr(self.__dim_map_name, self.__dim_map_value) + self.__output_file.setncattr(self.__var_map_name, self.__var_map_value) + + # Browse flattened variables to rename references: + logging.info("Browsing flattened variables to rename references in attributes:") + for var in self.__output_file.variables.values(): + self.adapt_references(var) + + def process_group(self, input_group): + """Flattens a given group to the output file. + + :param input_group: group to flatten + """ +# logging.info("Browsing group " + input_group.path) + logging.info("Browsing group " + self.path(input_group)) +# for attr_name in input_group.ncattrs(): + for attr_name in self.ncattrs(input_group): + self.flatten_attribute(input_group, attr_name) + + for dim in input_group.dimensions.values(): + self.flatten_dimension(dim) + + for var in input_group.variables.values(): + self.flatten_variable(var) + + for child_group in input_group.groups.values(): + self.process_group(child_group) + + def flatten_attribute(self, input_group, attr_name): + """Flattens a given attribute from a group to the output file. + + :param input_group: group containing the attribute to flatten + :param attr_name: name of the attribute to flatten + """ +# logging.info(" Copying attribute {} from group {} to root".format(attr_name, input_group.path)) + logging.info(" Copying attribute {} from group {} to root".format(attr_name, self.path(input_group))) + + # Create new name + new_attr_name = self.generate_flattened_name(input_group, attr_name) + + # Write attribute +# self.__output_file.setncattr(new_attr_name, input_group.getncattr(attr_name)) + self.__output_file.setncattr(new_attr_name, self.getncattr(input_group, attr_name)) + + # Store new naming for later and in mapping attribute + self.__attr_map_value.append(self.generate_mapping_str(input_group, attr_name, new_attr_name)) + + def flatten_dimension(self, dim): + """Flattens a given dimension to the output file. + + :param dim: dimension to flatten + """ +# logging.info(" Copying dimension {} from group {} to root".format(dim.name, dim.group().path)) + logging.info(" Copying dimension {} from group {} to root".format(dim.name, self.path(dim.group()))) + + # Create new name + new_name = self.generate_flattened_name(dim.group(), dim.name) + + # Write dimension + self.__output_file.createDimension(new_name, (len(dim), None)[dim.isunlimited()]) + + # Store new name in dict for resolving references later + self.__dim_map[self.pathname(dim.group(), dim.name)] = new_name + + # Add to name mapping attribute + self.__dim_map_value.append(self.generate_mapping_str(dim.group(), dim.name, new_name)) + + def flatten_variable(self, var): + """Flattens a given variable to the output file. 
+ + :param var: variable to flatten + """ +# logging.info(" Copying variable {} from group {} to root".format(var.name, var.group().path)) + logging.info(" Copying variable {} from group {} to root".format(var.name, self.path(var.group()))) + + # Create new name + new_name = self.generate_flattened_name(var.group(), var.name) + + # Replace old by new dimension names + new_dims = list(map(lambda x: self.__dim_map[self.pathname(x.group(), x.name)], var.get_dims())) + + # Write variable + fullname = self.pathname(var.group(), var.name) + logging.info("create variable {} from {}".format(new_name, fullname)) + + new_var = self.__output_file.createVariable( + new_name, + var.dtype, + new_dims, + zlib=False, + complevel=4, + shuffle=True, + fletcher32=False, + contiguous=var.chunking() == "contiguous", + chunksizes=var.chunking() if var.chunking() != "contiguous" else None, + endian=var.endian(), + least_significant_digit=None, + fill_value=None) + + if self.__copy_data: + # Find out slice method for variable and copy data + if self.__copy_slices is None or fullname not in self.__copy_slices: + # Copy data as a whole + new_var[:] = var[:] + elif self.__copy_slices[fullname] is None: + # Copy with default slice size + copy_slice = tuple(self.__default_copy_slice_size // len(var.shape) for _ in range(len(var.shape))) + self.copy_var_by_slices(new_var, var, copy_slice) + else: + # Copy in slices + copy_slice = self.__copy_slices[fullname] + self.copy_var_by_slices(new_var, var, copy_slice) + + # Copy attributes + new_var.setncatts(var.__dict__) + + # Store new name in dict for resolving references later + self.__var_map[self.pathname(var.group(), var.name)] = new_name + + # Add to name mapping attribute + self.__var_map_value.append(self.generate_mapping_str(var.group(), var.name, new_name)) + + # Resolve references in variable attributes and replace by absolute path: + self.resolve_references(new_var, var) + + def increment_pos(self, pos, dim, copy_slice_shape, var_shape): + """Increment position vector in a variable along a dimension by the matching slice length along than dimension. + If end of the dimension is reached, recursively increment the next dimensions until a valid position is found. + + :param pos: current position + :param dim: dimension to be incremented + :param copy_slice_shape: shape of the slice + :param var_shape: shape of the variable + :return True if a valid position is found within the variable, False otherwise + """ + # Try to increment dimension + pos[dim] += copy_slice_shape[dim] + + # Test new position + dim_end_reached = pos[dim] > var_shape[dim] + var_end_reached = (dim + 1) >= len(copy_slice_shape) + + # End of this dimension not reached yet + if not dim_end_reached: + return True + # End of this dimension reached. Reset to 0 and try increment next one recursively + elif dim_end_reached and not var_end_reached: + pos[:dim + 1] = [0 for j in range(dim + 1)] + return self.increment_pos(pos, dim + 1, copy_slice_shape, var_shape) + # End of this dimension reached, and no dimension to increment. Finish. + else: + return False + + def copy_var_by_slices(self, new_var, old_var, copy_slice_shape): + """Copy the data of a variable to a new one by slice. 
+ + :param new_var: new variable where to copy data + :param old_var: variable where data should be copied from + :param copy_slice_shape: shape of the slice + """ + logging.info(" copying data of {} in {} slices".format(old_var.name, copy_slice_shape)) + + # Initial position vector + pos = [0 for _ in range(len(copy_slice_shape))] + + # Copy in slices until end reached + var_end_reached = False + while not var_end_reached: + # Create current slice + current_slice = tuple(slice(pos[dim_i], min(old_var.shape[dim_i], pos[dim_i] + dim_l)) for dim_i, dim_l in + enumerate(copy_slice_shape)) + + # Copy data in slice + new_var[current_slice] = old_var[current_slice] + + # Get next position + var_end_reached = not self.increment_pos(pos, 0, copy_slice_shape, old_var.shape) + + def resolve_reference(self, orig_ref, orig_var, attr): + """Resolve the absolute path to a coordinate variable within the group structure. + + :param orig_ref: reference to resolve + :param orig_var: variable originally containing the reference + :param attr: _AttributeProperties object enum item to know if ref to dim or var + :return: absolute path to the reference + """ + ref = orig_ref + absolute_ref = None + ref_type = "" + + # Resolve first as dim (True), or var (False) + resolve_dim_or_var = attr.ref_to_dim > attr.ref_to_var + + # Resolve var (resp. dim) if resolving as dim (resp. var) failed + resolve_alt = (attr.ref_to_dim and attr.ref_to_var) + + # Reference is already given by absolute path + if ref.startswith(self.__default_separator): + method = "absolute" + absolute_ref = ref + + # Reference is given by relative path + elif self.__default_separator in ref: + method = " relative" + + # First tentative as dim OR var + ref_type = "dimension" if resolve_dim_or_var else "variable" + absolute_ref = self.search_by_relative_path(orig_ref, orig_var.group(), resolve_dim_or_var) + + # If failed and alternative possible, second tentative + if absolute_ref is None and resolve_alt: + ref_type = "dimension" if not resolve_dim_or_var else "variable" + absolute_ref = self.search_by_relative_path(orig_ref, orig_var.group(), not resolve_dim_or_var) + + # Reference is to be searched by proximity + else: + method = " proximity" + absolute_ref, ref_type = self.resolve_reference_proximity(ref, resolve_dim_or_var, resolve_alt, orig_var, + attr) + + # Post-search checks and return result + return self.resolve_reference_post_processing(absolute_ref, orig_ref, orig_var, attr, ref_type, method) + + def resolve_reference_proximity(self, ref, resolve_dim_or_var, resolve_alt, orig_var, attr): + """Resolve reference: search by proximity + """ + # First tentative as dim OR var + ref_type = "dimension" if resolve_dim_or_var else "variable" + resolved_var = self.search_by_proximity(ref, orig_var.group(), resolve_dim_or_var, False, + attr.stop_at_local_apex) + + # If failed and alternative possible, second tentative + if resolved_var is None and resolve_alt: + ref_type = "dimension" if not resolve_dim_or_var else "variable" + resolved_var = self.search_by_proximity(ref, orig_var.group(), not resolve_dim_or_var, False, + attr.stop_at_local_apex) + + # If found, create ref string + if resolved_var is not None: + return self.pathname(resolved_var.group(), resolved_var.name), ref_type + else: + return None, "" + + def resolve_reference_post_processing(self, absolute_ref, orig_ref, orig_var, attr, ref_type, method): + """Post-processing operations after resolving reference + """ + # If not found and accept standard name, assume standard name + if 
absolute_ref is None and attr.accept_standard_names: + logging.info(" coordinate reference to '{}' not resolved. Assumed to be a standard name.".format(orig_ref)) + ref_type = "standard_name" + absolute_ref = orig_ref + # Else if not found, raise exception + elif absolute_ref is None: +# absolute_ref = self.handle_reference_error(orig_ref, orig_var.group().path) + absolute_ref = self.handle_reference_error(orig_ref, self.path(orig_var.group())) + # If found: + else: + logging.info(" {} coordinate reference to {} '{}' resolved as '{}'" + .format(method, ref_type, orig_ref, absolute_ref)) + + # If variables refs are limited to coordinate variable, additional check +# and (("coordinates" not in orig_var.ncattrs() or orig_ref not in orig_var.coordinates) + if ref_type == "variable" and attr.limit_to_scalar_coordinates \ + and (("coordinates" not in self.ncattrs(orig_var) or orig_ref not in self.attr(orig_var, coordinates)) + or self._Flattener__input_file[absolute_ref].ndim > 0): + logging.info(" coordinate reference to '{}' is not a SCALAR COORDINATE variable. " + "Assumed to be a standard name.".format(orig_ref)) + absolute_ref = orig_ref + + # Return result + return absolute_ref + + def search_by_relative_path(self, ref, current_group, search_dim): + """Resolve the absolute path to a reference within the group structure, using search by relative path. + + :param ref: reference to resolve + :param current_group: current group where searching + :param search_dim: if true, search references to dimensions, if false, search references to variables + :return: absolute path to the coordinate + """ + # Go up parent groups + while ref.startswith("../"): + if current_group.parent is None: + return None + ref = ref[3:] + current_group = current_group.parent + + # Go down child groups + ref_split = ref.split(self.__default_separator) + for g in ref_split[:-1]: + try: + current_group = current_group.groups[g] + except KeyError: + return None + + # Get variable or dimension + elt = current_group.dimensions[ref_split[-1]] if search_dim else current_group.variables[ref_split[-1]] + + # Get absolute reference + return self.pathname(elt.group(), elt.name) + + def search_by_proximity(self, ref, current_group, search_dim, local_apex_reached, is_coordinate_variable): + """Resolve the absolute path to a reference within the group structure, using search by proximity. + + First search up in the hierarchy for the reference, until root group is reached. If coordinate variable, search + until local apex is reached, Then search down in siblings. + + :param ref: reference to resolve + :param current_group: current group where searching + :param search_dim: if true, search references to dimensions, if false, search references to variables + :param local_apex_reached: False initially, until apex is reached. 
+ :param is_coordinate_variable: true, if looking for a coordinate variable + :return: absolute path to the coordinate + """ + dims_or_vars = current_group.dimensions if search_dim else current_group.variables + + # Found in current group + if ref in dims_or_vars.keys(): + return dims_or_vars[ref] + + local_apex_reached = local_apex_reached or ref in current_group.dimensions.keys() + + # Check if has to continue looking in parent group + # - normal search: continue until root is reached + # - coordinate variable: continue until local apex is reached + if is_coordinate_variable: + top_reached = local_apex_reached or current_group.parent is None + else: + top_reached = current_group.parent is None + + # Search up + if not top_reached: + return self.search_by_proximity(ref, current_group.parent, search_dim, local_apex_reached, + is_coordinate_variable) + + # If coordinate variable and local apex reached, search down in siblings + elif is_coordinate_variable and local_apex_reached: + found_elt = None + for child_group in current_group.groups.values(): + found_elt = self.search_by_proximity(ref, child_group, search_dim, local_apex_reached, + is_coordinate_variable) + if found_elt is not None: + break + return found_elt + + # If here, did not find + else: + return None + + def __escape_index_error(self, match, group_name): + """Return the group in a match if it exists, an empty string otherwise. + + :param match: regex match + :param group_name: group name + :return: match group + """ + try: + return match.group(group_name) + except IndexError: + return "" + + def resolve_references(self, var, old_var): + """In a given variable, replace all references to other variables in its attributes by absolute references. + + :param var: flattened variable in which references should be renamed with absolute references + :param old_var: original variable (in group structure) + """ + print(_AttributeProperties) + for attr in _AttributeProperties: + if attr.name in var.__dict__: +# attr_value = var.getncattr(attr.name) + attr_value = self.getncattr(var, attr.name) + # Parse attribute value + parsed_attr = parse_var_attr(attr_value) + + # Resolved references in parsed as required by attribute properties + resolved_parsed_attr = collections.OrderedDict() + + for k, v in parsed_attr.items(): + new_k = self.resolve_reference(k, old_var, attr) if attr.resolve_key else k + + new_v = ([self.resolve_reference(x, old_var, attr) for x in parsed_attr[k]] + if attr.resolve_value and parsed_attr[k] is not None else parsed_attr[k]) + + resolved_parsed_attr[new_k] = new_v + + # Re-generate attribute value string with resolved references + var.setncattr(attr.name, generate_var_attr_str(resolved_parsed_attr)) + + def adapt_references(self, var): + """In a given variable, replace all references to variables in attributes by references to the new names in the + flattened NetCDF. All references have to be already resolved as absolute references. 
+ + :param var: flattened variable in which references should be renamed with new names + """ + for attr in _AttributeProperties: + if attr.name in var.__dict__: + #attr_value = var.getncattr(attr.name) + attr_value = self.getncattr(var, attr.name) + # Parse attribute value + parsed_attr = parse_var_attr(attr_value) + + adapted_parsed_attr = collections.OrderedDict() + + for k, v in parsed_attr.items(): + new_k = self.adapt_name(k, attr) if attr.resolve_key else k + + new_v = ([self.adapt_name(x, attr) for x in parsed_attr[k]] + if attr.resolve_value and parsed_attr[k] is not None else parsed_attr[k]) + + adapted_parsed_attr[new_k] = new_v + + new_attr_value = generate_var_attr_str(adapted_parsed_attr) + var.setncattr(attr.name, new_attr_value) + + logging.info(" attribute '{}' in '{}': references '{}' renamed as '{}'" + .format(attr.name, var.name, attr_value, new_attr_value)) + + def adapt_name(self, resolved_ref, attr): + """Return name of flattened reference. If not found, raise exception or continue warning. + + :param resolved_ref: resolved reference to adapt + :param attr: _AttributeProperties object enum item to know in which dict to look for name mapping + :return: adapted reference + """ + # If ref contains Error message, leave as such + if self.__ref_not_found_error in resolved_ref: + return resolved_ref + + # Select highest priority map + if attr.ref_to_dim > attr.ref_to_var: + name_mapping = self.__dim_map + if attr.ref_to_dim < attr.ref_to_var: + name_mapping = self.__var_map + + # Try to find mapping + try: + return name_mapping[resolved_ref] + + # If not found, look in other map if allowed + except KeyError: + + if attr.ref_to_dim and attr.ref_to_var: + name_mapping = self.__dim_map if attr.ref_to_dim < attr.ref_to_var else self.__var_map + try: + return name_mapping[resolved_ref] + except KeyError: + pass + + # If still not found, check if any standard name is allowed + if attr.accept_standard_names: + return resolved_ref + # If not, raise exception + else: + return self.handle_reference_error(resolved_ref) + + def pathname(self, group, name): + """Compose full path name to an element in a group structure: /path/to/group/elt + + :param group: group containing element + :param name: name of the element + :return: pathname + """ + if group.parent is None: + return self.__default_separator + name + else: +# return self.__pathname_format.format(group.path, name) + return self.__pathname_format.format(self.path(group), name) + + def generate_mapping_str(self, input_group, name, new_name): + """Generate a string representing the name mapping of an element before and after flattening. + + :param input_group: group containing the non-flattened element + :param name: name of the non-flattened element + :param new_name: name of the flattened element + :return: string representing the name mapping for the element + """ + original_pathname = self.pathname(input_group, name) + mapping_str = self.__mapping_str_format.format(new_name, original_pathname) + return mapping_str + + def convert_path_to_valid_name(self, pathname): + """Generate valid name from path. 
+ + :param pathname: pathname + :return: valid NetCDF name + """ + return pathname.replace(self.__default_separator, '', 1).replace(self.__default_separator, self.__new_separator) + + def generate_flattened_name(self, input_group, orig_name): + """Convert full path of an element to a valid NetCDF name: + - the name of an element is the concatenation of its containing group and its name, + - replaces / from paths (forbidden as NetCDF name), + - if name is longer than 255 characters, replace path to group by hash, + - if name is still too long, replace complete name by hash. + + :param input_group: group containing element + :param orig_name: original name of the element + :return: new valid name of the element + """ + # If element is at root: no change + if input_group.parent is None: + new_name = orig_name + + # If element in child group, concatenate group path and element name + else: + full_name = self.convert_path_to_valid_name(input_group.path) + self.__new_separator + orig_name + new_name = full_name + + # If resulting name is too long, hash group path + if len(new_name) >= self.__max_name_len: +# group_hash = hashlib.sha1(input_group.path.encode("UTF-8")).hexdigest() + group_hash = hashlib.sha1(self.path(input_group).encode("UTF-8")).hexdigest() + new_name = group_hash + self.__new_separator + orig_name + + # If resulting name still too long, hash everything + if len(new_name) >= self.__max_name_len: + new_name = hashlib.sha1(full_name.encode("UTF-8")).hexdigest() + return new_name + + def handle_reference_error(self, ref, context=None): + """Depending on lax/strict mode, either raise exception or log warning. If lax, return reference placeholder. + + :param ref: reference + :param context: additional context info to add to message + :return: if continue with warning, error replacement name for reference + """ + message = "Reference '{}' could not be resolved".format(ref) + if context is not None: + message = message + " from {}".format(context) + if self.__lax_mode: + warnings.warn(message) + return self.__ref_not_found_error + "_" + ref + else: + raise ReferenceException(message) + + +class ReferenceException(Exception): + """Exception raised when references in attributes cannot be resolved. + + Attributes: + message -- explanation of the error + """ + + def __init__(self, message): + super().__init__(message) diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 24a19b8f6..b638829ed 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -23,6 +23,8 @@ from ...decorators import _manage_log_level_via_verbosity from ...functions import is_log_level_debug from .. 
import IORead +from ...flatten import flatten as flatten2 + logger = logging.getLogger(__name__) @@ -513,19 +515,19 @@ def file_open(self, filename, flatten=True, verbose=None): g = self.read_vars if flatten and nc.groups: - if HDF: - # TODOHDF: Can't yet use HDF access to process groups - logger.warning( - "WARNING: Using netCDF4 (rather than h5netcdf) " - f"to access file {filename} containing groups" - ) # pragma: no cover - nc.close() - HDF = False - try: - nc = netCDF4.Dataset(filename, "r") - netCDF = True - except RuntimeError as error: - raise RuntimeError(f"{error}: {filename}") + #if HDF: + # # TODOHDF: Can't yet use HDF access to process groups + # logger.warning( + # "WARNING: Using netCDF4 (rather than h5netcdf) " + # f"to access file {filename} containing groups" + # ) # pragma: no cover + # nc.close() + # HDF = False + # try: + # nc = netCDF4.Dataset(filename, "r") + # netCDF = True + # except RuntimeError as error: + # raise RuntimeError(f"{error}: {filename}") # Create a diskless, non-persistent container for the # flattened file @@ -543,7 +545,8 @@ def file_open(self, filename, flatten=True, verbose=None): flat_nc.set_fill_off() # Flatten the file - netcdf_flattener.flatten( +# netcdf_flattener.flatten( + flatten2( nc, flat_nc, lax_mode=True, _copy_data=False ) diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py index c8323bf96..a011f3108 100644 --- a/cfdm/test/test_groups.py +++ b/cfdm/test/test_groups.py @@ -65,7 +65,8 @@ def test_groups(self): ungrouped_file = ungrouped_file1 grouped_file = grouped_file1 - + grouped_file = 'delme_grouped.nc' + # Add a second grid mapping datum = cfdm.Datum(parameters={"earth_radius": 7000000}) conversion = cfdm.CoordinateConversion( @@ -103,6 +104,8 @@ def test_groups(self): ) nc.close() + grouped_file = grouped_file1 + h = cfdm.read(grouped_file, verbose=1) self.assertEqual(len(h), 1, repr(h)) self.assertTrue(f.equals(h[0], verbose=2)) From 21942d5651a374f423837f3ac937017cb39700e9 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 24 Jan 2024 08:26:45 +0000 Subject: [PATCH 09/88] dev --- cfdm/flatten.py | 118 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 93 insertions(+), 25 deletions(-) diff --git a/cfdm/flatten.py b/cfdm/flatten.py index 2246d793d..c33da588f 100644 --- a/cfdm/flatten.py +++ b/cfdm/flatten.py @@ -28,8 +28,8 @@ import warnings from enum import Enum -#from netCDF4 import Dataset -from h5netcdf import File as Dataset +from netCDF4 import Dataset +#from h5netcdf import File as Dataset @@ -220,15 +220,29 @@ def __init__(self, input_ds, lax_mode, _copy_data=True, copy_slices=None): self.__input_file = input_ds self.__output_file = None - def filepath(self, ds): + def filepath(self, dataset): + """Return the file system path (or the opendap URL) for the Dataset. + + :Returns: + + `str` + + """ try: # netCDF4 - return ds.filepath() + return dataset.filepath() except AttributeError: # h5netcdf - return ds.filename + return dataset.filename def data_model(self, ds): + """Return the netCDF data model version. + + :Returns: + + `str` + + """ try: # netCDF4 return ds.data_model @@ -236,7 +250,29 @@ def data_model(self, ds): # h5netcdf return 'NETCDF4' + def name(self, x): + """Return the netCDF name, without its groups. + + :Returns: + + `str` + + """ + try: + # netCDF4 + return x.name + except AttributeError: + # h5netcdf + name = x.name.split('/')[-1] + def path(self, group): + """Return a simulated unix directory path to a group. 
+ + :Returns: + + `str` + + """ try: # netCDF4 return group.path @@ -244,21 +280,43 @@ def path(self, group): # h5netcdf return group.name - def ncattrs(self, yyy): + def ncattrs(self, x): + """Return netCDF attribute names. + + :Parameters: + + x: variable, group, or dataset + + :Returns: + + `list` + + """ try: # netCDF4 - return yyy.ncattrs() + return x.ncattrs() except AttributeError: # h5netcdf - return tuple(yyy.attrs) + return list(x.attrs) - def getncattr(self, yyy, attr): + def getncattr(self, x, attr): + """Retrieve a netCDF attribute. + + :Parameters: + + x: variable, group, or dataset + + attr: `str` + + :Returns: + + """ try: # netCDF4 - return getattr(yyy, attr) + return getattr(x, attr) except AttributeError: # h5netcdf - return yyy.attrs[attr] + return x.attrs[attr] def flatten(self, output_ds): """Flattens and write to output file @@ -333,19 +391,22 @@ def flatten_dimension(self, dim): :param dim: dimension to flatten """ # logging.info(" Copying dimension {} from group {} to root".format(dim.name, dim.group().path)) - logging.info(" Copying dimension {} from group {} to root".format(dim.name, self.path(dim.group()))) + logging.info(" Copying dimension {} from group {} to root".format(self.name(dim), self.path(dim.group()))) # Create new name - new_name = self.generate_flattened_name(dim.group(), dim.name) +# new_name = self.generate_flattened_name(dim.group(), dim.name) + new_name = self.generate_flattened_name(dim.group(), self.name(dim)) # Write dimension self.__output_file.createDimension(new_name, (len(dim), None)[dim.isunlimited()]) # Store new name in dict for resolving references later - self.__dim_map[self.pathname(dim.group(), dim.name)] = new_name +# self.__dim_map[self.pathname(dim.group(), dim.name)] = new_name + self.__dim_map[self.pathname(dim.group(), self.name(dim))] = new_name # Add to name mapping attribute - self.__dim_map_value.append(self.generate_mapping_str(dim.group(), dim.name, new_name)) +# self.__dim_map_value.append(self.generate_mapping_str(dim.group(), dim.name, new_name)) + self.__dim_map_value.append(self.generate_mapping_str(dim.group(), self.name(dim), new_name)) def flatten_variable(self, var): """Flattens a given variable to the output file. 
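
The hunks in this commit all apply the same compatibility idiom: try the netCDF4 spelling first and fall back to the h5netcdf one on AttributeError, and strip group paths when h5netcdf reports a full HDF5 path as the name. A minimal standalone sketch of that idiom (the helper names nc_name and nc_attrs are illustrative, not part of the patch):

    def nc_name(x):
        # netCDF4 stores just the basename in `name`; h5netcdf stores the
        # full HDF5 path (e.g. "/group1/lat"), so keep the final component.
        # rpartition() is a no-op for names containing no "/".
        return x.name.rpartition("/")[-1]

    def nc_attrs(x):
        try:
            # netCDF4: attributes are reached via ncattrs()/getncattr()
            return {a: x.getncattr(a) for a in x.ncattrs()}
        except AttributeError:
            # h5netcdf: attributes live in the `attrs` mapping
            return dict(x.attrs)
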
@@ -353,16 +414,17 @@ def flatten_variable(self, var): :param var: variable to flatten """ # logging.info(" Copying variable {} from group {} to root".format(var.name, var.group().path)) - logging.info(" Copying variable {} from group {} to root".format(var.name, self.path(var.group()))) + logging.info(" Copying variable {} from group {} to root".format(self.name(var), self.path(var.group()))) # Create new name - new_name = self.generate_flattened_name(var.group(), var.name) + new_name = self.generate_flattened_name(var.group(), self.name(var)) # Replace old by new dimension names - new_dims = list(map(lambda x: self.__dim_map[self.pathname(x.group(), x.name)], var.get_dims())) +# new_dims = list(map(lambda x: self.__dim_map[self.pathname(x.group(), x.name)], var.get_dims())) + new_dims = list(map(lambda x: self.__dim_map[self.pathname(x.group(), self.name(x))], var.get_dims())) # Write variable - fullname = self.pathname(var.group(), var.name) + fullname = self.pathname(var.group(), self.name(var)) logging.info("create variable {} from {}".format(new_name, fullname)) new_var = self.__output_file.createVariable( @@ -397,10 +459,12 @@ def flatten_variable(self, var): new_var.setncatts(var.__dict__) # Store new name in dict for resolving references later - self.__var_map[self.pathname(var.group(), var.name)] = new_name +# self.__var_map[self.pathname(var.group(), var.name)] = new_name + self.__var_map[self.pathname(var.group(), self.name(var))] = new_name # Add to name mapping attribute - self.__var_map_value.append(self.generate_mapping_str(var.group(), var.name, new_name)) +# self.__var_map_value.append(self.generate_mapping_str(var.group(), var.name, new_name)) + self.__var_map_value.append(self.generate_mapping_str(var.group(), self.name(var), new_name)) # Resolve references in variable attributes and replace by absolute path: self.resolve_references(new_var, var) @@ -440,7 +504,8 @@ def copy_var_by_slices(self, new_var, old_var, copy_slice_shape): :param old_var: variable where data should be copied from :param copy_slice_shape: shape of the slice """ - logging.info(" copying data of {} in {} slices".format(old_var.name, copy_slice_shape)) +# logging.info(" copying data of {} in {} slices".format(old_var.name, copy_slice_shape)) + logging.info(" copying data of {} in {} slices".format(self.name(old_var), copy_slice_shape)) # Initial position vector pos = [0 for _ in range(len(copy_slice_shape))] @@ -519,7 +584,8 @@ def resolve_reference_proximity(self, ref, resolve_dim_or_var, resolve_alt, orig # If found, create ref string if resolved_var is not None: - return self.pathname(resolved_var.group(), resolved_var.name), ref_type +# return self.pathname(resolved_var.group(), resolved_var.name), ref_type + return self.pathname(resolved_var.group(), self.name(resolved_var)), ref_type else: return None, "" @@ -579,7 +645,8 @@ def search_by_relative_path(self, ref, current_group, search_dim): elt = current_group.dimensions[ref_split[-1]] if search_dim else current_group.variables[ref_split[-1]] # Get absolute reference - return self.pathname(elt.group(), elt.name) +# return self.pathname(elt.group(), elt.name) + return self.pathname(elt.group(), self.name(elt)) def search_by_proximity(self, ref, current_group, search_dim, local_apex_reached, is_coordinate_variable): """Resolve the absolute path to a reference within the group structure, using search by proximity. 
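
Both the __dim_map and __var_map lookups above key on the absolute pathname of the original element, and their values come from generate_flattened_name. That renaming is a pure string transform, independent of which backend opened the file; a condensed restatement under this module's conventions ("/" separator, "__" replacement, 256-character limit), with an illustrative function name:

    import hashlib

    def flattened_name(group_path, name, max_len=256, sep="__"):
        # e.g. ("/group1/group2", "lat") -> "group1__group2__lat"
        if group_path in ("", "/"):
            # An element in the root group keeps its name
            return name
        full = group_path[1:].replace("/", sep) + sep + name
        new = full
        if len(new) >= max_len:
            # Too long: replace the group path by its SHA-1 hash
            new = hashlib.sha1(group_path.encode("UTF-8")).hexdigest() + sep + name
        if len(new) >= max_len:
            # Still too long: hash the complete name
            new = hashlib.sha1(full.encode("UTF-8")).hexdigest()
        return new
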
@@ -696,7 +763,8 @@ def adapt_references(self, var): var.setncattr(attr.name, new_attr_value) logging.info(" attribute '{}' in '{}': references '{}' renamed as '{}'" - .format(attr.name, var.name, attr_value, new_attr_value)) + .format(attr.name, self.name(var), attr_value, new_attr_value)) +# .format(attr.name, var.name, attr_value, new_attr_value)) def adapt_name(self, resolved_ref, attr): """Return name of flattened reference. If not found, raise exception or continue warning. From 85d5d88d0a10e3edd8ae7d605ed6729a47149646 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 24 Jan 2024 15:03:30 +0000 Subject: [PATCH 10/88] dev --- cfdm/data/hdfarray.py | 23 ++++- cfdm/data/mixin/filearraymixin.py | 14 ++- cfdm/flatten.py | 122 ++++++++++++++++++++++----- cfdm/read_write/netcdf/netcdfread.py | 39 +++++++-- 4 files changed, 167 insertions(+), 31 deletions(-) diff --git a/cfdm/data/hdfarray.py b/cfdm/data/hdfarray.py index 21a5da654..3a0344a0a 100644 --- a/cfdm/data/hdfarray.py +++ b/cfdm/data/hdfarray.py @@ -1,6 +1,5 @@ import h5netcdf import netCDF4 - import numpy as np from . import abstract @@ -553,6 +552,25 @@ def get_groups(self, address): out = address.split("/")[1:] return out[:-1], out[-1] + def _fff(self, ): + u = urlparse(filename) + if u.scheme == "s3": + # Create an openable s3 file object + endpoint_url = f"https://{u.netloc}" + uri = u.path[1:] + s3 = g['s3'] + if s3 is None: + s3 = {"anon": True, + "client_kwargs": {'endpoint_url': endpoint_url}} + + fs = S3FileSystem(**s3) + filename = fs.open(uri, 'rb') + if is_log_level_detail(logger): + logger.debug( + f" s3: s3fs.S3FileSystem options: {s3}\n" + ) # pragma: no cover + + def open(self, **kwargs): """Return a file object for the dataset and the variable address. @@ -567,4 +585,5 @@ def open(self, **kwargs): within the file. """ - return super().open(h5netcdf.File, mode="r", **kwargs) + return super().open(h5netcdf.File, mode="r", + decode_vlen_strings=True, **kwargs) diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index c4156ad94..bab92c0c4 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -1,5 +1,7 @@ from urllib.parse import urlparse +from s3fs import S3FileSystem + from ...functions import abspath @@ -208,7 +210,15 @@ def open(self, func, *args, **kwargs): if url.scheme == "file": # Convert a file URI into an absolute path filename = url.path - + elif url.scheme == "s3": + # Create an openable s3 file object + endpoint_url = f"https://{url.netloc}" + uri = url.path[1:] + s3 = {"anon": True, + "client_kwargs": {'endpoint_url': endpoint_url}} + fs = S3FileSystem(**s3) + filename = fs.open(uri, 'rb') + try: nc = func(filename, *args, **kwargs) except FileNotFoundError: @@ -220,6 +230,6 @@ def open(self, func, *args, **kwargs): return nc, address if len(filenames) == 1: - raise FileNotFoundError(f"No such file: {filenames.pop()}") + raise FileNotFoundError(f"No such file: {filenames[0]}") raise FileNotFoundError(f"No such files: {filenames}") diff --git a/cfdm/flatten.py b/cfdm/flatten.py index c33da588f..a4f4b9632 100644 --- a/cfdm/flatten.py +++ b/cfdm/flatten.py @@ -234,6 +234,9 @@ def filepath(self, dataset): except AttributeError: # h5netcdf return dataset.filename + + def contiguous(self, variable): + pass def data_model(self, ds): """Return the netCDF data model version. 
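
The s3 handling added to FileArrayMixin.open above boils down to turning an "s3://..." URL into a seekable file-like object that h5netcdf (via h5py) can read. A sketch of just that step, under the same assumptions as the patch (anonymous access, with the URL's netloc used as the endpoint host; open_s3 itself is an illustrative helper, not part of the patch):

    from urllib.parse import urlparse

    import h5netcdf
    from s3fs import S3FileSystem

    def open_s3(url, anon=True):
        u = urlparse(url)  # e.g. "s3://store/data/file.nc"
        fs = S3FileSystem(
            anon=anon, client_kwargs={"endpoint_url": f"https://{u.netloc}"}
        )
        f = fs.open(u.path[1:], "rb")  # seekable file-like object
        return h5netcdf.File(f, "r")   # h5py accepts file-like objects
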
@@ -250,6 +253,52 @@ def data_model(self, ds): # h5netcdf return 'NETCDF4' + def get_dims(self, variable): + """Return + + :Returns: + + `str` + + """ + try: + return variable.get_dims() + except AttributeError: + out = [] + dimension_names = list(variable.dimensions) + group = variable._parent + while dimension_names: + for name, dim in group.dims.items(): + if name in dimension_names[:]: + out.append(dim) + dimension_names.remove(name) + + group = group.parent + if group is None: + break + + return out + + def group(self, x): + """Return a + + :Returns: + + `Group` + + """ + try: + # netCDF4 + return x.group() + except AttributeError: + # h5netcdf + return x._parent +# g = self.__input_file.groups +# for group_name in x.name.split('/')[1:-1]: +# g = g[group_name] +# +# return g + def name(self, x): """Return the netCDF name, without its groups. @@ -265,6 +314,19 @@ def name(self, x): # h5netcdf name = x.name.split('/')[-1] + def parent(self, group): + """Return a simulated unix directory path to a group. + + :Returns: + + `str` + + """ + try: + return group.parent + except AttributeError: + return + def path(self, group): """Return a simulated unix directory path to a group. @@ -278,8 +340,12 @@ def path(self, group): return group.path except AttributeError: # h5netcdf - return group.name - + print(group, dir(group)) + try: + return group.name + except AttributeError: + return "/" + def ncattrs(self, x): """Return netCDF attribute names. @@ -391,22 +457,22 @@ def flatten_dimension(self, dim): :param dim: dimension to flatten """ # logging.info(" Copying dimension {} from group {} to root".format(dim.name, dim.group().path)) - logging.info(" Copying dimension {} from group {} to root".format(self.name(dim), self.path(dim.group()))) + logging.info(" Copying dimension {} from group {} to root".format(self.name(dim), self.path(self.group(dim)))) # Create new name # new_name = self.generate_flattened_name(dim.group(), dim.name) - new_name = self.generate_flattened_name(dim.group(), self.name(dim)) + new_name = self.generate_flattened_name(self.group(dim), self.name(dim)) # Write dimension self.__output_file.createDimension(new_name, (len(dim), None)[dim.isunlimited()]) # Store new name in dict for resolving references later # self.__dim_map[self.pathname(dim.group(), dim.name)] = new_name - self.__dim_map[self.pathname(dim.group(), self.name(dim))] = new_name + self.__dim_map[self.pathname(self.group(dim), self.name(dim))] = new_name # Add to name mapping attribute # self.__dim_map_value.append(self.generate_mapping_str(dim.group(), dim.name, new_name)) - self.__dim_map_value.append(self.generate_mapping_str(dim.group(), self.name(dim), new_name)) + self.__dim_map_value.append(self.generate_mapping_str(self.group(dim), self.name(dim), new_name)) def flatten_variable(self, var): """Flattens a given variable to the output file. 
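
The get_dims fallback added above has to reimplement netCDF4's Variable.get_dims() for h5netcdf, where a variable only stores dimension names and the corresponding Dimension objects are scoped by enclosing group. The essential lookup is the upward walk, sketched here for a single name (assuming h5netcdf-style groups with `dims` and `parent` attributes, as used in the hunk above):

    def find_dimension(group, name):
        # Search this group, then its ancestors, for a dimension called `name`
        while group is not None:
            if name in group.dims:
                return group.dims[name]
            group = group.parent
        raise KeyError(name)
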
@@ -414,17 +480,20 @@ def flatten_variable(self, var):
 
         :param var: variable to flatten
         """
 #        logging.info("   Copying variable {} from group {} to root".format(var.name, var.group().path))
-        logging.info("   Copying variable {} from group {} to root".format(self.name(var), self.path(var.group())))
+        logging.info("   Copying variable {} from group {} to root".format(self.name(var), self.path(self.group(var))))
+        print ("Copying variable {} from group {} to root".format(self.name(var), self.path(self.group(var))))
 
         # Create new name
-        new_name = self.generate_flattened_name(var.group(), self.name(var))
+#        new_name = self.generate_flattened_name(var.group(), self.name(var))
+        new_name = self.generate_flattened_name(self.group(var), self.name(var))
 
         # Replace old by new dimension names
 #        new_dims = list(map(lambda x: self.__dim_map[self.pathname(x.group(), x.name)], var.get_dims()))
-        new_dims = list(map(lambda x: self.__dim_map[self.pathname(x.group(), self.name(x))], var.get_dims()))
+        new_dims = list(map(lambda x: self.__dim_map[self.pathname(self.group(x), self.name(x))], self.get_dims(var)))
 
         # Write variable
-        fullname = self.pathname(var.group(), self.name(var))
+#        fullname = self.pathname(var.group(), self.name(var))
+        fullname = self.pathname(self.group(var), self.name(var))
         logging.info("create variable {} from {}".format(new_name, fullname))
 
         new_var = self.__output_file.createVariable(
@@ -460,11 +529,11 @@
 
         # Store new name in dict for resolving references later
 #        self.__var_map[self.pathname(var.group(), var.name)] = new_name
-        self.__var_map[self.pathname(var.group(), self.name(var))] = new_name
+        self.__var_map[self.pathname(self.group(var), self.name(var))] = new_name
 
         # Add to name mapping attribute
 #        self.__var_map_value.append(self.generate_mapping_str(var.group(), var.name, new_name))
-        self.__var_map_value.append(self.generate_mapping_str(var.group(), self.name(var), new_name))
+        self.__var_map_value.append(self.generate_mapping_str(self.group(var), self.name(var), new_name))
 
         # Resolve references in variable attributes and replace by absolute path:
         self.resolve_references(new_var, var)
@@ -552,12 +621,14 @@ def resolve_reference(self, orig_ref, orig_var, attr):
 
             # First tentative as dim OR var
             ref_type = "dimension" if resolve_dim_or_var else "variable"
-            absolute_ref = self.search_by_relative_path(orig_ref, orig_var.group(), resolve_dim_or_var)
+#            absolute_ref = self.search_by_relative_path(orig_ref, orig_var.group(), resolve_dim_or_var)
+            absolute_ref = self.search_by_relative_path(orig_ref, self.group(orig_var), resolve_dim_or_var)
 
             # If failed and alternative possible, second tentative
             if absolute_ref is None and resolve_alt:
                 ref_type = "dimension" if not resolve_dim_or_var else "variable"
-                absolute_ref = self.search_by_relative_path(orig_ref, orig_var.group(), not resolve_dim_or_var)
+#                absolute_ref = self.search_by_relative_path(orig_ref, orig_var.group(), not resolve_dim_or_var)
+                absolute_ref = self.search_by_relative_path(orig_ref, self.group(orig_var), not resolve_dim_or_var)
 
         # Reference is to be searched by proximity
         else:
@@ -573,19 +644,23 @@ def resolve_reference_proximity(self, ref, resolve_dim_or_var, resolve_alt, orig
         """
         # First tentative as dim OR var
         ref_type = "dimension" if resolve_dim_or_var else "variable"
-        resolved_var = self.search_by_proximity(ref, orig_var.group(), resolve_dim_or_var, False,
+#        resolved_var = self.search_by_proximity(ref, orig_var.group(), resolve_dim_or_var, False,
+#                                                attr.stop_at_local_apex)
+ 
resolved_var = self.search_by_proximity(ref, self.group(orig_var), resolve_dim_or_var, False, attr.stop_at_local_apex) # If failed and alternative possible, second tentative if resolved_var is None and resolve_alt: ref_type = "dimension" if not resolve_dim_or_var else "variable" - resolved_var = self.search_by_proximity(ref, orig_var.group(), not resolve_dim_or_var, False, +# resolved_var = self.search_by_proximity(ref, orig_var.group(), not resolve_dim_or_var, False, +# attr.stop_at_local_apex) + resolved_var = self.search_by_proximity(ref, self.group(orig_var), not resolve_dim_or_var, False, attr.stop_at_local_apex) # If found, create ref string if resolved_var is not None: # return self.pathname(resolved_var.group(), resolved_var.name), ref_type - return self.pathname(resolved_var.group(), self.name(resolved_var)), ref_type + return self.pathname(self.group(resolved_var), self.name(resolved_var)), ref_type else: return None, "" @@ -600,7 +675,7 @@ def resolve_reference_post_processing(self, absolute_ref, orig_ref, orig_var, at # Else if not found, raise exception elif absolute_ref is None: # absolute_ref = self.handle_reference_error(orig_ref, orig_var.group().path) - absolute_ref = self.handle_reference_error(orig_ref, self.path(orig_var.group())) + absolute_ref = self.handle_reference_error(orig_ref, self.path(self.group(orig_var))) # If found: else: logging.info(" {} coordinate reference to {} '{}' resolved as '{}'" @@ -646,7 +721,7 @@ def search_by_relative_path(self, ref, current_group, search_dim): # Get absolute reference # return self.pathname(elt.group(), elt.name) - return self.pathname(elt.group(), self.name(elt)) + return self.pathname(self.group(elt), self.name(elt)) def search_by_proximity(self, ref, current_group, search_dim, local_apex_reached, is_coordinate_variable): """Resolve the absolute path to a reference within the group structure, using search by proximity. @@ -811,7 +886,8 @@ def pathname(self, group, name): :param name: name of the element :return: pathname """ - if group.parent is None: +# if group.parent is None: + if self.parent(group) is None: return self.__default_separator + name else: # return self.__pathname_format.format(group.path, name) @@ -849,7 +925,11 @@ def generate_flattened_name(self, input_group, orig_name): :return: new valid name of the element """ # If element is at root: no change - if input_group.parent is None: + # if input_group.parent is None: + print (999, orig_name) + print (input_group) + print (dir(input_group)) + if self.parent(input_group) is None: new_name = orig_name # If element in child group, concatenate group path and element name diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index b638829ed..4ca03f5f4 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -14,14 +14,15 @@ from urllib.parse import urlparse from uuid import uuid4 -import netCDF4 import h5netcdf +import netCDF4 import netcdf_flattener import numpy as np from packaging.version import Version +from s3fs import S3FileSystem from ...decorators import _manage_log_level_via_verbosity -from ...functions import is_log_level_debug +from ...functions import is_log_level_debug, is_log_level_detail from .. 
import IORead from ...flatten import flatten as flatten2 @@ -493,9 +494,32 @@ def file_open(self, filename, flatten=True, verbose=None): >>> r.file_open('file.nc') """ + g = self.read_vars + netCDF = False HDF = False + + u = urlparse(filename) + if u.scheme == "s3": + # Create an openable s3 file object + endpoint_url = f"https://{u.netloc}" + uri = u.path[1:] + s3 = g['s3'] + if s3 is None: + s3 = {"anon": True, + "client_kwargs": {'endpoint_url': endpoint_url}} + + fs = S3FileSystem(**s3) + filename = fs.open(uri, 'rb') + print (filename, type(filename)) + if is_log_level_detail(logger): + logger.debug( + f" s3: s3fs.S3FileSystem options: {s3}\n" + ) # pragma: no cover + +# nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) try: +# raise OSError() nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) HDF = True except OSError: @@ -512,8 +536,7 @@ def file_open(self, filename, flatten=True, verbose=None): # ------------------------------------------------------------ # If the file has a group structure then flatten it (CF>=1.8) # ------------------------------------------------------------ - g = self.read_vars - + if flatten and nc.groups: #if HDF: # # TODOHDF: Can't yet use HDF access to process groups @@ -642,7 +665,7 @@ def is_netcdf_file(cls, filename): """ # Assume that URLs are in netCDF format - if filename.startswith("https://") or filename.startswith("http://"): + if filename.startswith("https://") or filename.startswith("http://") or filename.startswith("s3://"): return True # Read the magic number @@ -757,7 +780,7 @@ def is_file(cls, filename): """ # Assume that URLs are files u = urlparse(filename) - if u.scheme in ("http", "https"): + if u.scheme in ("http", "https", "s3"): return True return os.path.isfile(filename) @@ -821,6 +844,7 @@ def read( warnings=True, warn_valid=False, domain=False, + s3=None ): """Reads a netCDF dataset from file or OPenDAP URL. @@ -964,6 +988,9 @@ def read( # CFA # -------------------------------------------------------- "cfa": False, + # S3 + # + "s3": s3, } g = self.read_vars From 4427c83b9a033ac8b08cfb90378a8c9195ae5d64 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 24 Jan 2024 18:26:21 +0000 Subject: [PATCH 11/88] dev --- cfdm/flatten.py | 181 ++++++++++++++++----------- cfdm/read_write/netcdf/netcdfread.py | 49 ++++---- 2 files changed, 134 insertions(+), 96 deletions(-) diff --git a/cfdm/flatten.py b/cfdm/flatten.py index a4f4b9632..983038138 100644 --- a/cfdm/flatten.py +++ b/cfdm/flatten.py @@ -220,8 +220,36 @@ def __init__(self, input_ds, lax_mode, _copy_data=True, copy_slices=None): self.__input_file = input_ds self.__output_file = None - def filepath(self, dataset): - """Return the file system path (or the opendap URL) for the Dataset. + def attrs(self, variable): + try: + # h5netcdf + return variable.attrs + except: + # netCDF4 + return { + attr: variable.getncattr(attr) for attr in variable.ncattrs() + } + + def chunksizes(self, variable): + try: + # netCDF4 + chunking = variable.chunking() + if chunking == "contiguous": + return None + except AttributeError: + # h5netcdf + return variable.chunks + + def contiguous(self, variable): + try: + # netCDF4 + return variable.chunking() == "contiguous", + except AttributeError: + # h5netcdf + return variable.chunks is None + + def data_model(self, ds): + """Return the netCDF data model version. 
:Returns: @@ -230,16 +258,28 @@ def filepath(self, dataset): """ try: # netCDF4 - return dataset.filepath() + return ds.data_model except AttributeError: # h5netcdf - return dataset.filename + return 'NETCDF4' - def contiguous(self, variable): - pass + def dtype(self, variable): + out = variable.dtype + if out == "O": + out = str + + return out - def data_model(self, ds): - """Return the netCDF data model version. + def endian(self, variable): + try: + # netCDF4 + return variable.endian() + except AttributeError: + # h5netcdf + return "native" + + def filepath(self, dataset): + """Return the file system path (or the opendap URL) for the Dataset. :Returns: @@ -248,11 +288,11 @@ def data_model(self, ds): """ try: # netCDF4 - return ds.data_model + return dataset.filepath() except AttributeError: # h5netcdf - return 'NETCDF4' - + return dataset.filename + def get_dims(self, variable): """Return @@ -268,17 +308,36 @@ def get_dims(self, variable): dimension_names = list(variable.dimensions) group = variable._parent while dimension_names: - for name, dim in group.dims.items(): - if name in dimension_names[:]: - out.append(dim) + for name in dimension_names[:]: + if name in group.dims: + out.append(group.dims[name]) dimension_names.remove(name) group = group.parent if group is None: break - + return out - + + def getncattr(self, x, attr): + """Retrieve a netCDF attribute. + + :Parameters: + + x: variable, group, or dataset + + attr: `str` + + :Returns: + + """ + try: + # netCDF4 + return getattr(x, attr) + except AttributeError: + # h5netcdf + return x.attrs[attr] + def group(self, x): """Return a @@ -293,11 +352,6 @@ def group(self, x): except AttributeError: # h5netcdf return x._parent -# g = self.__input_file.groups -# for group_name in x.name.split('/')[1:-1]: -# g = g[group_name] -# -# return g def name(self, x): """Return the netCDF name, without its groups. @@ -306,14 +360,34 @@ def name(self, x): `str` + """ + out = x.name + if "/" in out: + # h5netcdf + out = x.name.split('/')[-1] + + return out + + def ncattrs(self, x): + """Return netCDF attribute names. + + :Parameters: + + x: variable, group, or dataset + + :Returns: + + `list` + """ try: # netCDF4 - return x.name + return x.ncattrs() except AttributeError: # h5netcdf - name = x.name.split('/')[-1] + return list(x.attrs) + def parent(self, group): """Return a simulated unix directory path to a group. @@ -340,50 +414,11 @@ def path(self, group): return group.path except AttributeError: # h5netcdf - print(group, dir(group)) try: return group.name except AttributeError: return "/" - - def ncattrs(self, x): - """Return netCDF attribute names. - - :Parameters: - - x: variable, group, or dataset - - :Returns: - - `list` - """ - try: - # netCDF4 - return x.ncattrs() - except AttributeError: - # h5netcdf - return list(x.attrs) - - def getncattr(self, x, attr): - """Retrieve a netCDF attribute. 
- - :Parameters: - - x: variable, group, or dataset - - attr: `str` - - :Returns: - - """ - try: - # netCDF4 - return getattr(x, attr) - except AttributeError: - # h5netcdf - return x.attrs[attr] - def flatten(self, output_ds): """Flattens and write to output file @@ -481,7 +516,6 @@ def flatten_variable(self, var): """ # logging.info(" Copying variable {} from group {} to root".format(var.name, var.group().path)) logging.info(" Copying variable {} from group {} to root".format(self.name(var), self.path(self.group(var)))) - print ("Copying variable {} from group {} to root".format(self.name(var), self.path(self.group(var)))) # Create new name # new_name = self.generate_flattened_name(var.group(), self.name(var)) @@ -489,6 +523,7 @@ def flatten_variable(self, var): # Replace old by new dimension names # new_dims = list(map(lambda x: self.__dim_map[self.pathname(x.group(), x.name)], var.get_dims())) + new_dims = list(map(lambda x: self.__dim_map[self.pathname(self.group(x), self.name(x))], self.get_dims(var))) # Write variable @@ -498,15 +533,15 @@ def flatten_variable(self, var): new_var = self.__output_file.createVariable( new_name, - var.dtype, + self.dtype(var), new_dims, zlib=False, complevel=4, shuffle=True, fletcher32=False, - contiguous=var.chunking() == "contiguous", - chunksizes=var.chunking() if var.chunking() != "contiguous" else None, - endian=var.endian(), + contiguous=self.contiguous(var), + chunksizes=self.chunksizes(var), + endian=self.endian(var), least_significant_digit=None, fill_value=None) @@ -525,7 +560,8 @@ def flatten_variable(self, var): self.copy_var_by_slices(new_var, var, copy_slice) # Copy attributes - new_var.setncatts(var.__dict__) +# new_var.setncatts(var.__dict__) + new_var.setncatts(self.attrs(var)) # Store new name in dict for resolving references later # self.__var_map[self.pathname(var.group(), var.name)] = new_name @@ -684,7 +720,7 @@ def resolve_reference_post_processing(self, absolute_ref, orig_ref, orig_var, at # If variables refs are limited to coordinate variable, additional check # and (("coordinates" not in orig_var.ncattrs() or orig_ref not in orig_var.coordinates) if ref_type == "variable" and attr.limit_to_scalar_coordinates \ - and (("coordinates" not in self.ncattrs(orig_var) or orig_ref not in self.attr(orig_var, coordinates)) + and (("coordinates" not in self.ncattrs(orig_var) or orig_ref not in self.getncattr(orig_var, "coordinates")) or self._Flattener__input_file[absolute_ref].ndim > 0): logging.info(" coordinate reference to '{}' is not a SCALAR COORDINATE variable. 
" "Assumed to be a standard name.".format(orig_ref)) @@ -789,7 +825,6 @@ def resolve_references(self, var, old_var): :param var: flattened variable in which references should be renamed with absolute references :param old_var: original variable (in group structure) """ - print(_AttributeProperties) for attr in _AttributeProperties: if attr.name in var.__dict__: # attr_value = var.getncattr(attr.name) @@ -926,15 +961,13 @@ def generate_flattened_name(self, input_group, orig_name): """ # If element is at root: no change # if input_group.parent is None: - print (999, orig_name) - print (input_group) - print (dir(input_group)) if self.parent(input_group) is None: new_name = orig_name # If element in child group, concatenate group path and element name else: - full_name = self.convert_path_to_valid_name(input_group.path) + self.__new_separator + orig_name +# full_name = self.convert_path_to_valid_name(input_group.path) + self.__new_separator + orig_name + full_name = self.convert_path_to_valid_name(self.path(input_group)) + self.__new_separator + orig_name new_name = full_name # If resulting name is too long, hash group path diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 4ca03f5f4..6e488b987 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -519,7 +519,7 @@ def file_open(self, filename, flatten=True, verbose=None): # nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) try: -# raise OSError() + #raise OSError() nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) HDF = True except OSError: @@ -533,10 +533,17 @@ def file_open(self, filename, flatten=True, verbose=None): except Exception as error: raise Exception(f"{error}: {filename}") + g["original_HDF"] = HDF + g["original_netCDF"] = netCDF # ------------------------------------------------------------ # If the file has a group structure then flatten it (CF>=1.8) # ------------------------------------------------------------ + if HDF: + print ("Opened with h5netcdf") + else: + print ("Opened with netCDF4") + if flatten and nc.groups: #if HDF: # # TODOHDF: Can't yet use HDF access to process groups @@ -581,15 +588,12 @@ def file_open(self, filename, flatten=True, verbose=None): nc = flat_nc + netCDF = True + HDF = False + g["has_groups"] = True g["flat_files"].append(flat_file) - - - if HDF: - print ("Opened with h5netcdf") - else: - print ("Opened with netCDF4") - + g["netCDF"] = netCDF g["HDF"] = HDF g["nc"] = nc @@ -602,7 +606,7 @@ def cdl_to_netcdf(cls, filename): :Parameters: filename: `str` - The name of the CDL file. + The name sdef _netof the CDL file. 
:Returns: @@ -1298,6 +1302,7 @@ def read( variable_attributes[ncvar][attr] = value # print (attr, value, type(value)) + # variable_dimensions[ncvar] = tuple(variable.dimensions) variable_dimensions[ncvar] = tuple(self._file_variable_dimensions(variable)) variable_dataset[ncvar] = nc @@ -6095,10 +6100,10 @@ def _create_netcdfarray( if return_kwargs_only: return kwargs - if g['netCDF']: + if g['original_netCDF']: array = self.implementation.initialise_NetCDFArray(**kwargs) else: - # HDF + # h5netcdf array = self.implementation.initialise_HDFArray(**kwargs) return array, kwargs @@ -9953,10 +9958,10 @@ def _file_global_attributes(self): g = self.read_vars nc = g['nc'] if g['netCDF']: - # NetCDF + # netCDF4 return {attr: nc.getncattr(attr) for attr in nc.ncattrs()} - # HDF + # h5netcdf return nc.attrs def _file_dimensions(self): @@ -9984,17 +9989,17 @@ def _file_variable_attributes(self, var, names_only=False): g = self.read_vars if not names_only: if g['netCDF']: - # NetCDF + # netCDF4 return {attr: var.getncattr(attr) for attr in var.ncattrs()} - # HDF + # h5netcdf return var.attrs if g['netCDF']: - # NetCDF + # netCDF4 return var.ncattrs() - # HDF + # h5netcdf return list(var.attrs) def _file_variable_dimensions(self, var): @@ -10002,11 +10007,11 @@ def _file_variable_dimensions(self, var): def _file_variable_size(self, var): g = self.read_vars - if g['netCDF']: - # NetCDF + try: + # netCDF4 return var.size - - # HDF - return prod(var.shape) + except AttributeError: + # h5netcdf + return prod(var.shape) From 3061454acd28756d57698d0f8f5ae63b3c0c0b3a Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 24 Jan 2024 22:52:47 +0000 Subject: [PATCH 12/88] dev --- cfdm/data/hdfarray.py | 171 ++++--- cfdm/data/mixin/filearraymixin.py | 12 +- cfdm/data/mixin/netcdffilemixin.py | 19 +- cfdm/data/netcdfarray.py | 13 +- cfdm/flatten.py | 612 ++++++++++++++++++-------- cfdm/read_write/netcdf/netcdfread.py | 155 +++---- cfdm/read_write/netcdf/netcdfwrite.py | 2 +- cfdm/test/test_groups.py | 6 +- 8 files changed, 616 insertions(+), 374 deletions(-) diff --git a/cfdm/data/hdfarray.py b/cfdm/data/hdfarray.py index 3a0344a0a..d150e5e43 100644 --- a/cfdm/data/hdfarray.py +++ b/cfdm/data/hdfarray.py @@ -1,14 +1,18 @@ +import logging + import h5netcdf import netCDF4 import numpy as np from . import abstract from .mixin import FileArrayMixin, NetCDFFileMixin -from .numpyarray import NumpyArray _safecast = netCDF4.utils._safecast default_fillvals = netCDF4.default_fillvals +logger = logging.getLogger(__name__) + + class HDFArray(NetCDFFileMixin, FileArrayMixin, abstract.Array): """An underlying array stored in an HDF file. @@ -187,7 +191,7 @@ def __getitem__(self, indices): if groups: dataset = self._uuu(dataset, groups) - + # Get the variable by netCDF name variable = dataset.variables[address] self.variable = variable @@ -196,18 +200,18 @@ def __getitem__(self, indices): if mask: self.scale = True self.always_mask = False - self._isvlen = variable.dtype == np.dtype('O') + self._isvlen = variable.dtype == np.dtype("O") if not self._isvlen: array = self._mask(array) array = self._scale(array) - + # Set the units, if they haven't been set already. 
self._set_units(variable) self.close(dataset0) del dataset, dataset0 del self.variable - + string_type = isinstance(array, str) if string_type: # -------------------------------------------------------- @@ -222,9 +226,14 @@ def __getitem__(self, indices): array = self._process_string_and_char(array) return array - + def _check_safecast(self, attname): - """Check to see that variable attribute exists can can be safely cast to variable data type.""" + """ToDOHDF. + + Check to see that variable attribute exists can can be safely + cast to variable data type. + + """ attrs = self.variable.attrs if attname in attrs: attvalue = attrs[attname] @@ -240,7 +249,7 @@ def _check_safecast(self, attname): is_safe = False else: is_safe = _safecast(att, atta) - + if not is_safe: logger.warn( f"WARNING: {attname} not used since it cannot " @@ -250,33 +259,33 @@ def _check_safecast(self, attname): return is_safe def _mask(self, data): - """TODOHDF""" + """TODOHDF.""" # Private function for creating a masked array, masking # missing_values and/or _FillValues. - + attrs = self.variable.attrs - is_unsigned = attrs.get('_Unsigned', False) in ("true", "True") - is_unsigned_int = is_unsigned and data.dtype.kind == 'i' + is_unsigned = attrs.get("_Unsigned", False) in ("true", "True") + is_unsigned_int = is_unsigned and data.dtype.kind == "i" dtype = data.dtype if self.scale and is_unsigned_int: # Only do this if autoscale option is on. dtype_unsigned_int = f"{dtype.byteorder}u{dtype.itemsize}" data = data.view(dtype_unsigned_int) - + totalmask = np.zeros(data.shape, np.bool_) fill_value = None - safe_missval = self._check_safecast('missing_value') + safe_missval = self._check_safecast("missing_value") if safe_missval: mval = np.array(self.missing_value, self.dtype) if self.scale and is_unsigned_int: mval = mval.view(dtype_unsigned_int) - + # create mask from missing values. mvalmask = np.zeros(data.shape, np.bool_) - if mval.shape == (): # mval a scalar. - mval = (mval,) # make into iterable. - + if mval.shape == (): # mval a scalar. + mval = (mval,) # make into iterable. + for m in mval: # is scalar missing value a NaN? try: @@ -284,50 +293,50 @@ def _mask(self, data): except TypeError: # isnan fails on some dtypes mvalisnan = False - + if mvalisnan: mvalmask += np.isnan(data) else: mvalmask += data == m - + if mvalmask.any(): # Set fill_value for masked array to missing_value (or # 1st element if missing_value is a vector). fill_value = mval[0] totalmask += mvalmask - + # set mask=True for data == fill value - safe_fillval = self._check_safecast('_FillValue') + safe_fillval = self._check_safecast("_FillValue") if safe_fillval: fval = np.array(self._FillValue, self.dtype) if self.scale and is_unsigned_int: fval = fval.view(dtype_unsigned_int) - + # is _FillValue a NaN? try: fvalisnan = np.isnan(fval) except Exception: # isnan fails on some dtypes fvalisnan = False - + if fvalisnan: mask = np.isnan(data) elif (data == fval).any(): - mask = data==fval + mask = data == fval else: mask = None if mask is not None: if fill_value is None: fill_value = fval - + totalmask += mask else: # Don't return masked array if variable filling is disabled. no_fill = 0 -# with nogil: -# ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) -# _ensure_nc_success(ierr) + # with nogil: + # ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) + # _ensure_nc_success(ierr) # if no_fill is not 1, and not a byte variable, then use # default fill value. 
from @@ -344,18 +353,21 @@ def _mask(self, data): # as a missing value unless a _FillValue attribute is set # explicitly." (do this only for non-vlens, since vlens # don't have a default _FillValue) - if not self._isvlen and (no_fill != 1 or dtype.str[1:] not in ('u1','i1')): + if not self._isvlen and ( + no_fill != 1 or dtype.str[1:] not in ("u1", "i1") + ): fillval = np.array(default_fillvals[dtype.str[1:]], dtype) has_fillval = data == fillval # if data is an array scalar, has_fillval will be a # boolean. in that case convert to an array. - if type(has_fillval) == bool: + # if type(has_fillval) == bool: + if isinstance(has_fillval, bool): has_fillval = np.asarray(has_fillval) - + if has_fillval.any(): if fill_value is None: fill_value = fillval - + mask = data == fillval totalmask += mask @@ -366,9 +378,9 @@ def _mask(self, data): # valid_min, valid_max. No special treatment of byte data as # described at # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). - safe_validrange = self._check_safecast('valid_range') - safe_validmin = self._check_safecast('valid_min') - safe_validmax = self._check_safecast('valid_max') + safe_validrange = self._check_safecast("valid_range") + safe_validmin = self._check_safecast("valid_min") + safe_validmax = self._check_safecast("valid_max") if safe_validrange and self.valid_range.size == 2: validmin = np.array(self.valid_range[0], self.dtype) validmax = np.array(self.valid_range[1], self.dtype) @@ -377,8 +389,8 @@ def _mask(self, data): validmin = np.array(self.valid_min, self.dtype) if safe_validmax: - validmax = numpy.array(self.valid_max, self.dtype) - + validmax = np.array(self.valid_max, self.dtype) + if validmin is not None and self.scale and is_unsigned_int: validmin = validmin.view(dtype_unsigned_int) @@ -396,15 +408,14 @@ def _mask(self, data): fval = np.array(self._FillValue, dtype) else: k = dtype.str[1:] - if k in ('u1','i1'): + if k in ("u1", "i1"): fval = None else: fval = np.array(default_fillvals[k], dtype) - - - if self.dtype.kind != 'S': + + if self.dtype.kind != "S": # Don't set mask for character data - + # Setting valid_min/valid_max to the _FillVaue is too # surprising for many users (despite the netcdf docs # attribute best practices suggesting clients should do @@ -422,49 +433,52 @@ def _mask(self, data): # masked array. if fill_value is None: fill_value = default_fillvals[dtype.str[1:]] - + # Create masked array with computed mask masked_values = bool(totalmask.any()) if masked_values: - data = np.ma.masked_array(data, mask=totalmask, fill_value=fill_value) + data = np.ma.masked_array( + data, mask=totalmask, fill_value=fill_value + ) else: # Always return masked array, if no values masked. data = np.ma.masked_array(data) - + # Scalar array with mask=True should be converted to - # numpy.ma.MaskedConstant to be consistent with slicing + # np.ma.MaskedConstant to be consistent with slicing # behavior of masked arrays. if data.shape == () and data.mask.all(): # Return a scalar numpy masked constant not a 0-d masked - # array, so that data == numpy.ma.masked. + # array, so that data == np.ma.masked. data = data[()] - + elif not self.always_mask and not masked_values: # Return a regular numpy array if requested and there are # no missing values data = np.array(data, copy=False) return data - + def _scale(self, data): + """TODOHDF.""" # If variable has scale_factor and add_offset attributes, # apply them. 
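         # (Editorial sketch, not part of this commit: the branches
         # below implement the CF/NUG packing convention, in which a
         # stored value x is unpacked as
         #
         #     x * scale_factor + add_offset
         #
         # using whichever of the two attributes is present.  For
         # example, with scale_factor=0.01 and add_offset=273.15, a
         # packed int16 value of 1250 unpacks to 12.5 + 273.15 =
         # 285.65.)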
attrs = self.variable.attrs - scale_factor = attrs.get('scale_factor') - add_offset = attrs.get('add_offset') + scale_factor = attrs.get("scale_factor") + add_offset = attrs.get("add_offset") try: if scale_factor is not None: float(scale_factor) - + if add_offset is not None: float(add_offset) - except: + except ValueError: logging.warn( "invalid scale_factor or add_offset attribute, " "no unpacking done..." ) return data - + if scale_factor is not None and add_offset is not None: if add_offset != 0.0 or scale_factor != 1.0: data = data * scale_factor + add_offset @@ -476,19 +490,18 @@ def _scale(self, data): elif add_offset is not None and add_offset != 0.0: # If variable has only add_offset attribute, add offset. data = data + add_offset - - return data - - def _get_attr(self, var, attr): - """TODOHDF - - .. versionadded:: (cfdm) HDFVER - - :Parameters: - """ + return data - return var.attrs[attr] + # def _get_attr(self, var, attr): + # """TODOHDF. + # + # .. versionadded:: (cfdm) HDFVER + # + # :Parameters: + # + # """ + # return var.attrs[attr] def close(self, dataset): """Close the dataset containing the data. @@ -552,27 +565,8 @@ def get_groups(self, address): out = address.split("/")[1:] return out[:-1], out[-1] - def _fff(self, ): - u = urlparse(filename) - if u.scheme == "s3": - # Create an openable s3 file object - endpoint_url = f"https://{u.netloc}" - uri = u.path[1:] - s3 = g['s3'] - if s3 is None: - s3 = {"anon": True, - "client_kwargs": {'endpoint_url': endpoint_url}} - - fs = S3FileSystem(**s3) - filename = fs.open(uri, 'rb') - if is_log_level_detail(logger): - logger.debug( - f" s3: s3fs.S3FileSystem options: {s3}\n" - ) # pragma: no cover - - def open(self, **kwargs): - """Return a file object for the dataset and the variable address. + """Return a dataset file object and address. When multiple files have been provided an attempt is made to open each one, in the order stored, and a file object is @@ -585,5 +579,6 @@ def open(self, **kwargs): within the file. """ - return super().open(h5netcdf.File, mode="r", - decode_vlen_strings=True, **kwargs) + return super().open( + h5netcdf.File, mode="r", decode_vlen_strings=True, **kwargs + ) diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index bab92c0c4..8339179d2 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -179,7 +179,7 @@ def get_formats(self): return (self.get_format(),) * len(self.get_filenames()) def open(self, func, *args, **kwargs): - """Return a file object for the dataset and the variable address. + """Return a dataset file object and address. 
When multiple files have been provided an attempt is made to open each one, in the order stored, and a file object is @@ -214,11 +214,13 @@ def open(self, func, *args, **kwargs): # Create an openable s3 file object endpoint_url = f"https://{url.netloc}" uri = url.path[1:] - s3 = {"anon": True, - "client_kwargs": {'endpoint_url': endpoint_url}} + s3 = { + "anon": True, + "client_kwargs": {"endpoint_url": endpoint_url}, + } fs = S3FileSystem(**s3) - filename = fs.open(uri, 'rb') - + filename = fs.open(uri, "rb") + try: nc = func(filename, *args, **kwargs) except FileNotFoundError: diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py index 6f6e74238..4ac3b5a70 100644 --- a/cfdm/data/mixin/netcdffilemixin.py +++ b/cfdm/data/mixin/netcdffilemixin.py @@ -1,8 +1,11 @@ import netCDF4 import numpy as np +from ..numpyarray import NumpyArray + + class NetCDFFileMixin: - """Mixin class TODOHDF + """Mixin class TODOHDF. .. versionadded:: (cfdm) HDFVER @@ -25,7 +28,7 @@ def __str__(self): return f"{self.get_filename(None)}, {self.get_address()}" def _get_attr(self, var, attr): - """TODOHDF + """TODOHDF. .. versionadded:: (cfdm) HDFVER @@ -37,7 +40,7 @@ def _get_attr(self, var, attr): ) # pragma: no cover def _process_string_and_char(self, array): - """TODOHDF""" + """TODOHDF.""" string_type = isinstance(array, str) kind = array.dtype.kind if not string_type and kind in "SU": @@ -61,7 +64,7 @@ def _process_string_and_char(self, array): array = np.ma.where(array == "", np.ma.masked, array) return array - + def _set_units(self, var): """The units and calendar properties. @@ -108,10 +111,10 @@ def _set_units(self, var): return units, calendar def _uuu(self, dataset, groups): - for g in groups: #[:-1]: + for g in groups: # [:-1]: dataset = dataset.groups[g] - - return dataset #dataset = dataset.groups[groups[-1]] + + return dataset # dataset = dataset.groups[groups[-1]] @property def array(self): @@ -140,7 +143,7 @@ def close(self, dataset): :Parameters: - dataset: + dataset: The dataset to be be closed. :Returns: diff --git a/cfdm/data/netcdfarray.py b/cfdm/data/netcdfarray.py index 062c03d0d..47980399d 100644 --- a/cfdm/data/netcdfarray.py +++ b/cfdm/data/netcdfarray.py @@ -3,7 +3,6 @@ from . import abstract from .mixin import FileArrayMixin, NetCDFFileMixin -from .numpyarray import NumpyArray class NetCDFArray(NetCDFFileMixin, FileArrayMixin, abstract.Array): @@ -210,10 +209,10 @@ def __getitem__(self, indices): if groups: # Traverse the group structure, if there is one (CF>=1.8). netcdf = self._uuu(netcdf, groups) -# for g in groups[:-1]: -# netcdf = netcdf.groups[g] -# -# netcdf = netcdf.groups[groups[-1]] + # for g in groups[:-1]: + # netcdf = netcdf.groups[g] + # + # netcdf = netcdf.groups[groups[-1]] if isinstance(address, str): # Get the variable by netCDF name @@ -266,7 +265,7 @@ def __str__(self): return f"{self.get_filename(None)}, {self.get_address()}" def _get_attr(self, var, attr): - """TODOHDF + """TODOHDF. .. versionadded:: (cfdm) HDFVER @@ -338,7 +337,7 @@ def close(self, dataset): dataset.close() def open(self): - """Return a file object for the dataset and the variable address. + """Return a dataset file object and address. 
When multiple files have been provided an attempt is made to open each one, in the order stored, and a file object is diff --git a/cfdm/flatten.py b/cfdm/flatten.py index 983038138..c806a7a20 100644 --- a/cfdm/flatten.py +++ b/cfdm/flatten.py @@ -23,18 +23,21 @@ import collections import hashlib import logging -import os + +# import os import re import warnings from enum import Enum -from netCDF4 import Dataset -#from h5netcdf import File as Dataset - +# from netCDF4 import Dataset +# from h5netcdf import File as Dataset -def flatten(input_ds, output_ds, lax_mode=False, _copy_data=True, copy_slices=None): - """Flatten an input NetCDF dataset and write the result in an output NetCDF dataset. +def flatten( + input_ds, output_ds, lax_mode=False, _copy_data=True, copy_slices=None +): + """Flatten an input NetCDF dataset and write the result in an output + NetCDF dataset. For variable that are too big to fit in memory, the optional "copy_slices" input allows to copy some or all of the variables in slices. @@ -51,38 +54,50 @@ def flatten(input_ds, output_ds, lax_mode=False, _copy_data=True, copy_slices=No using default slice value, or a custom slicing shap in the form of a tuple of the same dimension as the variable (for instance (1000,2000,1500,) for a 3-dimensional variable). If a variable from the Dataset is not contained in the dict, it will not be sliced and copied normally. + """ - _Flattener(input_ds, lax_mode, _copy_data=_copy_data, copy_slices=copy_slices).flatten(output_ds) + _Flattener( + input_ds, lax_mode, _copy_data=_copy_data, copy_slices=copy_slices + ).flatten(output_ds) def parse_var_attr(input_str): """Parse variable attribute of any form into a dict: - * 'time' -> OrderedDict([('time', [])]) - * 'lat lon' -> OrderedDict([('lat', []), ('lon', [])]) - * 'area: time volume: lat lon' -> OrderedDict([('area', ['time']), ('volume', ['lat', 'lon'])]) + * 'time' -> OrderedDict([('time', [])]) + * 'lat lon' -> OrderedDict([('lat', []), ('lon', [])]) + * 'area: time volume: lat lon' -> OrderedDict([('area', ['time']), ('volume', ['lat', 'lon'])]) + + :param input_str: string to parse + :return: parsed string in an OrderedDict - :param input_str: string to parse - :return: parsed string in an OrderedDict """ def subst(s): - """substitute tokens for WORD and SEP (space or end of string)""" - return s.replace('WORD', r'[A-Za-z0-9_#/.\(\)]+').replace( - 'SEP', r'(\s+|$)') + """substitute tokens for WORD and SEP (space or end of + string)""" + return s.replace("WORD", r"[A-Za-z0-9_#/.\(\)]+").replace( + "SEP", r"(\s+|$)" + ) # Regex for 'dict form': "k1: v1 v2 k2: v3" - pat_value = subst('(?PWORD)SEP') - pat_values = '({})*'.format(pat_value) - pat_mapping = (subst('(?PWORD):SEP(?P{})'.format(pat_values))) - pat_mapping_list = '({})+'.format(pat_mapping) + pat_value = subst("(?PWORD)SEP") + pat_values = "({})*".format(pat_value) + pat_mapping = subst( + "(?PWORD):SEP(?P{})".format(pat_values) + ) + pat_mapping_list = "({})+".format(pat_mapping) # Regex for 'list form': "v1 v2 v3" (including single-item form) - pat_list_item = (subst('(?PWORD)SEP')) - pat_list = '({})+'.format(pat_list_item) + pat_list_item = subst("(?PWORD)SEP") + pat_list = "({})+".format(pat_list_item) # Regex for any form: - pat_all = (subst('((?P{})|(?P{}))$'.format(pat_list, pat_mapping_list))) + pat_all = subst( + "((?P{})|(?P{}))$".format( + pat_list, pat_mapping_list + ) + ) m = re.match(pat_all, input_str) @@ -90,21 +105,28 @@ def subst(s): out = collections.OrderedDict() if m is not None: - list_match = 
m.group('list') + list_match = m.group("list") # Parse as a list if list_match: for mapping in re.finditer(pat_list_item, list_match): - item = mapping.group('list_item') + item = mapping.group("list_item") out[item] = None # Parse as a dict: else: - mapping_list = m.group('mapping_list') + mapping_list = m.group("mapping_list") for mapping in re.finditer(pat_mapping, mapping_list): - term = mapping.group('mapping_name') - values = [value.group('value') for value in re.finditer(pat_value, mapping.group('values'))] + term = mapping.group("mapping_name") + values = [ + value.group("value") + for value in re.finditer( + pat_value, mapping.group("values") + ) + ] out[term] = values else: - raise ReferenceException("Error while parsing attribute value: '{}'".format(input_str)) + raise ReferenceException( + "Error while parsing attribute value: '{}'".format(input_str) + ) return out @@ -114,6 +136,7 @@ def generate_var_attr_str(d): :param d: dictionary :return: valid attribute string + """ parsed_list = [] for k, v in d.items(): @@ -122,13 +145,15 @@ def generate_var_attr_str(d): elif not v: parsed_list.append("{}:".format(k)) else: - parsed_list.append(k + ': ' + (' '.join(v))) - return ' '.join(parsed_list) + parsed_list.append(k + ": " + (" ".join(v))) + return " ".join(parsed_list) class _AttributeProperties(Enum): - """"Utility class containing the properties for each type of variable attribute, defining how contained references - to dimensions and variables should be parsed and processed.""" + """"Utility class containing the properties for each type of + variable attribute, defining how contained references to dimensions + and variables should be parsed and processed.""" + ancillary_variables = (0, (False, True, True, False, False, False, False)) bounds = (1, (False, True, True, False, False, False, False)) cell_measures = (2, (False, True, False, True, False, False, False)) @@ -148,7 +173,7 @@ class _AttributeProperties(Enum): cell_methods = (16, (2, 1, True, False, False, True, True)) def __init__(self, n, props): - """_AttributeProperties enum constructor + """_AttributeProperties enum constructor. :param n: enum id :param props: a tuple containing the attribute's properties (ref_to_dim, ref_to_var, resolve_key, resolve_value, @@ -162,6 +187,7 @@ def __init__(self, n, props): exception is raised if a reference cannot be resolved, and the standard name is used in place) * limit_to_scalar_coordinates: True if references to variables are only resolved if present as well in the 'coordinates' attributes of the variable, and they are scalar. + """ self.id = n self.ref_to_dim = props[0] @@ -174,12 +200,12 @@ def __init__(self, n, props): class _Flattener: - """Utility class contained the input file, the output file being flattened, and all the logic of the flattening - process. - """ + """Utility class contained the input file, the output file being + flattened, and all the logic of the flattening process.""" + __max_name_len = 256 - __default_separator = '/' - __new_separator = '__' + __default_separator = "/" + __new_separator = "__" __pathname_format = "{}/{}" __mapping_str_format = "{}: {}" __ref_not_found_error = "REF_NOT_FOUND" @@ -191,7 +217,8 @@ class _Flattener: __var_map_name = "__flattener_name_mapping_variables" def __init__(self, input_ds, lax_mode, _copy_data=True, copy_slices=None): - """Constructor. Initializes the Flattener class given the input file. + """Constructor. Initializes the Flattener class given the input + file. 
:param input_ds: input netcdf dataset :param lax_mode: if false (default), not resolving a reference halts the execution. If true, continue with warning. @@ -203,6 +230,7 @@ def __init__(self, input_ds, lax_mode, _copy_data=True, copy_slices=None): for using default slice value, or a custom slicing shape in the form of a tuple of the same dimension as the variable (for instance (1000,2000,1500,) for a 3-dimensional variable). If a variable from the Dataset is not contained in the dict, it will not be sliced and copied normally. + """ self.__attr_map_value = [] @@ -224,12 +252,12 @@ def attrs(self, variable): try: # h5netcdf return variable.attrs - except: + except AttributeError: # netCDF4 return { attr: variable.getncattr(attr) for attr in variable.ncattrs() } - + def chunksizes(self, variable): try: # netCDF4 @@ -239,15 +267,15 @@ def chunksizes(self, variable): except AttributeError: # h5netcdf return variable.chunks - + def contiguous(self, variable): try: # netCDF4 - return variable.chunking() == "contiguous", + return (variable.chunking() == "contiguous",) except AttributeError: # h5netcdf return variable.chunks is None - + def data_model(self, ds): """Return the netCDF data model version. @@ -261,7 +289,7 @@ def data_model(self, ds): return ds.data_model except AttributeError: # h5netcdf - return 'NETCDF4' + return "NETCDF4" def dtype(self, variable): out = variable.dtype @@ -269,7 +297,7 @@ def dtype(self, variable): out = str return out - + def endian(self, variable): try: # netCDF4 @@ -277,9 +305,10 @@ def endian(self, variable): except AttributeError: # h5netcdf return "native" - + def filepath(self, dataset): - """Return the file system path (or the opendap URL) for the Dataset. + """Return the file system path (or the opendap URL) for the + Dataset. :Returns: @@ -294,7 +323,7 @@ def filepath(self, dataset): return dataset.filename def get_dims(self, variable): - """Return + """Return. :Returns: @@ -337,9 +366,9 @@ def getncattr(self, x, attr): except AttributeError: # h5netcdf return x.attrs[attr] - + def group(self, x): - """Return a + """Return a. :Returns: @@ -352,7 +381,7 @@ def group(self, x): except AttributeError: # h5netcdf return x._parent - + def name(self, x): """Return the netCDF name, without its groups. @@ -364,10 +393,10 @@ def name(self, x): out = x.name if "/" in out: # h5netcdf - out = x.name.split('/')[-1] + out = x.name.split("/")[-1] return out - + def ncattrs(self, x): """Return netCDF attribute names. @@ -386,7 +415,6 @@ def ncattrs(self, x): except AttributeError: # h5netcdf return list(x.attrs) - def parent(self, group): """Return a simulated unix directory path to a group. @@ -400,7 +428,7 @@ def parent(self, group): return group.parent except AttributeError: return - + def path(self, group): """Return a simulated unix directory path to a group. @@ -420,17 +448,22 @@ def path(self, group): return "/" def flatten(self, output_ds): - """Flattens and write to output file + """Flattens and write to output file. :param output_ds: The dataset in which to store the flattened result. + """ -# or output_ds.filepath() == self.__input_file.filepath() \ -# or output_ds.data_model != 'NETCDF4': - if output_ds == self.__input_file \ - or self.filepath(output_ds) == self.filepath(self.__input_file) \ - or self.data_model(output_ds) != 'NETCDF4': - raise ValueError("Invalid inputs. 
Input and output datasets should be different, and output should be of " - "the 'NETCDF4' format.") + # or output_ds.filepath() == self.__input_file.filepath() \ + # or output_ds.data_model != 'NETCDF4': + if ( + output_ds == self.__input_file + or self.filepath(output_ds) == self.filepath(self.__input_file) + or self.data_model(output_ds) != "NETCDF4" + ): + raise ValueError( + "Invalid inputs. Input and output datasets should be different, and output should be of " + "the 'NETCDF4' format." + ) self.__output_file = output_ds @@ -438,12 +471,16 @@ def flatten(self, output_ds): self.process_group(self.__input_file) # Add name mapping attributes - self.__output_file.setncattr(self.__attr_map_name, self.__attr_map_value) + self.__output_file.setncattr( + self.__attr_map_name, self.__attr_map_value + ) self.__output_file.setncattr(self.__dim_map_name, self.__dim_map_value) self.__output_file.setncattr(self.__var_map_name, self.__var_map_value) # Browse flattened variables to rename references: - logging.info("Browsing flattened variables to rename references in attributes:") + logging.info( + "Browsing flattened variables to rename references in attributes:" + ) for var in self.__output_file.variables.values(): self.adapt_references(var) @@ -451,10 +488,11 @@ def process_group(self, input_group): """Flattens a given group to the output file. :param input_group: group to flatten + """ -# logging.info("Browsing group " + input_group.path) + # logging.info("Browsing group " + input_group.path) logging.info("Browsing group " + self.path(input_group)) -# for attr_name in input_group.ncattrs(): + # for attr_name in input_group.ncattrs(): for attr_name in self.ncattrs(input_group): self.flatten_attribute(input_group, attr_name) @@ -472,62 +510,100 @@ def flatten_attribute(self, input_group, attr_name): :param input_group: group containing the attribute to flatten :param attr_name: name of the attribute to flatten + """ -# logging.info(" Copying attribute {} from group {} to root".format(attr_name, input_group.path)) - logging.info(" Copying attribute {} from group {} to root".format(attr_name, self.path(input_group))) + # logging.info(" Copying attribute {} from group {} to root".format(attr_name, input_group.path)) + logging.info( + " Copying attribute {} from group {} to root".format( + attr_name, self.path(input_group) + ) + ) # Create new name new_attr_name = self.generate_flattened_name(input_group, attr_name) # Write attribute -# self.__output_file.setncattr(new_attr_name, input_group.getncattr(attr_name)) - self.__output_file.setncattr(new_attr_name, self.getncattr(input_group, attr_name)) + # self.__output_file.setncattr(new_attr_name, input_group.getncattr(attr_name)) + self.__output_file.setncattr( + new_attr_name, self.getncattr(input_group, attr_name) + ) # Store new naming for later and in mapping attribute - self.__attr_map_value.append(self.generate_mapping_str(input_group, attr_name, new_attr_name)) + self.__attr_map_value.append( + self.generate_mapping_str(input_group, attr_name, new_attr_name) + ) def flatten_dimension(self, dim): """Flattens a given dimension to the output file. 
:param dim: dimension to flatten + """ -# logging.info(" Copying dimension {} from group {} to root".format(dim.name, dim.group().path)) - logging.info(" Copying dimension {} from group {} to root".format(self.name(dim), self.path(self.group(dim)))) + # logging.info(" Copying dimension {} from group {} to root".format(dim.name, dim.group().path)) + logging.info( + " Copying dimension {} from group {} to root".format( + self.name(dim), self.path(self.group(dim)) + ) + ) # Create new name -# new_name = self.generate_flattened_name(dim.group(), dim.name) - new_name = self.generate_flattened_name(self.group(dim), self.name(dim)) - + # new_name = self.generate_flattened_name(dim.group(), dim.name) + new_name = self.generate_flattened_name( + self.group(dim), self.name(dim) + ) + # Write dimension - self.__output_file.createDimension(new_name, (len(dim), None)[dim.isunlimited()]) + self.__output_file.createDimension( + new_name, (len(dim), None)[dim.isunlimited()] + ) # Store new name in dict for resolving references later -# self.__dim_map[self.pathname(dim.group(), dim.name)] = new_name - self.__dim_map[self.pathname(self.group(dim), self.name(dim))] = new_name + # self.__dim_map[self.pathname(dim.group(), dim.name)] = new_name + self.__dim_map[ + self.pathname(self.group(dim), self.name(dim)) + ] = new_name # Add to name mapping attribute -# self.__dim_map_value.append(self.generate_mapping_str(dim.group(), dim.name, new_name)) - self.__dim_map_value.append(self.generate_mapping_str(self.group(dim), self.name(dim), new_name)) + # self.__dim_map_value.append(self.generate_mapping_str(dim.group(), dim.name, new_name)) + self.__dim_map_value.append( + self.generate_mapping_str( + self.group(dim), self.name(dim), new_name + ) + ) def flatten_variable(self, var): """Flattens a given variable to the output file. 
:param var: variable to flatten + """ -# logging.info(" Copying variable {} from group {} to root".format(var.name, var.group().path)) - logging.info(" Copying variable {} from group {} to root".format(self.name(var), self.path(self.group(var)))) + # logging.info(" Copying variable {} from group {} to root".format(var.name, var.group().path)) + logging.info( + " Copying variable {} from group {} to root".format( + self.name(var), self.path(self.group(var)) + ) + ) # Create new name -# new_name = self.generate_flattened_name(var.group(), self.name(var)) - new_name = self.generate_flattened_name(self.group(var), self.name(var)) + # new_name = self.generate_flattened_name(var.group(), self.name(var)) + new_name = self.generate_flattened_name( + self.group(var), self.name(var) + ) # Replace old by new dimension names -# new_dims = list(map(lambda x: self.__dim_map[self.pathname(x.group(), x.name)], var.get_dims())) + # new_dims = list(map(lambda x: self.__dim_map[self.pathname(x.group(), x.name)], var.get_dims())) - new_dims = list(map(lambda x: self.__dim_map[self.pathname(self.group(x), self.name(x))], self.get_dims(var))) + new_dims = list( + map( + lambda x: self.__dim_map[ + self.pathname(self.group(x), self.name(x)) + ], + self.get_dims(var), + ) + ) # Write variable -# fullname = self.pathname(var.group(), self.name(var)) + # fullname = self.pathname(var.group(), self.name(var)) fullname = self.pathname(self.group(var), self.name(var)) logging.info("create variable {} from {}".format(new_name, fullname)) @@ -543,16 +619,23 @@ def flatten_variable(self, var): chunksizes=self.chunksizes(var), endian=self.endian(var), least_significant_digit=None, - fill_value=None) + fill_value=None, + ) if self.__copy_data: # Find out slice method for variable and copy data - if self.__copy_slices is None or fullname not in self.__copy_slices: + if ( + self.__copy_slices is None + or fullname not in self.__copy_slices + ): # Copy data as a whole new_var[:] = var[:] elif self.__copy_slices[fullname] is None: # Copy with default slice size - copy_slice = tuple(self.__default_copy_slice_size // len(var.shape) for _ in range(len(var.shape))) + copy_slice = tuple( + self.__default_copy_slice_size // len(var.shape) + for _ in range(len(var.shape)) + ) self.copy_var_by_slices(new_var, var, copy_slice) else: # Copy in slices @@ -560,29 +643,38 @@ def flatten_variable(self, var): self.copy_var_by_slices(new_var, var, copy_slice) # Copy attributes -# new_var.setncatts(var.__dict__) + # new_var.setncatts(var.__dict__) new_var.setncatts(self.attrs(var)) # Store new name in dict for resolving references later -# self.__var_map[self.pathname(var.group(), var.name)] = new_name - self.__var_map[self.pathname(self.group(var), self.name(var))] = new_name + # self.__var_map[self.pathname(var.group(), var.name)] = new_name + self.__var_map[ + self.pathname(self.group(var), self.name(var)) + ] = new_name # Add to name mapping attribute -# self.__var_map_value.append(self.generate_mapping_str(var.group(), var.name, new_name)) - self.__var_map_value.append(self.generate_mapping_str(self.group(var), self.name(var), new_name)) + # self.__var_map_value.append(self.generate_mapping_str(var.group(), var.name, new_name)) + self.__var_map_value.append( + self.generate_mapping_str( + self.group(var), self.name(var), new_name + ) + ) # Resolve references in variable attributes and replace by absolute path: self.resolve_references(new_var, var) def increment_pos(self, pos, dim, copy_slice_shape, var_shape): - """Increment position 
vector in a variable along a dimension by the matching slice length along than dimension. - If end of the dimension is reached, recursively increment the next dimensions until a valid position is found. + """Increment position vector in a variable along a dimension by + the matching slice length along than dimension. If end of the + dimension is reached, recursively increment the next dimensions + until a valid position is found. :param pos: current position :param dim: dimension to be incremented :param copy_slice_shape: shape of the slice :param var_shape: shape of the variable :return True if a valid position is found within the variable, False otherwise + """ # Try to increment dimension pos[dim] += copy_slice_shape[dim] @@ -596,8 +688,10 @@ def increment_pos(self, pos, dim, copy_slice_shape, var_shape): return True # End of this dimension reached. Reset to 0 and try increment next one recursively elif dim_end_reached and not var_end_reached: - pos[:dim + 1] = [0 for j in range(dim + 1)] - return self.increment_pos(pos, dim + 1, copy_slice_shape, var_shape) + pos[: dim + 1] = [0 for j in range(dim + 1)] + return self.increment_pos( + pos, dim + 1, copy_slice_shape, var_shape + ) # End of this dimension reached, and no dimension to increment. Finish. else: return False @@ -608,9 +702,14 @@ def copy_var_by_slices(self, new_var, old_var, copy_slice_shape): :param new_var: new variable where to copy data :param old_var: variable where data should be copied from :param copy_slice_shape: shape of the slice + """ -# logging.info(" copying data of {} in {} slices".format(old_var.name, copy_slice_shape)) - logging.info(" copying data of {} in {} slices".format(self.name(old_var), copy_slice_shape)) + # logging.info(" copying data of {} in {} slices".format(old_var.name, copy_slice_shape)) + logging.info( + " copying data of {} in {} slices".format( + self.name(old_var), copy_slice_shape + ) + ) # Initial position vector pos = [0 for _ in range(len(copy_slice_shape))] @@ -619,22 +718,30 @@ def copy_var_by_slices(self, new_var, old_var, copy_slice_shape): var_end_reached = False while not var_end_reached: # Create current slice - current_slice = tuple(slice(pos[dim_i], min(old_var.shape[dim_i], pos[dim_i] + dim_l)) for dim_i, dim_l in - enumerate(copy_slice_shape)) + current_slice = tuple( + slice( + pos[dim_i], min(old_var.shape[dim_i], pos[dim_i] + dim_l) + ) + for dim_i, dim_l in enumerate(copy_slice_shape) + ) # Copy data in slice new_var[current_slice] = old_var[current_slice] # Get next position - var_end_reached = not self.increment_pos(pos, 0, copy_slice_shape, old_var.shape) + var_end_reached = not self.increment_pos( + pos, 0, copy_slice_shape, old_var.shape + ) def resolve_reference(self, orig_ref, orig_var, attr): - """Resolve the absolute path to a coordinate variable within the group structure. + """Resolve the absolute path to a coordinate variable within the + group structure. :param orig_ref: reference to resolve :param orig_var: variable originally containing the reference :param attr: _AttributeProperties object enum item to know if ref to dim or var :return: absolute path to the reference + """ ref = orig_ref absolute_ref = None @@ -644,7 +751,7 @@ def resolve_reference(self, orig_ref, orig_var, attr): resolve_dim_or_var = attr.ref_to_dim > attr.ref_to_var # Resolve var (resp. dim) if resolving as dim (resp. 
var) failed
-        resolve_alt = (attr.ref_to_dim and attr.ref_to_var)
+        resolve_alt = attr.ref_to_dim and attr.ref_to_var
 
         # Reference is already given by absolute path
         if ref.startswith(self.__default_separator):
@@ -657,85 +764,132 @@ def resolve_reference(self, orig_ref, orig_var, attr):
 
             # First tentative as dim OR var
             ref_type = "dimension" if resolve_dim_or_var else "variable"
-#            absolute_ref = self.search_by_relative_path(orig_ref, orig_var.group(), resolve_dim_or_var)
-            absolute_ref = self.search_by_relative_path(orig_ref, self.group(orig_var), resolve_dim_or_var)
+            # absolute_ref = self.search_by_relative_path(orig_ref, orig_var.group(), resolve_dim_or_var)
+            absolute_ref = self.search_by_relative_path(
+                orig_ref, self.group(orig_var), resolve_dim_or_var
+            )
 
             # If failed and alternative possible, second tentative
             if absolute_ref is None and resolve_alt:
-                ref_type = "dimension" if not resolve_dim_or_var else "variable"
-#                absolute_ref = self.search_by_relative_path(orig_ref, orig_var.group(), not resolve_dim_or_var)
-                absolute_ref = self.search_by_relative_path(orig_ref, self.groupp(orig_var), not resolve_dim_or_var)
+                ref_type = (
+                    "dimension" if not resolve_dim_or_var else "variable"
+                )
+                # absolute_ref = self.search_by_relative_path(orig_ref, orig_var.group(), not resolve_dim_or_var)
+                absolute_ref = self.search_by_relative_path(
+                    orig_ref, self.group(orig_var), not resolve_dim_or_var
+                )
 
         # Reference is to be searched by proximity
         else:
             method = " proximity"
-            absolute_ref, ref_type = self.resolve_reference_proximity(ref, resolve_dim_or_var, resolve_alt, orig_var,
-                                                                      attr)
+            absolute_ref, ref_type = self.resolve_reference_proximity(
+                ref, resolve_dim_or_var, resolve_alt, orig_var, attr
+            )
 
         # Post-search checks and return result
-        return self.resolve_reference_post_processing(absolute_ref, orig_ref, orig_var, attr, ref_type, method)
-
-    def resolve_reference_proximity(self, ref, resolve_dim_or_var, resolve_alt, orig_var, attr):
-        """Resolve reference: search by proximity
-        """
+        return self.resolve_reference_post_processing(
+            absolute_ref, orig_ref, orig_var, attr, ref_type, method
+        )
+
+    def resolve_reference_proximity(
+        self, ref, resolve_dim_or_var, resolve_alt, orig_var, attr
+    ):
+        """Resolve reference: search by proximity."""
         # First tentative as dim OR var
         ref_type = "dimension" if resolve_dim_or_var else "variable"
-#        resolved_var = self.search_by_proximity(ref, orig_var.group(), resolve_dim_or_var, False,
-#                                                attr.stop_at_local_apex)
-        resolved_var = self.search_by_proximity(ref, self.group(orig_var), resolve_dim_or_var, False,
-                                                attr.stop_at_local_apex)
+        # resolved_var = self.search_by_proximity(ref, orig_var.group(), resolve_dim_or_var, False,
+        #                                attr.stop_at_local_apex)
+        resolved_var = self.search_by_proximity(
+            ref,
+            self.group(orig_var),
+            resolve_dim_or_var,
+            False,
+            attr.stop_at_local_apex,
+        )
 
         # If failed and alternative possible, second tentative
         if resolved_var is None and resolve_alt:
             ref_type = "dimension" if not resolve_dim_or_var else "variable"
-#            resolved_var = self.search_by_proximity(ref, orig_var.group(), not resolve_dim_or_var, False,
-#                                                    attr.stop_at_local_apex)
-            resolved_var = self.search_by_proximity(ref, self.group(orig_var), not resolve_dim_or_var, False,
-                                                    attr.stop_at_local_apex)
+            # resolved_var = self.search_by_proximity(ref, orig_var.group(), not resolve_dim_or_var, False,
+            #                                   attr.stop_at_local_apex)
+            resolved_var = self.search_by_proximity(
+                ref,
+                self.group(orig_var),
+                not resolve_dim_or_var,
+                False,
+                
attr.stop_at_local_apex, + ) # If found, create ref string if resolved_var is not None: -# return self.pathname(resolved_var.group(), resolved_var.name), ref_type - return self.pathname(self.group(resolved_var), self.name(resolved_var)), ref_type + # return self.pathname(resolved_var.group(), resolved_var.name), ref_type + return ( + self.pathname( + self.group(resolved_var), self.name(resolved_var) + ), + ref_type, + ) else: return None, "" - def resolve_reference_post_processing(self, absolute_ref, orig_ref, orig_var, attr, ref_type, method): - """Post-processing operations after resolving reference - """ + def resolve_reference_post_processing( + self, absolute_ref, orig_ref, orig_var, attr, ref_type, method + ): + """Post-processing operations after resolving reference.""" # If not found and accept standard name, assume standard name if absolute_ref is None and attr.accept_standard_names: - logging.info(" coordinate reference to '{}' not resolved. Assumed to be a standard name.".format(orig_ref)) + logging.info( + " coordinate reference to '{}' not resolved. Assumed to be a standard name.".format( + orig_ref + ) + ) ref_type = "standard_name" absolute_ref = orig_ref # Else if not found, raise exception elif absolute_ref is None: -# absolute_ref = self.handle_reference_error(orig_ref, orig_var.group().path) - absolute_ref = self.handle_reference_error(orig_ref, self.path(self.group(orig_var))) + # absolute_ref = self.handle_reference_error(orig_ref, orig_var.group().path) + absolute_ref = self.handle_reference_error( + orig_ref, self.path(self.group(orig_var)) + ) # If found: else: - logging.info(" {} coordinate reference to {} '{}' resolved as '{}'" - .format(method, ref_type, orig_ref, absolute_ref)) + logging.info( + " {} coordinate reference to {} '{}' resolved as '{}'".format( + method, ref_type, orig_ref, absolute_ref + ) + ) # If variables refs are limited to coordinate variable, additional check -# and (("coordinates" not in orig_var.ncattrs() or orig_ref not in orig_var.coordinates) - if ref_type == "variable" and attr.limit_to_scalar_coordinates \ - and (("coordinates" not in self.ncattrs(orig_var) or orig_ref not in self.getncattr(orig_var, "coordinates")) - or self._Flattener__input_file[absolute_ref].ndim > 0): - logging.info(" coordinate reference to '{}' is not a SCALAR COORDINATE variable. " - "Assumed to be a standard name.".format(orig_ref)) + # and (("coordinates" not in orig_var.ncattrs() or orig_ref not in orig_var.coordinates) + if ( + ref_type == "variable" + and attr.limit_to_scalar_coordinates + and ( + ( + "coordinates" not in self.ncattrs(orig_var) + or orig_ref not in self.getncattr(orig_var, "coordinates") + ) + or self._Flattener__input_file[absolute_ref].ndim > 0 + ) + ): + logging.info( + " coordinate reference to '{}' is not a SCALAR COORDINATE variable. " + "Assumed to be a standard name.".format(orig_ref) + ) absolute_ref = orig_ref # Return result return absolute_ref def search_by_relative_path(self, ref, current_group, search_dim): - """Resolve the absolute path to a reference within the group structure, using search by relative path. + """Resolve the absolute path to a reference within the group + structure, using search by relative path. 
:param ref: reference to resolve :param current_group: current group where searching :param search_dim: if true, search references to dimensions, if false, search references to variables :return: absolute path to the coordinate + """ # Go up parent groups while ref.startswith("../"): @@ -753,14 +907,26 @@ def search_by_relative_path(self, ref, current_group, search_dim): return None # Get variable or dimension - elt = current_group.dimensions[ref_split[-1]] if search_dim else current_group.variables[ref_split[-1]] + elt = ( + current_group.dimensions[ref_split[-1]] + if search_dim + else current_group.variables[ref_split[-1]] + ) # Get absolute reference -# return self.pathname(elt.group(), elt.name) + # return self.pathname(elt.group(), elt.name) return self.pathname(self.group(elt), self.name(elt)) - def search_by_proximity(self, ref, current_group, search_dim, local_apex_reached, is_coordinate_variable): - """Resolve the absolute path to a reference within the group structure, using search by proximity. + def search_by_proximity( + self, + ref, + current_group, + search_dim, + local_apex_reached, + is_coordinate_variable, + ): + """Resolve the absolute path to a reference within the group + structure, using search by proximity. First search up in the hierarchy for the reference, until root group is reached. If coordinate variable, search until local apex is reached, Then search down in siblings. @@ -771,14 +937,19 @@ def search_by_proximity(self, ref, current_group, search_dim, local_apex_reached :param local_apex_reached: False initially, until apex is reached. :param is_coordinate_variable: true, if looking for a coordinate variable :return: absolute path to the coordinate + """ - dims_or_vars = current_group.dimensions if search_dim else current_group.variables + dims_or_vars = ( + current_group.dimensions if search_dim else current_group.variables + ) # Found in current group if ref in dims_or_vars.keys(): return dims_or_vars[ref] - local_apex_reached = local_apex_reached or ref in current_group.dimensions.keys() + local_apex_reached = ( + local_apex_reached or ref in current_group.dimensions.keys() + ) # Check if has to continue looking in parent group # - normal search: continue until root is reached @@ -790,15 +961,25 @@ def search_by_proximity(self, ref, current_group, search_dim, local_apex_reached # Search up if not top_reached: - return self.search_by_proximity(ref, current_group.parent, search_dim, local_apex_reached, - is_coordinate_variable) + return self.search_by_proximity( + ref, + current_group.parent, + search_dim, + local_apex_reached, + is_coordinate_variable, + ) # If coordinate variable and local apex reached, search down in siblings elif is_coordinate_variable and local_apex_reached: found_elt = None for child_group in current_group.groups.values(): - found_elt = self.search_by_proximity(ref, child_group, search_dim, local_apex_reached, - is_coordinate_variable) + found_elt = self.search_by_proximity( + ref, + child_group, + search_dim, + local_apex_reached, + is_coordinate_variable, + ) if found_elt is not None: break return found_elt @@ -808,11 +989,13 @@ def search_by_proximity(self, ref, current_group, search_dim, local_apex_reached return None def __escape_index_error(self, match, group_name): - """Return the group in a match if it exists, an empty string otherwise. + """Return the group in a match if it exists, an empty string + otherwise. 
:param match: regex match :param group_name: group name :return: match group + """ try: return match.group(group_name) @@ -820,14 +1003,16 @@ def __escape_index_error(self, match, group_name): return "" def resolve_references(self, var, old_var): - """In a given variable, replace all references to other variables in its attributes by absolute references. + """In a given variable, replace all references to other + variables in its attributes by absolute references. :param var: flattened variable in which references should be renamed with absolute references :param old_var: original variable (in group structure) + """ for attr in _AttributeProperties: if attr.name in var.__dict__: -# attr_value = var.getncattr(attr.name) + # attr_value = var.getncattr(attr.name) attr_value = self.getncattr(var, attr.name) # Parse attribute value parsed_attr = parse_var_attr(attr_value) @@ -836,25 +1021,40 @@ def resolve_references(self, var, old_var): resolved_parsed_attr = collections.OrderedDict() for k, v in parsed_attr.items(): - new_k = self.resolve_reference(k, old_var, attr) if attr.resolve_key else k - - new_v = ([self.resolve_reference(x, old_var, attr) for x in parsed_attr[k]] - if attr.resolve_value and parsed_attr[k] is not None else parsed_attr[k]) + new_k = ( + self.resolve_reference(k, old_var, attr) + if attr.resolve_key + else k + ) + + new_v = ( + [ + self.resolve_reference(x, old_var, attr) + for x in parsed_attr[k] + ] + if attr.resolve_value and parsed_attr[k] is not None + else parsed_attr[k] + ) resolved_parsed_attr[new_k] = new_v # Re-generate attribute value string with resolved references - var.setncattr(attr.name, generate_var_attr_str(resolved_parsed_attr)) + var.setncattr( + attr.name, generate_var_attr_str(resolved_parsed_attr) + ) def adapt_references(self, var): - """In a given variable, replace all references to variables in attributes by references to the new names in the - flattened NetCDF. All references have to be already resolved as absolute references. + """In a given variable, replace all references to variables in + attributes by references to the new names in the flattened + NetCDF. All references have to be already resolved as absolute + references. 
:param var: flattened variable in which references should be renamed with new names + """ for attr in _AttributeProperties: if attr.name in var.__dict__: - #attr_value = var.getncattr(attr.name) + # attr_value = var.getncattr(attr.name) attr_value = self.getncattr(var, attr.name) # Parse attribute value parsed_attr = parse_var_attr(attr_value) @@ -864,24 +1064,33 @@ def adapt_references(self, var): for k, v in parsed_attr.items(): new_k = self.adapt_name(k, attr) if attr.resolve_key else k - new_v = ([self.adapt_name(x, attr) for x in parsed_attr[k]] - if attr.resolve_value and parsed_attr[k] is not None else parsed_attr[k]) + new_v = ( + [self.adapt_name(x, attr) for x in parsed_attr[k]] + if attr.resolve_value and parsed_attr[k] is not None + else parsed_attr[k] + ) adapted_parsed_attr[new_k] = new_v new_attr_value = generate_var_attr_str(adapted_parsed_attr) var.setncattr(attr.name, new_attr_value) - logging.info(" attribute '{}' in '{}': references '{}' renamed as '{}'" - .format(attr.name, self.name(var), attr_value, new_attr_value)) -# .format(attr.name, var.name, attr_value, new_attr_value)) + logging.info( + " attribute '{}' in '{}': references '{}' renamed as '{}'".format( + attr.name, self.name(var), attr_value, new_attr_value + ) + ) + + # .format(attr.name, var.name, attr_value, new_attr_value)) def adapt_name(self, resolved_ref, attr): - """Return name of flattened reference. If not found, raise exception or continue warning. + """Return name of flattened reference. If not found, raise + exception or continue warning. :param resolved_ref: resolved reference to adapt :param attr: _AttributeProperties object enum item to know in which dict to look for name mapping :return: adapted reference + """ # If ref contains Error message, leave as such if self.__ref_not_found_error in resolved_ref: @@ -899,9 +1108,12 @@ def adapt_name(self, resolved_ref, attr): # If not found, look in other map if allowed except KeyError: - if attr.ref_to_dim and attr.ref_to_var: - name_mapping = self.__dim_map if attr.ref_to_dim < attr.ref_to_var else self.__var_map + name_mapping = ( + self.__dim_map + if attr.ref_to_dim < attr.ref_to_var + else self.__var_map + ) try: return name_mapping[resolved_ref] except KeyError: @@ -915,29 +1127,36 @@ def adapt_name(self, resolved_ref, attr): return self.handle_reference_error(resolved_ref) def pathname(self, group, name): - """Compose full path name to an element in a group structure: /path/to/group/elt + """Compose full path name to an element in a group structure: + + /path/to/group/elt. :param group: group containing element :param name: name of the element :return: pathname + """ -# if group.parent is None: + # if group.parent is None: if self.parent(group) is None: return self.__default_separator + name else: -# return self.__pathname_format.format(group.path, name) + # return self.__pathname_format.format(group.path, name) return self.__pathname_format.format(self.path(group), name) def generate_mapping_str(self, input_group, name, new_name): - """Generate a string representing the name mapping of an element before and after flattening. + """Generate a string representing the name mapping of an element + before and after flattening. 
:param input_group: group containing the non-flattened element :param name: name of the non-flattened element :param new_name: name of the flattened element :return: string representing the name mapping for the element + """ original_pathname = self.pathname(input_group, name) - mapping_str = self.__mapping_str_format.format(new_name, original_pathname) + mapping_str = self.__mapping_str_format.format( + new_name, original_pathname + ) return mapping_str def convert_path_to_valid_name(self, pathname): @@ -945,11 +1164,15 @@ def convert_path_to_valid_name(self, pathname): :param pathname: pathname :return: valid NetCDF name + """ - return pathname.replace(self.__default_separator, '', 1).replace(self.__default_separator, self.__new_separator) + return pathname.replace(self.__default_separator, "", 1).replace( + self.__default_separator, self.__new_separator + ) def generate_flattened_name(self, input_group, orig_name): """Convert full path of an element to a valid NetCDF name: + - the name of an element is the concatenation of its containing group and its name, - replaces / from paths (forbidden as NetCDF name), - if name is longer than 255 characters, replace path to group by hash, @@ -958,6 +1181,7 @@ def generate_flattened_name(self, input_group, orig_name): :param input_group: group containing element :param orig_name: original name of the element :return: new valid name of the element + """ # If element is at root: no change # if input_group.parent is None: @@ -966,27 +1190,37 @@ def generate_flattened_name(self, input_group, orig_name): # If element in child group, concatenate group path and element name else: -# full_name = self.convert_path_to_valid_name(input_group.path) + self.__new_separator + orig_name - full_name = self.convert_path_to_valid_name(self.path(input_group)) + self.__new_separator + orig_name + # full_name = self.convert_path_to_valid_name(input_group.path) + self.__new_separator + orig_name + full_name = ( + self.convert_path_to_valid_name(self.path(input_group)) + + self.__new_separator + + orig_name + ) new_name = full_name # If resulting name is too long, hash group path if len(new_name) >= self.__max_name_len: -# group_hash = hashlib.sha1(input_group.path.encode("UTF-8")).hexdigest() - group_hash = hashlib.sha1(self.path(input_group).encode("UTF-8")).hexdigest() + # group_hash = hashlib.sha1(input_group.path.encode("UTF-8")).hexdigest() + group_hash = hashlib.sha1( + self.path(input_group).encode("UTF-8") + ).hexdigest() new_name = group_hash + self.__new_separator + orig_name # If resulting name still too long, hash everything if len(new_name) >= self.__max_name_len: - new_name = hashlib.sha1(full_name.encode("UTF-8")).hexdigest() + new_name = hashlib.sha1( + full_name.encode("UTF-8") + ).hexdigest() return new_name def handle_reference_error(self, ref, context=None): - """Depending on lax/strict mode, either raise exception or log warning. If lax, return reference placeholder. + """Depending on lax/strict mode, either raise exception or log + warning. If lax, return reference placeholder. :param ref: reference :param context: additional context info to add to message :return: if continue with warning, error replacement name for reference + """ message = "Reference '{}' could not be resolved".format(ref) if context is not None: @@ -999,10 +1233,12 @@ def handle_reference_error(self, ref, context=None): class ReferenceException(Exception): - """Exception raised when references in attributes cannot be resolved. 
+ """Exception raised when references in attributes cannot be + resolved. Attributes: message -- explanation of the error + """ def __init__(self, message): diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 6e488b987..9bca65388 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -22,10 +22,9 @@ from s3fs import S3FileSystem from ...decorators import _manage_log_level_via_verbosity +from ...flatten import flatten as flatten2 from ...functions import is_log_level_debug, is_log_level_detail from .. import IORead -from ...flatten import flatten as flatten2 - logger = logging.getLogger(__name__) @@ -504,25 +503,27 @@ def file_open(self, filename, flatten=True, verbose=None): # Create an openable s3 file object endpoint_url = f"https://{u.netloc}" uri = u.path[1:] - s3 = g['s3'] + s3 = g["s3"] if s3 is None: - s3 = {"anon": True, - "client_kwargs": {'endpoint_url': endpoint_url}} - + s3 = { + "anon": True, + "client_kwargs": {"endpoint_url": endpoint_url}, + } + fs = S3FileSystem(**s3) - filename = fs.open(uri, 'rb') - print (filename, type(filename)) + filename = fs.open(uri, "rb") + print(filename, type(filename)) if is_log_level_detail(logger): logger.debug( f" s3: s3fs.S3FileSystem options: {s3}\n" ) # pragma: no cover -# nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) + # nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) try: - #raise OSError() + # raise OSError() nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) HDF = True - except OSError: + except OSError: # File is not HDF. Assume instead that it's netCDF3 and # open it with netCDF4. try: @@ -538,27 +539,27 @@ def file_open(self, filename, flatten=True, verbose=None): # ------------------------------------------------------------ # If the file has a group structure then flatten it (CF>=1.8) # ------------------------------------------------------------ - + if HDF: - print ("Opened with h5netcdf") + print("Opened with h5netcdf") else: - print ("Opened with netCDF4") - + print("Opened with netCDF4") + if flatten and nc.groups: - #if HDF: - # # TODOHDF: Can't yet use HDF access to process groups - # logger.warning( - # "WARNING: Using netCDF4 (rather than h5netcdf) " - # f"to access file {filename} containing groups" - # ) # pragma: no cover - # nc.close() - # HDF = False - # try: - # nc = netCDF4.Dataset(filename, "r") - # netCDF = True - # except RuntimeError as error: - # raise RuntimeError(f"{error}: {filename}") - + # if HDF: + # # TODOHDF: Can't yet use HDF access to process groups + # logger.warning( + # "WARNING: Using netCDF4 (rather than h5netcdf) " + # f"to access file {filename} containing groups" + # ) # pragma: no cover + # nc.close() + # HDF = False + # try: + # nc = netCDF4.Dataset(filename, "r") + # netCDF = True + # except RuntimeError as error: + # raise RuntimeError(f"{error}: {filename}") + # Create a diskless, non-persistent container for the # flattened file flat_file = tempfile.NamedTemporaryFile( @@ -575,10 +576,8 @@ def file_open(self, filename, flatten=True, verbose=None): flat_nc.set_fill_off() # Flatten the file -# netcdf_flattener.flatten( - flatten2( - nc, flat_nc, lax_mode=True, _copy_data=False - ) + # netcdf_flattener.flatten( + flatten2(nc, flat_nc, lax_mode=True, _copy_data=False) # Store the original grouped file. 
This is primarily # because the unlimited dimensions in the flattened @@ -590,10 +589,10 @@ def file_open(self, filename, flatten=True, verbose=None): netCDF = True HDF = False - + g["has_groups"] = True g["flat_files"].append(flat_file) - + g["netCDF"] = netCDF g["HDF"] = HDF g["nc"] = nc @@ -669,7 +668,11 @@ def is_netcdf_file(cls, filename): """ # Assume that URLs are in netCDF format - if filename.startswith("https://") or filename.startswith("http://") or filename.startswith("s3://"): + if ( + filename.startswith("https://") + or filename.startswith("http://") + or filename.startswith("s3://") + ): return True # Read the magic number @@ -848,7 +851,7 @@ def read( warnings=True, warn_valid=False, domain=False, - s3=None + s3=None, ): """Reads a netCDF dataset from file or OPenDAP URL. @@ -1067,7 +1070,7 @@ def read( # 'global_attributes' dictionary # ---------------------------------------------------------------- global_attributes = {} -# for attr in map(str,nc.ncattrs()): + # for attr in map(str,nc.ncattrs()): for attr, value in self._file_global_attributes().items(): attr = str(attr) try: @@ -1227,7 +1230,7 @@ def read( flattener_attributes.setdefault(tuple(groups), {})[ group_attr ] = self._file_global_attribute(flat_attr) -# ] = nc.getncattr(flat_attr) + # ] = nc.getncattr(flat_attr) # Remove flattener attributes from the global attributes for attr in ( @@ -1242,9 +1245,9 @@ def read( groups = () group_attributes = {} -# variable = nc.variables[ncvar] + # variable = nc.variables[ncvar] variable = self._file_variable(ncvar) - + # -------------------------------------------------------- # Specify the group structure for each variable (CF>=1.8) # TODO @@ -1293,18 +1296,21 @@ def read( variable_grouped_dataset[ncvar] = g["nc_grouped"] variable_attributes[ncvar] = {} -# for attr in map(str, variable.ncattrs()): - for attr, value in self._file_variable_attributes(variable).items(): + # for attr in map(str, variable.ncattrs()): + for attr, value in self._file_variable_attributes( + variable + ).items(): attr = str(attr) if isinstance(value, bytes): value = value.decode(errors="ignore") - - variable_attributes[ncvar][attr] = value -# print (attr, value, type(value)) + variable_attributes[ncvar][attr] = value + # print (attr, value, type(value)) -# variable_dimensions[ncvar] = tuple(variable.dimensions) - variable_dimensions[ncvar] = tuple(self._file_variable_dimensions(variable)) + # variable_dimensions[ncvar] = tuple(variable.dimensions) + variable_dimensions[ncvar] = tuple( + self._file_variable_dimensions(variable) + ) variable_dataset[ncvar] = nc variable_filename[ncvar] = g["filename"] variables[ncvar] = variable @@ -1315,7 +1321,7 @@ def read( # Populate dimensions_groups abd dimension_basename # dictionaries -# for ncdim in nc.dimensions: + # for ncdim in nc.dimensions: for ncdim in self._file_dimensions(): ncdim_org = ncdim ncdim_basename = ncdim @@ -1341,10 +1347,12 @@ def read( dimension_groups[ncdim] = groups dimension_basename[ncdim] = ncdim_basename -# dimension_isunlimited[ncdim] = nc.dimensions[ -# ncdim_org -# ].isunlimited() - dimension_isunlimited[ncdim] = self._file_dimension_isunlimited(ncdim_org) + # dimension_isunlimited[ncdim] = nc.dimensions[ + # ncdim_org + # ].isunlimited() + dimension_isunlimited[ncdim] = self._file_dimension_isunlimited( + ncdim_org + ) if has_groups: variable_dimensions = { @@ -1392,7 +1400,7 @@ def read( # The netCDF dimensions of the parent file internal_dimension_sizes = {} -# for name, dimension in nc.dimensions.items(): + # for name, 
dimension in nc.dimensions.items(): for name, dimension in self._file_dimensions().items(): if ( has_groups @@ -1402,10 +1410,12 @@ def read( # size from the original grouped dataset, because # unlimited dimensions have size 0 in the flattened # dataset (because it contains no data) (v1.8.8.1) - group, ncdim = self._netCDF4_group( # h5netcdf + group, ncdim = self._netCDF4_group( # h5netcdf g["nc_grouped"], flattener_dimensions[name] ) - internal_dimension_sizes[name] = group.dimensions[ncdim].size # h5netcdf + internal_dimension_sizes[name] = group.dimensions[ + ncdim + ].size # h5netcdf else: internal_dimension_sizes[name] = dimension.size @@ -2547,7 +2557,7 @@ def _parse_geometry(self, parent_ncvar, attributes): # variable in this case. # -------------------------------------------------------- nodes_per_geometry = self.implementation.initialise_Count() -# size = g["nc"].dimensions[node_dimension].size + # size = g["nc"].dimensions[node_dimension].size size = self._file_dimension_size(node_dimension) ones = self.implementation.initialise_Data( array=np.ones((size,), dtype="int32"), copy=False @@ -6016,7 +6026,7 @@ def _create_netcdfarray( group, name = self._netCDF4_group( g["variable_grouped_dataset"][ncvar], ncvar ) - variable = group.variables.get(name) # h5netcdf + variable = group.variables.get(name) # h5netcdf else: variable = g["variables"].get(ncvar) @@ -6035,7 +6045,7 @@ def _create_netcdfarray( ndim = variable.ndim shape = variable.shape -# size = variable.size + # size = variable.size size = self._file_variable_size(variable) if size < 2: @@ -6100,7 +6110,7 @@ def _create_netcdfarray( if return_kwargs_only: return kwargs - if g['original_netCDF']: + if g["original_netCDF"]: array = self.implementation.initialise_NetCDFArray(**kwargs) else: # h5netcdf @@ -9956,8 +9966,8 @@ def _ugrid_check_connectivity_variable( def _file_global_attributes(self): g = self.read_vars - nc = g['nc'] - if g['netCDF']: + nc = g["nc"] + if g["netCDF"]: # netCDF4 return {attr: nc.getncattr(attr) for attr in nc.ncattrs()} @@ -9966,7 +9976,7 @@ def _file_global_attributes(self): def _file_dimensions(self): g = self.read_vars - return g['nc'].dimensions + return g["nc"].dimensions def _file_dimension(self, dim_name): return self._file_dimensions()[dim_name] @@ -9978,9 +9988,9 @@ def _file_dimension_size(self, dim_name): return self._file_dimensions(dim_name).size def _file_variables(self): - """ """ + """TOODHDF.""" g = self.read_vars - return g['nc'].variables + return g["nc"].variables def _file_variable(self, var_name): return self._file_variables()[var_name] @@ -9988,17 +9998,17 @@ def _file_variable(self, var_name): def _file_variable_attributes(self, var, names_only=False): g = self.read_vars if not names_only: - if g['netCDF']: + if g["netCDF"]: # netCDF4 return {attr: var.getncattr(attr) for attr in var.ncattrs()} - + # h5netcdf return var.attrs - - if g['netCDF']: + + if g["netCDF"]: # netCDF4 return var.ncattrs() - + # h5netcdf return list(var.attrs) @@ -10006,12 +10016,9 @@ def _file_variable_dimensions(self, var): return var.dimensions def _file_variable_size(self, var): - g = self.read_vars try: # netCDF4 return var.size - except AttributeError: + except AttributeError: # h5netcdf return prod(var.shape) - - diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index f6eb7024b..5669ccced 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -4459,7 +4459,7 @@ def file_open(self, filename, mode, fmt, fields): 
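# The _file_* helpers above normalise access across the two backends. A
# minimal standalone sketch (not cfdm's actual API) of the same
# duck-typing, using only calls that both libraries are known to provide:
from math import prod

def variable_size(var):
    """Number of elements in a netCDF4 or h5netcdf variable."""
    try:
        # netCDF4 variables carry a precomputed size
        return var.size
    except AttributeError:
        # h5netcdf variables expose only a shape
        return prod(var.shape)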
os.remove(filename) try: -# nc.set_chunk_cache(16*1024*1024) # 16MiB chunkcache + # nc.set_chunk_cache(16*1024*1024) # 16MiB chunkcache nc = netCDF4.Dataset(filename, mode, format=fmt) except RuntimeError as error: raise RuntimeError(f"{error}: {filename}") diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py index a011f3108..3610a5439 100644 --- a/cfdm/test/test_groups.py +++ b/cfdm/test/test_groups.py @@ -65,8 +65,8 @@ def test_groups(self): ungrouped_file = ungrouped_file1 grouped_file = grouped_file1 - grouped_file = 'delme_grouped.nc' - + grouped_file = "delme_grouped.nc" + # Add a second grid mapping datum = cfdm.Datum(parameters={"earth_radius": 7000000}) conversion = cfdm.CoordinateConversion( @@ -105,7 +105,7 @@ def test_groups(self): nc.close() grouped_file = grouped_file1 - + h = cfdm.read(grouped_file, verbose=1) self.assertEqual(len(h), 1, repr(h)) self.assertTrue(f.equals(h[0], verbose=2)) From 5b2287a6fb6848950a68087f97037223aa76e1a6 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 25 Jan 2024 09:56:28 +0000 Subject: [PATCH 13/88] dev --- cfdm/flatten.py | 342 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 238 insertions(+), 104 deletions(-) diff --git a/cfdm/flatten.py b/cfdm/flatten.py index c806a7a20..4fc4245b7 100644 --- a/cfdm/flatten.py +++ b/cfdm/flatten.py @@ -1,4 +1,5 @@ """Project: NetCDF Flattener + Copyright (c) 2020 EUMETSAT License: Apache License 2.0 @@ -18,29 +19,27 @@ KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +This code has been modified from the original found in the +netcdf_flattener package. + """ -import collections import hashlib import logging - -# import os import re import warnings from enum import Enum -# from netCDF4 import Dataset -# from h5netcdf import File as Dataset - def flatten( input_ds, output_ds, lax_mode=False, _copy_data=True, copy_slices=None ): - """Flatten an input NetCDF dataset and write the result in an output - NetCDF dataset. + """Create a flattened version of a netCDF dataset. - For variable that are too big to fit in memory, the optional "copy_slices" input allows to copy some or all of the - variables in slices. + For variable that are too big to fit in memory, the optional + "copy_slices" input allows to copy some or all of the variables in + slices. :param input_ds: input netcdf4 dataset :param output_ds: output netcdf4 dataset @@ -74,8 +73,7 @@ def parse_var_attr(input_str): """ def subst(s): - """substitute tokens for WORD and SEP (space or end of - string)""" + """Substitute tokens for WORD and SEP.""" return s.replace("WORD", r"[A-Za-z0-9_#/.\(\)]+").replace( "SEP", r"(\s+|$)" ) @@ -102,7 +100,7 @@ def subst(s): m = re.match(pat_all, input_str) # Output is always a dict. If input form is a list, dict values are set as empty lists - out = collections.OrderedDict() + out = {} # collections.OrderedDict() if m is not None: list_match = m.group("list") @@ -150,9 +148,12 @@ def generate_var_attr_str(d): class _AttributeProperties(Enum): - """"Utility class containing the properties for each type of - variable attribute, defining how contained references to dimensions - and variables should be parsed and processed.""" + """Utility class containing the properties for each attribute. + + For each variable attribute, defines how contained references to + dimensions and variables should be parsed and processed. 
+
+    """

     ancillary_variables = (0, (False, True, True, False, False, False, False))
     bounds = (1, (False, True, True, False, False, False, False))
@@ -162,6 +163,7 @@ class _AttributeProperties(Enum):
     formula_terms = (5, (False, True, False, True, False, False, False))
     geometry = (6, (False, True, True, False, False, False, False))
     grid_mapping = (7, (False, True, True, True, False, False, False))
+    # Geometry variables
     interior_ring = (8, (False, True, True, False, False, False, False))
     node_coordinates = (9, (False, True, True, False, False, False, False))
     node_count = (10, (False, True, True, False, False, False, False))
@@ -171,22 +173,96 @@ class _AttributeProperties(Enum):
     instance_dimension = (14, (True, False, True, False, False, False, False))
     sample_dimension = (15, (True, False, True, False, False, False, False))
     cell_methods = (16, (2, 1, True, False, False, True, True))
+    # Domain variable dimensions
+    dimensions = (17, (True, False, True, False, False, False, False))
+    # CFA instructions
+    aggregated_dimensions = (
+        18,
+        (True, False, True, False, False, False, False),
+    )
+    aggregated_data = (19, (False, True, False, True, False, False, False))
+    # UGRID variables
+    #
+    # * node_coordinates has already been assigned under Geometry
+    #   variables
+    # * IDs 20, 23, 29, 30, 31, 32, 35, 36, 37 are reserved for potential
+    #   further UGRID usage
+    edge_coordinates = (21, (False, True, True, False, False, False, False))
+    face_coordinates = (22, (False, True, True, False, False, False, False))
+    edge_node_connectivity = (
+        24,
+        (False, True, True, False, False, False, False),
+    )
+    face_node_connectivity = (
+        25,
+        (False, True, True, False, False, False, False),
+    )
+    face_face_connectivity = (
+        26,
+        (False, True, True, False, False, False, False),
+    )
+    edge_face_connectivity = (
+        27,
+        (False, True, True, False, False, False, False),
+    )
+    face_edge_connectivity = (
+        28,
+        (False, True, True, False, False, False, False),
+    )
+    edge_dimension = (33, (True, False, True, False, False, False, False))
+    face_dimension = (34, (True, False, True, False, False, False, False))
+    mesh = (38, (False, True, True, False, False, False, False))

     def __init__(self, n, props):
         """_AttributeProperties enum constructor.

-        :param n: enum id
-        :param props: a tuple containing the attribute's properties (ref_to_dim, ref_to_var, resolve_key, resolve_value,
-        stop_at_local_apex, accept_standard_names, limit_to_scalar_coordinates):
-        * ref_to_dim: True or integer if contains references to dimensions (highest int have priority)
-        * ref_to_var: True or integer if contains references to variables (highest int have priority)
-        * resolve_key: True if 'keys' have to be resolved in 'key1: value1 key2: value2 value3' or 'key1 key2'
-        * resolve_value: True if 'values' have to be resolved in 'key1: value1 key2: value2 value3'
-        * stop_at_local_apex: True if upward research in the hierarchy has to stop at local apex
-        * accept_standard_names: True if any standard name is valid in place of references (in which case no
-        exception is raised if a reference cannot be resolved, and the standard name is used in place)
-        * limit_to_scalar_coordinates: True if references to variables are only resolved if present as well in
-        the 'coordinates' attributes of the variable, and they are scalar.
+        :Parameters:
+
+            n: `int`
+                Enum id.
+ + props: `tuple` + A sequence containing the attribute's properties + (ref_to_dim, ref_to_var, resolve_key, resolve_value, + stop_at_local_apex, accept_standard_names, + limit_to_scalar_coordinates): + + 1. ref_to_dim: True or integer if contains references + to dimensions (highest int have + priority) + + 2. ref_to_var: True or integer if contains references + to variables (highest int have + priority) + + 3. resolve_key: True if 'keys' have to be resolved in + 'key1: value1 key2: value2 value3' or + 'key1 key2' + + 4. resolve_value: True if 'values' have to be resolved + in 'key1: value1 key2: value2 + value3' + + 5. stop_at_local_apex: True if upward research in the + hierarchy has to stop at local + apex + + 6. accept_standard_names: True if any standard name is + valid in place of references + (in which case no exception + is raised if a reference + cannot be resolved, and the + standard name is used in + place) + + 7. limit_to_scalar_coordinates: True if references to + variables are only + resolved if present as + well in the + 'coordinates' + attributes of the + variable, and they are + scalar. """ self.id = n @@ -200,8 +276,12 @@ def __init__(self, n, props): class _Flattener: - """Utility class contained the input file, the output file being - flattened, and all the logic of the flattening process.""" + """Information and methods needed to flatten a netCDF dataset. + + Contains the input file, the output file being flattened, and all + the logic of the flattening process. + + """ __max_name_len = 256 __default_separator = "/" @@ -217,8 +297,7 @@ class _Flattener: __var_map_name = "__flattener_name_mapping_variables" def __init__(self, input_ds, lax_mode, _copy_data=True, copy_slices=None): - """Constructor. Initializes the Flattener class given the input - file. + """**Initialisation** :param input_ds: input netcdf dataset :param lax_mode: if false (default), not resolving a reference halts the execution. If true, continue with warning. @@ -232,13 +311,12 @@ def __init__(self, input_ds, lax_mode, _copy_data=True, copy_slices=None): not contained in the dict, it will not be sliced and copied normally. """ - self.__attr_map_value = [] self.__dim_map_value = [] self.__var_map_value = [] - self.__dim_map = dict() - self.__var_map = dict() + self.__dim_map = {} # dict() + self.__var_map = {} # dict() self.__lax_mode = lax_mode @@ -307,12 +385,13 @@ def endian(self, variable): return "native" def filepath(self, dataset): - """Return the file system path (or the opendap URL) for the - Dataset. + """Return the file path for the Dataset. :Returns: `str` + The file system path, or the opendap URL, for the + dataset. """ try: @@ -664,10 +743,12 @@ def flatten_variable(self, var): self.resolve_references(new_var, var) def increment_pos(self, pos, dim, copy_slice_shape, var_shape): - """Increment position vector in a variable along a dimension by + """TODOHDF. + + Increment position vector in a variable along a dimension by the matching slice length along than dimension. If end of the - dimension is reached, recursively increment the next dimensions - until a valid position is found. + dimension is reached, recursively increment the next + dimensions until a valid position is found. :param pos: current position :param dim: dimension to be incremented @@ -734,7 +815,9 @@ def copy_var_by_slices(self, new_var, old_var, copy_slice_shape): ) def resolve_reference(self, orig_ref, orig_var, attr): - """Resolve the absolute path to a coordinate variable within the + """Resolve a refrence. 
+ + Resolves the absolute path to a coordinate variable within the group structure. :param orig_ref: reference to resolve @@ -763,7 +846,11 @@ def resolve_reference(self, orig_ref, orig_var, attr): method = " relative" # First tentative as dim OR var - ref_type = "dimension" if resolve_dim_or_var else "variable" + # ref_type = "dimension" if resolve_dim_or_var else "variable" + if resolve_dim_or_var: + ref_type = "dimension" + else: + ref_type = "variable" # absolute_ref = self.search_by_relative_path(orig_ref, orig_var.group(), resolve_dim_or_var) absolute_ref = self.search_by_relative_path( orig_ref, self.group(orig_var), resolve_dim_or_var @@ -771,9 +858,13 @@ def resolve_reference(self, orig_ref, orig_var, attr): # If failed and alternative possible, second tentative if absolute_ref is None and resolve_alt: - ref_type = ( - "dimension" if not resolve_dim_or_var else "variable" - ) + # ref_type = ( + # "dimension" if not resolve_dim_or_var else "variable" + # ) + if resolve_dim_or_var: + ref_type = "variable" + else: + ref_type = "dimension" # absolute_ref = self.search_by_relative_path(orig_ref, orig_var.group(), not resolve_dim_or_var) absolute_ref = self.search_by_relative_path( orig_ref, self.groupp(orig_var), not resolve_dim_or_var @@ -796,7 +887,11 @@ def resolve_reference_proximity( ): """Resolve reference: search by proximity.""" # First tentative as dim OR var - ref_type = "dimension" if resolve_dim_or_var else "variable" + # ref_type = "dimension" if resolve_dim_or_var else "variable" + if resolve_dim_or_var: + ref_type = "dimension" + else: + ref_type = "variable" # resolved_var = self.search_by_proximity(ref, orig_var.group(), resolve_dim_or_var, False, # attr.stop_at_local_apex) resolved_var = self.search_by_proximity( @@ -809,7 +904,11 @@ def resolve_reference_proximity( # If failed and alternative possible, second tentative if resolved_var is None and resolve_alt: - ref_type = "dimension" if not resolve_dim_or_var else "variable" + # ref_type = "dimension" if not resolve_dim_or_var else "variable" + if resolve_dim_or_var: + ref_type = "variable" + else: + ref_type = "dimension" # resolved_var = self.search_by_proximity(ref, orig_var.group(), not resolve_dim_or_var, False, # attr.stop_at_local_apex) resolved_var = self.search_by_proximity( @@ -882,7 +981,9 @@ def resolve_reference_post_processing( return absolute_ref def search_by_relative_path(self, ref, current_group, search_dim): - """Resolve the absolute path to a reference within the group + """Search by relative path. + + Resolves the absolute path to a reference within the group structure, using search by relative path. 
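# Worked example (function and paths invented for illustration) of the
# relative-path search introduced above: each leading '../' moves one
# group up before the remainder of the path is followed down.
def resolve_relative(current_group_path, ref):
    parts = current_group_path.rstrip("/").split("/")
    while ref.startswith("../"):
        if len(parts) <= 1:
            # Already at the root group: the reference cannot resolve
            return None
        ref = ref[3:]
        parts = parts[:-1]
    return "/".join(parts) + "/" + ref

print(resolve_relative("/forecasts/model1", "../model2/lat"))
# -> /forecasts/model2/lat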
:param ref: reference to resolve @@ -895,6 +996,7 @@ def search_by_relative_path(self, ref, current_group, search_dim): while ref.startswith("../"): if current_group.parent is None: return None + ref = ref[3:] current_group = current_group.parent @@ -907,11 +1009,15 @@ def search_by_relative_path(self, ref, current_group, search_dim): return None # Get variable or dimension - elt = ( - current_group.dimensions[ref_split[-1]] - if search_dim - else current_group.variables[ref_split[-1]] - ) + # elt = ( + # current_group.dimensions[ref_split[-1]] + # if search_dim + # else current_group.variables[ref_split[-1]] + # ) + if search_dim: + elt = current_group.dimensions[ref_split[-1]] + else: + elt = current_group.variables[ref_split[-1]] # Get absolute reference # return self.pathname(elt.group(), elt.name) @@ -925,11 +1031,14 @@ def search_by_proximity( local_apex_reached, is_coordinate_variable, ): - """Resolve the absolute path to a reference within the group + """Search by proximity. + + Resolves the absolute path to a reference within the group structure, using search by proximity. - First search up in the hierarchy for the reference, until root group is reached. If coordinate variable, search - until local apex is reached, Then search down in siblings. + First search up in the hierarchy for the reference, until root + group is reached. If coordinate variable, search until local + apex is reached, Then search down in siblings. :param ref: reference to resolve :param current_group: current group where searching @@ -939,9 +1048,13 @@ def search_by_proximity( :return: absolute path to the coordinate """ - dims_or_vars = ( - current_group.dimensions if search_dim else current_group.variables - ) + # dims_or_vars = ( + # current_group.dimensions if search_dim else current_group.variables + # ) + if search_dim: + dims_or_vars = current_group.dimensions + else: + dims_or_vars = current_group.variables # DCH # Found in current group if ref in dims_or_vars.keys(): @@ -969,8 +1082,9 @@ def search_by_proximity( is_coordinate_variable, ) - # If coordinate variable and local apex reached, search down in siblings elif is_coordinate_variable and local_apex_reached: + # Coordinate variable and local apex reached, so search + # down in siblings found_elt = None for child_group in current_group.groups.values(): found_elt = self.search_by_proximity( @@ -982,6 +1096,7 @@ def search_by_proximity( ) if found_elt is not None: break + return found_elt # If here, did not find @@ -989,12 +1104,16 @@ def search_by_proximity( return None def __escape_index_error(self, match, group_name): - """Return the group in a match if it exists, an empty string - otherwise. + """TODOHDF. :param match: regex match :param group_name: group name - :return: match group + + :Returns: + + `str` + The group in a match if it exists, an empty string + otherwise. """ try: @@ -1003,40 +1122,49 @@ def __escape_index_error(self, match, group_name): return "" def resolve_references(self, var, old_var): - """In a given variable, replace all references to other - variables in its attributes by absolute references. + """Resolve references. + + In a given variable, replace all references to other variables + in its attributes by absolute references. 
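# The references resolved here come from CF-style 'key1: value1 key2:
# value2' attribute strings. An assumed-importable usage of the
# module-level parser defined earlier in this file (module path
# inferred from the diff header):
from cfdm.flatten import parse_var_attr

print(parse_var_attr("area: areacello"))    # cell_measures style mapping
print(parse_var_attr("q_status q_detail"))  # plain-list style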
:param var: flattened variable in which references should be renamed with absolute references :param old_var: original variable (in group structure) """ for attr in _AttributeProperties: - if attr.name in var.__dict__: + # if attr.name in var.__dict__: + if attr.name in self.ncattrs(var): # attr_value = var.getncattr(attr.name) attr_value = self.getncattr(var, attr.name) # Parse attribute value parsed_attr = parse_var_attr(attr_value) # Resolved references in parsed as required by attribute properties - resolved_parsed_attr = collections.OrderedDict() + resolved_parsed_attr = {} # collections.OrderedDict() for k, v in parsed_attr.items(): - new_k = ( - self.resolve_reference(k, old_var, attr) - if attr.resolve_key - else k - ) - - new_v = ( - [ - self.resolve_reference(x, old_var, attr) - for x in parsed_attr[k] + # new_k = ( + # self.resolve_reference(k, old_var, attr) + # if attr.resolve_key + # else k + # ) + if attr.resolve_key: + k = self.resolve_reference(k, old_var, attr) + + # new_v = ( + # [ + # self.resolve_reference(x, old_var, attr) + # for x in parsed_attr[k] + # ] + # if attr.resolve_value and parsed_attr[k] is not None + # else parsed_attr[k] + # ) + if attr.resolve_value and v is not None: + v = [ + self.resolve_reference(x, old_var, attr) for x in v ] - if attr.resolve_value and parsed_attr[k] is not None - else parsed_attr[k] - ) - resolved_parsed_attr[new_k] = new_v + resolved_parsed_attr[k] = v # Re-generate attribute value string with resolved references var.setncattr( @@ -1044,33 +1172,40 @@ def resolve_references(self, var, old_var): ) def adapt_references(self, var): - """In a given variable, replace all references to variables in + """Adapt references. + + In a given variable, replace all references to variables in attributes by references to the new names in the flattened - NetCDF. All references have to be already resolved as absolute + netCDF. All references have to be already resolved as absolute references. :param var: flattened variable in which references should be renamed with new names """ for attr in _AttributeProperties: - if attr.name in var.__dict__: + # if attr.name in var.__dict__: + if attr.name in self.ncattrs(var): # attr_value = var.getncattr(attr.name) attr_value = self.getncattr(var, attr.name) # Parse attribute value parsed_attr = parse_var_attr(attr_value) - adapted_parsed_attr = collections.OrderedDict() + adapted_parsed_attr = {} # collections.OrderedDict() for k, v in parsed_attr.items(): - new_k = self.adapt_name(k, attr) if attr.resolve_key else k + # new_k = self.adapt_name(k, attr) if attr.resolve_key else k + if attr.resolve_key: + k = self.adapt_name(k, attr) - new_v = ( - [self.adapt_name(x, attr) for x in parsed_attr[k]] - if attr.resolve_value and parsed_attr[k] is not None - else parsed_attr[k] - ) + # new_v = ( + # [self.adapt_name(x, attr) for x in parsed_attr[k]] + # if attr.resolve_value and parsed_attr[k] is not None + # else parsed_attr[k] + # ) + if attr.resolve_value and v is not None: + v = [self.adapt_name(x, attr) for x in v] - adapted_parsed_attr[new_k] = new_v + adapted_parsed_attr[k] = v new_attr_value = generate_var_attr_str(adapted_parsed_attr) var.setncattr(attr.name, new_attr_value) @@ -1084,7 +1219,9 @@ def adapt_references(self, var): # .format(attr.name, var.name, attr_value, new_attr_value)) def adapt_name(self, resolved_ref, attr): - """Return name of flattened reference. If not found, raise + """Apapt the name. + + Return name of flattened reference. If not found, raise exception or continue warning. 
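# A sketch (variable names invented) of reading the flattener's
# 'flat_name: /original/path' mapping strings back into the kind of
# reverse lookup that adapt_name performs:
mapping = ["lat: /forecast/model/lat", "t: /forecast/t"]
orig_to_flat = {
    orig.strip(): flat
    for flat, _, orig in (m.partition(":") for m in mapping)
}
print(orig_to_flat["/forecast/model/lat"])  # -> lat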
:param resolved_ref: resolved reference to adapt @@ -1144,7 +1281,9 @@ def pathname(self, group, name): return self.__pathname_format.format(self.path(group), name) def generate_mapping_str(self, input_group, name, new_name): - """Generate a string representing the name mapping of an element + """Generate string mapping. + + Generates a string representing the name mapping of an element before and after flattening. :param input_group: group containing the non-flattened element @@ -1214,7 +1353,9 @@ def generate_flattened_name(self, input_group, orig_name): return new_name def handle_reference_error(self, ref, context=None): - """Depending on lax/strict mode, either raise exception or log + """Handle reference error. + + Depending on lax/strict mode, either raise exception or log warning. If lax, return reference placeholder. :param ref: reference @@ -1233,13 +1374,6 @@ def handle_reference_error(self, ref, context=None): class ReferenceException(Exception): - """Exception raised when references in attributes cannot be - resolved. - - Attributes: - message -- explanation of the error - - """ + """Exception for unresolvable references in attributes.""" - def __init__(self, message): - super().__init__(message) + pass From cb4e5c219bba797090b2e98cf1abd48ebec6d6ec Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 25 Jan 2024 11:58:31 +0000 Subject: [PATCH 14/88] h5py, h5netcdf, s3fs requirements --- requirements.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 90e0ee8ae..c50f5f8bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,10 @@ netCDF4>=1.5.4 -h5py>=3.0.0 -h5netcdf>=1.3.0 cftime>=1.6.0 numpy>=1.15 netcdf-flattener>=1.2.0 packaging>=20.0 scipy>=1.10.0 +h5py>=3.0.0 +h5netcdf>=1.3.0 +s3fs>=2023.12.2 + From 6045ab5ced41f4bca0b0e38188ad8ddca4c864ce Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 25 Jan 2024 17:38:51 +0000 Subject: [PATCH 15/88] dev --- cfdm/cfdmimplementation.py | 31 +-- cfdm/data/hdfarray.py | 26 +- cfdm/data/mixin/filearraymixin.py | 42 ++- cfdm/flatten.py | 386 +++++++++++++++++++++++++-- cfdm/read_write/netcdf/netcdfread.py | 110 ++++++-- cfdm/read_write/read.py | 22 ++ 6 files changed, 535 insertions(+), 82 deletions(-) diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index 041ce7433..4a9437cbb 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -2356,14 +2356,16 @@ def initialise_NetCDFArray( def initialise_HDFArray( self, - filename=None, - address=None, - dtype=None, - shape=None, - mask=True, - units=False, - calendar=None, - missing_values=None, + **kwargs + # filename=None, + # address=None, + # dtype=None, + # shape=None, + # mask=True, + # units=False, + # calendar=None, + # missing_values=None, + # s3=None, ): """Return a HDF array instance. @@ -2393,22 +2395,15 @@ def initialise_HDFArray( The missing value indicators defined by the variable attributes. + s3 + :Returns: `HDFArray` """ cls = self.get_class("HDFArray") - return cls( - filename=filename, - address=address, - dtype=dtype, - shape=shape, - mask=mask, - units=units, - calendar=calendar, - missing_values=missing_values, - ) + return cls(**kwargs) def initialise_BoundsFromNodesArray(self, **kwargs): """Return a node bounds array. 
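# A standalone approximation (not the method itself) of the flattened
# naming scheme used by generate_flattened_name above: the group path
# and element name are joined with '__', with sha1 hashes standing in
# whenever the result would reach the 256 character limit.
import hashlib

def flattened_name(group_path, name, max_len=256, sep="__"):
    full_name = group_path.lstrip("/").replace("/", sep) + sep + name
    new_name = full_name
    if len(new_name) >= max_len:
        # Replace the group path by its hash
        h = hashlib.sha1(group_path.encode("UTF-8")).hexdigest()
        new_name = h + sep + name
    if len(new_name) >= max_len:
        # Still too long: hash everything
        new_name = hashlib.sha1(full_name.encode("UTF-8")).hexdigest()
    return new_name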
diff --git a/cfdm/data/hdfarray.py b/cfdm/data/hdfarray.py index d150e5e43..08baf08bc 100644 --- a/cfdm/data/hdfarray.py +++ b/cfdm/data/hdfarray.py @@ -30,6 +30,7 @@ def __init__( units=False, calendar=False, missing_values=None, + s3=None, source=None, copy=True, ): @@ -82,6 +83,13 @@ def __init__( The missing value indicators defined by the variable attributes. See `get_missing_values` for details. + s3: `dict` or `None`, optional + `s3fs.S3FileSystem` options for accessing S3 files. + If there is no ``'endpoint_url'`` key then `open` will + automatically derive one from the filename. + + .. versionadded:: (cfdm) HDFVER + {{init source: optional}} {{init copy: `bool`, optional}} @@ -130,6 +138,11 @@ def __init__( except AttributeError: missing_values = None + try: + s3 = source._get_component("s3", None) + except AttributeError: + s3 = None + if shape is not None: self._set_component("shape", shape, copy=False) @@ -158,6 +171,7 @@ def __init__( self._set_component("mask", mask, copy=False) self._set_component("units", units, copy=False) self._set_component("calendar", calendar, copy=False) + self._set_component("s3", s3, copy=False) # By default, close the file after data array access self._set_component("close", True, copy=False) @@ -228,7 +242,7 @@ def __getitem__(self, indices): return array def _check_safecast(self, attname): - """ToDOHDF. + """TODOHDF. Check to see that variable attribute exists can can be safely cast to variable data type. @@ -493,16 +507,6 @@ def _scale(self, data): return data - # def _get_attr(self, var, attr): - # """TODOHDF. - # - # .. versionadded:: (cfdm) HDFVER - # - # :Parameters: - # - # """ - # return var.attrs[attr] - def close(self, dataset): """Close the dataset containing the data. diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index 8339179d2..be217ef34 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -178,6 +178,26 @@ def get_formats(self): """ return (self.get_format(),) * len(self.get_filenames()) + def get_s3(self): + """Return `s3fs.S3FileSystem` options for accessing S3 files. + + .. versionadded:: (cfdm) HDFVER + + :Returns: + + `dict` + Keyword parameters to be passed to + `s3fs.S3FileSystem`. If there is no ``'endpoint_url'`` + key then `open` will automatically derive one from the + filename. + + """ + out = self._get_component("s3", None) + if not out: + return {} + + return out.copy() + def open(self, func, *args, **kwargs): """Return a dataset file object and address. @@ -204,22 +224,26 @@ def open(self, func, *args, **kwargs): """ # Loop round the files, returning as soon as we find one that # works. 
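# A hedged, standalone illustration of the S3 branch implemented below:
# unless the caller supplied one, the endpoint URL is derived from the
# netloc of the s3:// name (bucket and file names invented).
from urllib.parse import urlparse
from s3fs import S3FileSystem

def open_s3(filename, s3=None):
    u = urlparse(filename)  # e.g. s3://my-store/data/file.nc
    s3 = dict(s3 or {"anon": True})
    if "endpoint_url" not in s3:
        s3["endpoint_url"] = f"https://{u.netloc}"
    fs = S3FileSystem(**s3)
    return fs.open(u.path[1:], "rb")  # drop the leading '/'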
+ s3 = None filenames = self.get_filenames() - for filename, address in zip(filenames, self.get_addresses()): + for i, (filename, address) in enumerate( + zip(filenames, self.get_addresses()) + ): url = urlparse(filename) if url.scheme == "file": # Convert a file URI into an absolute path filename = url.path elif url.scheme == "s3": - # Create an openable s3 file object - endpoint_url = f"https://{url.netloc}" - uri = url.path[1:] - s3 = { - "anon": True, - "client_kwargs": {"endpoint_url": endpoint_url}, - } + # Create an openable S3 file object + if s3 is None: + s3 = self.get_s3() + + if "endpoint_url" not in s3: + # Derive endpoint_url from filename + s3["endpoint_url"] = f"https://{url.netloc}" + fs = S3FileSystem(**s3) - filename = fs.open(uri, "rb") + filename = fs.open(url.path[1:], "rb") try: nc = func(filename, *args, **kwargs) diff --git a/cfdm/flatten.py b/cfdm/flatten.py index 4fc4245b7..8ddaa2c3f 100644 --- a/cfdm/flatten.py +++ b/cfdm/flatten.py @@ -289,13 +289,22 @@ class _Flattener: __pathname_format = "{}/{}" __mapping_str_format = "{}: {}" __ref_not_found_error = "REF_NOT_FOUND" - __default_copy_slice_size = 200000000 + __default_copy_slice_size = 134217728 # 128 MiB # name of the attributes used to store the mapping between original and flattened names __attr_map_name = "__flattener_name_mapping_attributes" __dim_map_name = "__flattener_name_mapping_dimensions" __var_map_name = "__flattener_name_mapping_variables" + # Mapping from numpy dtype endian format to what we expect + _dtype_endian_lookup = { + "=": "native", + ">": "big", + "<": "little", + "|": "native", + None: "native", + } + def __init__(self, input_ds, lax_mode, _copy_data=True, copy_slices=None): """**Initialisation** @@ -326,6 +335,24 @@ def __init__(self, input_ds, lax_mode, _copy_data=True, copy_slices=None): self.__input_file = input_ds self.__output_file = None + # if hasattr(input_ds, "_h5file"): + # dataset_type = 'h5netcdf' + # else: + # dataset_type = 'netCDF4' + # + # for method in ('attrs', 'chunksizes', 'contiguous', 'endian', + # 'filepath', 'get_dims', 'getncattr', 'group', 'name', + # 'ncattrs', 'path'): + # setattr(self, method, getattr(self, f"_{method}_{dataset_type}")) + + # def _attrs_netCDF4(self, variable): + # return { + # attr: variable.getncattr(attr) for attr in variable.ncattrs() + # } + # + # def _attrs_h5netcdf(self, variable): + # return variable.attrs + def attrs(self, variable): try: # h5netcdf @@ -336,6 +363,14 @@ def attrs(self, variable): attr: variable.getncattr(attr) for attr in variable.ncattrs() } + # def _chunksizes_h5netcdf(self, variable): + # return variable.chunks + # + # def _chunksizes_netCDF4(self, variable): + # chunking = variable.chunking() + # if chunking == "contiguous": + # return None + def chunksizes(self, variable): try: # netCDF4 @@ -346,46 +381,208 @@ def chunksizes(self, variable): # h5netcdf return variable.chunks + # def _contiguous_h5netcdf(self, variable): + # """Whether or not the variable data is contiguous on disk. + # + # See `_contiguous_netCDF4` for details. + # """ + # return variable.chunks is None + # + # def _contiguous_netCDF4(self, variable): + # """Whether or not the variable data is contiguous on disk. + # + # :Parameters: + # + # variable: + # The variable. + # + # :Returns: + # + # `bool` + # `True` if the variable data is contiguous on disk, + # otherwise `False`. 
+ # + # **Examples** + # + # >>> f.contiguous(variable) + # False + # + # """ + # return variable.chunking() == "contiguous" + def contiguous(self, variable): - try: - # netCDF4 - return (variable.chunking() == "contiguous",) - except AttributeError: - # h5netcdf - return variable.chunks is None + """Whether or not the variable data is contiguous on disk. + + :Parameters: - def data_model(self, ds): - """Return the netCDF data model version. + variable: `netCDF4.Variable` or `h5netcdf.Variable` + The variable. :Returns: - `str` + `bool` + `True` if the variable data is contiguous on disk, + otherwise `False`. + + **Examples** + + >>> f.contiguous(variable) + False """ try: # netCDF4 - return ds.data_model + return (variable.chunking() == "contiguous",) except AttributeError: # h5netcdf - return "NETCDF4" + return variable.chunks is None + + # def data_model(self, dataset): + # """Return the netCDF data model version of the dataset. + # + # :Parameters: + # + # dataset: `netCDF4.Dataset` or `h5netcdf.File` + # The dataset. + # + # :Returns: + # + # `str` + # The data model version, one of ``'NETCDF4'``, + # ``'NETCDF4_CLASSIC'``, ``'NETCDF3_CLASSIC'``, + # ``'NETCDF3_64BIT_OFFSET'``, or + # ``'NETCDF3_64BIT_DATA'``. + # + # **Examples** + # + # >>> f.data_model(dataset) + # 'NETCDF4' + # + # """ + # try: + # # netCDF4 + # return dataset.data_model + # except AttributeError: + # # h5netcdf + # return "NETCDF4" def dtype(self, variable): + """Return the data type of a variable. + + :Parameters: + + variable: + The dataset variable. + + :Returns: + + `numpy.dtype` + The data type. + + **Examples** + + >>> f.dtype(variable) + dtype('>> f.endian(variable) + # 'native' + # + # """ + # return variable.endian() + # + # def _endian_h5netcdf(self, variable): + # """Return the endian-ness of a variable. + # + # """ + # dtype = variable.dtype + # return self._dtype_endian_lookup[getattr(dtype, "byteorder", None)] + def endian(self, variable): + """Return the endian-ness of a variable. + + :Parameters: + + variable: `netCDF4.Variable` or `h5netcdf.Variable` + The variable. + + :Returns: + + `str` + The endian-ness (``'little'``, ``'big'``, or + ``'native'``) of the variable. + + **Examples** + + >>> f.endian(variable) + 'native' + + """ try: # netCDF4 return variable.endian() except AttributeError: # h5netcdf - return "native" + dtype = variable.dtype + return self._dtype_endian_lookup[getattr(dtype, "byteorder", None)] + + # def _filepath_netCDF4(self, dataset): + # """Return the file path for the dataset. + # + # :Parameters: + # + # dataset: + # The dataset. + # + # :Returns: + # + # `str` + # The file system path, or the opendap URL, for the + # dataset. + # + # **Examples** + # + # >>> f.filepath(dataset) + # '/home/data/file.nc' + # + # """ + # return dataset.filepath() + # + # def _filepath_h5netcdf(self, dataset): + # """Return the file path for the dataset. + # + # """ + # return dataset.filename def filepath(self, dataset): - """Return the file path for the Dataset. + """Return the file path for the dataset. + + :Parameters: + + dataset: `netCDF4.Dataset` or `h5netcdf.File` + The dataset. :Returns: @@ -393,6 +590,11 @@ def filepath(self, dataset): The file system path, or the opendap URL, for the dataset. + **Examples** + + >>> f.filepath(dataset) + '/home/data/file.nc' + """ try: # netCDF4 @@ -401,6 +603,39 @@ def filepath(self, dataset): # h5netcdf return dataset.filename + # def _get_dims_netCDF4(self, variable): + # """Return. 
+ # + # :Returns: + # + # `str` + # + # """ + # return variable.get_dims() + # + # def _get_dims_h5netcdf(self, variable): + # """Return. + # + # :Returns: + # + # `str` + # + # """ + # out = [] + # dimension_names = list(variable.dimensions) + # group = variable._parent + # while dimension_names: + # for name in dimension_names[:]: + # if name in group.dims: + # out.append(group.dims[name]) + # dimension_names.remove(name) + # + # group = group.parent + # if group is None: + # break + # + # return out + def get_dims(self, variable): """Return. @@ -427,6 +662,27 @@ def get_dims(self, variable): return out + # def _getncattr_netCDF4(self, x, attr): + # """Retrieve a netCDF attribute. + # + # :Parameters: + # + # x: variable, group, or dataset + # + # attr: `str` + # + # :Returns: + # + # """ + # return getattr(x, attr) + # + # def _getncattr_h5netcdf(self, x, attr): + # """Retrieve a netCDF attribute. + # + # + # """ + # return x.attrs[attr] + def getncattr(self, x, attr): """Retrieve a netCDF attribute. @@ -446,6 +702,22 @@ def getncattr(self, x, attr): # h5netcdf return x.attrs[attr] + # def _group_netCDF4(self, x): + # """Return a. + # + # :Returns: + # + # `Group` + # + # """ + # return x.group() + # + # def _group_h5netcdf(self, x): + # """Return a. + # + # """ + # return x._parent + def group(self, x): """Return a. @@ -461,6 +733,28 @@ def group(self, x): # h5netcdf return x._parent + # def _name_netCDF4(self, x): + # """Return the netCDF name, without its groups. + # + # :Returns: + # + # """ + # return x.name + # + # def _name_h5netcdf(self, x): + # """Return the netCDF name, without its groups. + # + # :Returns: + # + # `str` + # + # """ + # out = x.name + # if "/" in out: + # out = x.name.split("/")[-1] + # + # return out + def name(self, x): """Return the netCDF name, without its groups. @@ -476,6 +770,26 @@ def name(self, x): return out + # def _ncattrs_netCDF4(self, x): + # """Return netCDF attribute names. + # + # :Parameters: + # + # x: variable, group, or dataset + # + # :Returns: + # + # `list` + # + # """ + # return x.ncattrs() + # + # def _ncattrs_h5netcdf(self, x): + # """Return netCDF attribute names. + # + # """ + # return list(x.attrs) + def ncattrs(self, x): """Return netCDF attribute names. @@ -508,6 +822,29 @@ def parent(self, group): except AttributeError: return + # def _path_netCDF4(self, group): + # """Return a simulated unix directory path to a group. + # + # :Returns: + # + # `str` + # + # """ + # return group.path + # + # def _path_h5netcdf(self, group): + # """Return a simulated unix directory path to a group. + # + # :Returns: + # + # `str` + # + # """ + # try: + # return group.name + # except AttributeError: + # return "/" + def path(self, group): """Return a simulated unix directory path to a group. @@ -536,8 +873,8 @@ def flatten(self, output_ds): # or output_ds.data_model != 'NETCDF4': if ( output_ds == self.__input_file - or self.filepath(output_ds) == self.filepath(self.__input_file) - or self.data_model(output_ds) != "NETCDF4" + or output_ds.filepath() == self.filepath(self.__input_file) + or output_ds.data_model != "NETCDF4" ): raise ValueError( "Invalid inputs. 
Input and output datasets should be different, and output should be of " @@ -1131,11 +1468,16 @@ def resolve_references(self, var, old_var): :param old_var: original variable (in group structure) """ + var_attrs = self.attrs(var) + var_attrs_names = tuple(var_attrs) for attr in _AttributeProperties: # if attr.name in var.__dict__: - if attr.name in self.ncattrs(var): + # if attr.name in self.ncattrs(var): + if attr.name in var_attrs_names: # self.ncattrs(var): # attr_value = var.getncattr(attr.name) - attr_value = self.getncattr(var, attr.name) + attr_value = var_attrs[ + attr.name + ] # self.getncattr(var, attr.name) # Parse attribute value parsed_attr = parse_var_attr(attr_value) @@ -1182,11 +1524,15 @@ def adapt_references(self, var): :param var: flattened variable in which references should be renamed with new names """ + var_attrs = self.attrs(var) + var_attrs_names = tuple(var_attrs) for attr in _AttributeProperties: # if attr.name in var.__dict__: - if attr.name in self.ncattrs(var): + if attr.name in var_attrs_names: # self.ncattrs(var): # attr_value = var.getncattr(attr.name) - attr_value = self.getncattr(var, attr.name) + attr_value = var_attrs[ + attr.name + ] # self.getncattr(var, attr.name) # Parse attribute value parsed_attr = parse_var_attr(attr_value) diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 9bca65388..22bbb42d6 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -452,16 +452,22 @@ def file_close(self): >>> r.file_close() """ - for nc in self.read_vars["datasets"]: + g = self.read_vars + + for nc in g["datasets"]: nc.close() # Close temporary flattened files - for flat_file in self.read_vars["flat_files"]: + for flat_file in g["flat_files"]: flat_file.close() # Close the original grouped file (v1.8.8.1) - if "nc_grouped" in self.read_vars: - self.read_vars["nc_grouped"].close() + if "nc_grouped" in g: + g["nc_grouped"].close() + + # Close file-like object from S3 file systems + for filename in g["s3_file_objects"]: + filename.close() def file_open(self, filename, flatten=True, verbose=None): """Open the netCDf file for reading. @@ -498,21 +504,29 @@ def file_open(self, filename, flatten=True, verbose=None): netCDF = False HDF = False + # Deal with an file in an S3 object store u = urlparse(filename) if u.scheme == "s3": # Create an openable s3 file object - endpoint_url = f"https://{u.netloc}" - uri = u.path[1:] s3 = g["s3"] - if s3 is None: - s3 = { - "anon": True, - "client_kwargs": {"endpoint_url": endpoint_url}, - } + g["s3_file_system_options"][filename] = s3 + if "endpoint_url" not in s3: + # Derive endpoint_url from filename + s3 = g["s3"].copy() + s3["endpoint_url"] = f"https://{u.netloc}" + + key = tuple(sorted(s3.items())) + s3_file_systems = g["s3_file_systems"] + fs = s3_file_systems.get(key) + if fs is None: + # An s3 file system with these options does not exist, + # so create one. 
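# Sketch of the caching performed below (semantics assumed from this
# hunk): one S3FileSystem per distinct option set, keyed by the sorted
# dict items so that equal option dicts share a single file system.
from s3fs import S3FileSystem

s3_file_systems = {}

def cached_fs(options):
    key = tuple(sorted(options.items()))
    fs = s3_file_systems.get(key)
    if fs is None:
        fs = S3FileSystem(**options)
        s3_file_systems[key] = fs
    return fs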
+ fs = S3FileSystem(**s3) + s3_file_systems[key] = fs + + filename = fs.open(u.path[1:], "rb") + g["s3_file_objects"].append(filename) - fs = S3FileSystem(**s3) - filename = fs.open(uri, "rb") - print(filename, type(filename)) if is_log_level_detail(logger): logger.debug( f" s3: s3fs.S3FileSystem options: {s3}\n" @@ -520,12 +534,15 @@ def file_open(self, filename, flatten=True, verbose=None): # nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) try: - # raise OSError() + if g["no_HDF"]: + raise OSError("Requested to not use HDF to open file") + nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) HDF = True except OSError: - # File is not HDF. Assume instead that it's netCDF3 and - # open it with netCDF4. + # File could not be read by h5netcdf, or we've insisted + # that we don't use h5netcdf, so try to open it with + # netCDF4. try: nc = netCDF4.Dataset(filename, "r") netCDF = True @@ -536,15 +553,10 @@ def file_open(self, filename, flatten=True, verbose=None): g["original_HDF"] = HDF g["original_netCDF"] = netCDF + # ------------------------------------------------------------ # If the file has a group structure then flatten it (CF>=1.8) # ------------------------------------------------------------ - - if HDF: - print("Opened with h5netcdf") - else: - print("Opened with netCDF4") - if flatten and nc.groups: # if HDF: # # TODOHDF: Can't yet use HDF access to process groups @@ -852,6 +864,8 @@ def read( warn_valid=False, domain=False, s3=None, + _s3_file_systems=None, + _no_HDF=False, ): """Reads a netCDF dataset from file or OPenDAP URL. @@ -896,6 +910,21 @@ def read( .. versionadded:: (cfdm) 1.9.0.0 + s3: `bool`, optional + See `cfdm.read` for details + + .. versionadded:: (cfdm) HDFVER + + + _s3_file_systems: `dict`, optional + TODOHDF + + .. versionadded:: (cfdm) HDFVER + + _no_HDF: `bool`, optional + See `cfdm.read` for details + + .. 
versionadded:: (cfdm) HDFVER :Returns: `list` @@ -994,10 +1023,24 @@ def read( # -------------------------------------------------------- # CFA # -------------------------------------------------------- + # "cfa": False, + # -------------------------------------------------------- + # HDF + # -------------------------------------------------------- + # + "no_HDF": _no_HDF, + # -------------------------------------------------------- # S3 + # -------------------------------------------------------- # "s3": s3, + # + "s3_file_systems": {}, + # + "s3_file_system_options": {}, + # + "s3_file_objects": [], } g = self.read_vars @@ -1006,6 +1049,15 @@ def read( for version in ("1.6", "1.7", "1.8", "1.9", "1.10", "1.11"): g["version"][version] = Version(version) + if s3 is None: + # Default s3 file system options + g["s3"] = {"anon": True} + + if _s3_file_systems is not None: + # Update S3 file systems with those passed in as keyword + # parameter + g["s3_file_systems"] = _s3_file_systems + # ------------------------------------------------------------ # Add custom read vars # ------------------------------------------------------------ @@ -2123,8 +2175,14 @@ def _get_variables_from_external_files(self, netcdf_external_variables): "\nScanning external file:\n-----------------------" ) # pragma: no cover + # Note: We pass in the s3 file system (if any) of the + # parent file in case we can resuse it for the + # external file external_read_vars = self.read( - external_file, _scan_only=True, verbose=verbose + external_file, + _scan_only=True, + _s3_file_systems=read_vars["s3_file_systems"], + verbose=verbose, ) logger.info( @@ -6107,6 +6165,10 @@ def _create_netcdfarray( "missing_values": missing_values, } + s3 = g["s3_file_system_options"].get(filename) + if s3 is not None: + kwargs["s3"] = s3 + if return_kwargs_only: return kwargs diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index b96345878..1a30aa167 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -17,6 +17,8 @@ def read( warn_valid=False, mask=True, domain=False, + s3=None, + _no_HDF=False, _implementation=_implementation, ): """Read field or domain constructs from a dataset. @@ -262,6 +264,24 @@ def read( .. versionadded:: (cfdm) 1.9.0.0 + s3: `dict`, optional + + Provide keyword parameters to `s3fs.S3FileSystem` to + control the opening of files in an S3 object store. By + default, or if `None`, then ``s3={'anon': True, + 'endpoint_url': }`` is used, where + ```` is derived from each S3 file name. For + example, for file name + ``'s3://my-object-store/data/file.nc'``, + ```` will be ``'https://my-object-store'``. + + .. versionadded:: (cfdm) HDFVER + + _no_HDF: `bool`, optional + TODOHDF + + .. versionadded:: (cfdm) HDFVER + _implementation: (subclass of) `CFDMImplementation`, optional Define the CF data model implementation that provides the returned field constructs. 
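# Hypothetical usage of the s3 keyword documented above (object store
# name and options invented; anonymous access assumed):
import cfdm

fields = cfdm.read(
    "s3://my-object-store/data/file.nc",
    s3={"anon": True, "endpoint_url": "https://my-object-store"},
)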
@@ -334,6 +354,8 @@ def read( warn_valid=warn_valid, mask=mask, domain=domain, + s3=s3, + _no_HDF=_no_HDF, extra_read_vars=None, ) except MaskError: From 45bcfabbfccc83299c334ce4a92d80cc043f0181 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 26 Jan 2024 10:29:57 +0000 Subject: [PATCH 16/88] dev --- cfdm/data/hdfarray.py | 4 +++- cfdm/data/mixin/filearraymixin.py | 27 +++++++-------------------- cfdm/data/mixin/netcdffilemixin.py | 20 ++++++++++++++++++++ cfdm/read_write/read.py | 22 ++++++++++++---------- 4 files changed, 42 insertions(+), 31 deletions(-) diff --git a/cfdm/data/hdfarray.py b/cfdm/data/hdfarray.py index 08baf08bc..f0b6b724d 100644 --- a/cfdm/data/hdfarray.py +++ b/cfdm/data/hdfarray.py @@ -210,8 +210,9 @@ def __getitem__(self, indices): variable = dataset.variables[address] self.variable = variable array = variable[indices] - + print (11) if mask: + print (22) self.scale = True self.always_mask = False self._isvlen = variable.dtype == np.dtype("O") @@ -274,6 +275,7 @@ def _check_safecast(self, attname): def _mask(self, data): """TODOHDF.""" + print ('MASK', data.shape) # Private function for creating a masked array, masking # missing_values and/or _FillValues. diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index be217ef34..39ad04240 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -1,5 +1,6 @@ from urllib.parse import urlparse +import h5netcdf from s3fs import S3FileSystem from ...functions import abspath @@ -178,26 +179,6 @@ def get_formats(self): """ return (self.get_format(),) * len(self.get_filenames()) - def get_s3(self): - """Return `s3fs.S3FileSystem` options for accessing S3 files. - - .. versionadded:: (cfdm) HDFVER - - :Returns: - - `dict` - Keyword parameters to be passed to - `s3fs.S3FileSystem`. If there is no ``'endpoint_url'`` - key then `open` will automatically derive one from the - filename. - - """ - out = self._get_component("s3", None) - if not out: - return {} - - return out.copy() - def open(self, func, *args, **kwargs): """Return a dataset file object and address. @@ -245,6 +226,12 @@ def open(self, func, *args, **kwargs): fs = S3FileSystem(**s3) filename = fs.open(url.path[1:], "rb") + # Always use h5netcdf to access an S3 file + if func != h5netcdf.File: + func = h5netcdf.File + args NO = () + kwargs = {'decode_vlen_strings': True} + try: nc = func(filename, *args, **kwargs) except FileNotFoundError: diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py index 4ac3b5a70..48588caf3 100644 --- a/cfdm/data/mixin/netcdffilemixin.py +++ b/cfdm/data/mixin/netcdffilemixin.py @@ -186,6 +186,26 @@ def get_mask(self): """ return self._get_component("mask") + def get_s3(self): + """Return `s3fs.S3FileSystem` options for accessing S3 files. + + .. versionadded:: (cfdm) HDFVER + + :Returns: + + `dict` + Keyword parameters to be passed to + `s3fs.S3FileSystem`. If there is no ``'endpoint_url'`` + key then `open` will automatically derive one from the + filename. + + """ + out = self._get_component("s3", None) + if not out: + return {} + + return out.copy() + def get_missing_values(self): """The missing value indicators from the netCDF variable. diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 1a30aa167..c3cb381bc 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -264,17 +264,19 @@ def read( .. 
versionadded:: (cfdm) 1.9.0.0 - s3: `dict`, optional - - Provide keyword parameters to `s3fs.S3FileSystem` to + s3: `dict` or `None`, optional + Keyword parameters to be passed to `s3fs.S3FileSystem` to control the opening of files in an S3 object store. By - default, or if `None`, then ``s3={'anon': True, - 'endpoint_url': }`` is used, where - ```` is derived from each S3 file name. For - example, for file name - ``'s3://my-object-store/data/file.nc'``, - ```` will be ``'https://my-object-store'``. - + default, or if `None`, then ``s3={'anon': True}``. Ignored + for file names that don't start with ``s3:``. + + If and only if *s3* has no ``'endpoint_url'`` key, then + one will be automatically derived from the *filename*. For + example, if *filename* was + ``'s3://object-store/data/file.nc'``, then an + ``'endpoint_url'`` key with value + ``'https://object-store'`` would be created. + .. versionadded:: (cfdm) HDFVER _no_HDF: `bool`, optional From 03808dfe1b528c7c44139b3d45654114ae4a4938 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 26 Jan 2024 17:59:58 +0000 Subject: [PATCH 17/88] dev --- cfdm/data/hdfarray.py | 275 ++++++++++++++++++++++++++- cfdm/data/mixin/filearraymixin.py | 26 +-- cfdm/data/mixin/netcdffilemixin.py | 32 ++-- cfdm/read_write/netcdf/netcdfread.py | 1 + 4 files changed, 297 insertions(+), 37 deletions(-) diff --git a/cfdm/data/hdfarray.py b/cfdm/data/hdfarray.py index f0b6b724d..cfb0664a5 100644 --- a/cfdm/data/hdfarray.py +++ b/cfdm/data/hdfarray.py @@ -84,9 +84,11 @@ def __init__( attributes. See `get_missing_values` for details. s3: `dict` or `None`, optional - `s3fs.S3FileSystem` options for accessing S3 files. - If there is no ``'endpoint_url'`` key then `open` will - automatically derive one from the filename. + The `s3fs.S3FileSystem` options for accessing S3 + files. If there are no options then ``anon=True`` is + assumed, and if there is no ``'endpoint_url'`` key + then one will automatically be derived one for each S3 + filename. .. versionadded:: (cfdm) HDFVER @@ -210,14 +212,23 @@ def __getitem__(self, indices): variable = dataset.variables[address] self.variable = variable array = variable[indices] - print (11) + print(11) if mask: - print (22) + print(22) self.scale = True self.always_mask = False self._isvlen = variable.dtype == np.dtype("O") + isvlen = variable.dtype == np.dtype("O") if not self._isvlen: - array = self._mask(array) + array = self._mask2( + array, + variable.dtype, + variable.attrs, + isvlen, + self.scale, + self.always_mask, + ) + # array = self._mask(array) array = self._scale(array) # Set the units, if they haven't been set already. @@ -242,6 +253,37 @@ def __getitem__(self, indices): array = self._process_string_and_char(array) return array + @classmethod + def _check_safecast2(cls, attname, var_dtype, attrs): + """TODOHDF. + + Check to see that variable attribute exists can can be safely + cast to variable data type. + + """ + # attrs = self.variable.attrs + if attname in attrs: + attvalue = attrs[attname] + att = np.array(attvalue) + else: + return False, None + + is_safe = True + try: + atta = np.array(att, var_dtype) + except ValueError: + is_safe = False + else: + is_safe = _safecast(att, atta) + + if not is_safe: + logger.warn( + f"WARNING: {attname} not used since it cannot " + "be safely cast to variable data type" + ) # pragma: no cover + + return is_safe, attvalue + def _check_safecast(self, attname): """TODOHDF. 
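# A standalone sketch of the safe-cast test performed by
# _check_safecast2 above: an attribute is honoured only if converting
# it to the variable's dtype does not change its value (round-trip
# comparison assumed equivalent to the _safecast helper).
import numpy as np

def is_safecast(attvalue, var_dtype):
    att = np.array(attvalue)
    try:
        atta = np.array(att, var_dtype)
    except ValueError:
        return False
    return bool((att == atta).all())

print(is_safecast(np.float64(-999.0), np.dtype("i2")))  # True
print(is_safecast(np.float64(-999.9), np.dtype("i2")))  # False: truncates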
@@ -275,7 +317,7 @@ def _check_safecast(self, attname): def _mask(self, data): """TODOHDF.""" - print ('MASK', data.shape) + print("MASK", data.shape) # Private function for creating a masked array, masking # missing_values and/or _FillValues. @@ -475,6 +517,225 @@ def _mask(self, data): return data + @classmethod + def _mask2( + cls, data, var_dtype, attrs, isvlen, scale=False, always_mask=False + ): + """TODOHDF.""" + print("MASK", data.shape) + + if isvlen: + return data + + # Private function for creating a masked array, masking + # missing_values and/or _FillValues. + + # attrs = self.variable.attrs + is_unsigned = attrs.get("_Unsigned", False) in ("true", "True") + is_unsigned_int = is_unsigned and data.dtype.kind == "i" + + dtype = data.dtype + if scale and is_unsigned_int: + # Only do this if autoscale option is on. + dtype_unsigned_int = f"{dtype.byteorder}u{dtype.itemsize}" + data = data.view(dtype_unsigned_int) + + totalmask = np.zeros(data.shape, np.bool_) + fill_value = None + safe_missval, missing_value = cls._check_safecast2( + "missing_value", var_dtype, attrs + ) + if safe_missval: + mval = np.array(missing_value, var_dtype) + if scale and is_unsigned_int: + mval = mval.view(dtype_unsigned_int) + + # create mask from missing values. + mvalmask = np.zeros(data.shape, np.bool_) + if mval.shape == (): # mval a scalar. + mval = (mval,) # make into iterable. + + for m in mval: + # is scalar missing value a NaN? + try: + mvalisnan = np.isnan(m) + except TypeError: + # isnan fails on some dtypes + mvalisnan = False + + if mvalisnan: + mvalmask += np.isnan(data) + else: + mvalmask += data == m + + if mvalmask.any(): + # Set fill_value for masked array to missing_value (or + # 1st element if missing_value is a vector). + fill_value = mval[0] + totalmask += mvalmask + + # set mask=True for data == fill value + safe_fillval, _FillValue = cls._check_safecast2( + "_FillValue", dtype, attrs + ) + if safe_fillval: + fval = np.array(_FillValue, var_dtype) + if scale and is_unsigned_int: + fval = fval.view(dtype_unsigned_int) + + # is _FillValue a NaN? + try: + fvalisnan = np.isnan(fval) + except Exception: + # isnan fails on some dtypes + fvalisnan = False + + if fvalisnan: + mask = np.isnan(data) + elif (data == fval).any(): + mask = data == fval + else: + mask = None + + if mask is not None: + if fill_value is None: + fill_value = fval + + totalmask += mask + else: + # Don't return masked array if variable filling is disabled. + no_fill = 0 + # with nogil: + # ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) + # _ensure_nc_success(ierr) + + # if no_fill is not 1, and not a byte variable, then use + # default fill value. from + # http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c/Fill-Values.html#Fill-Values + # "If you need a fill value for a byte variable, it is + # recommended that you explicitly define an appropriate + # _FillValue attribute, as generic utilities such as + # ncdump will not assume a default fill value for byte + # variables." Explained here too: + # http://www.unidata.ucar.edu/software/netcdf/docs/known_problems.html#ncdump_ubyte_fill + # "There should be no default fill values when reading any + # byte type, signed or unsigned, because the byte ranges + # are too small to assume one of the values should appear + # as a missing value unless a _FillValue attribute is set + # explicitly." 
(do this only for non-vlens, since vlens + # don't have a default _FillValue) + if not isvlen and ( + no_fill != 1 or dtype.str[1:] not in ("u1", "i1") + ): + fillval = np.array(default_fillvals[dtype.str[1:]], dtype) + has_fillval = data == fillval + # if data is an array scalar, has_fillval will be a + # boolean. in that case convert to an array. + # if type(has_fillval) == bool: + if isinstance(has_fillval, bool): + has_fillval = np.asarray(has_fillval) + + if has_fillval.any(): + if fill_value is None: + fill_value = fillval + + mask = data == fillval + totalmask += mask + + # Set mask=True for data outside valid_min, valid_max. + validmin = None + validmax = None + # If valid_range exists use that, otherwise look for + # valid_min, valid_max. No special treatment of byte data as + # described at + # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). + safe_validrange, valid_range = cls._check_safecast2( + "valid_range", var_dtype, attrs + ) + safe_validmin, valid_min = cls._check_safecast2( + "valid_min", var_dtype, attrs + ) + safe_validmax, valid_max = cls._check_safecast2( + "valid_max", var_dtype, attrs + ) + if safe_validrange and valid_range.size == 2: + validmin = np.array(valid_range[0], var_dtype) + validmax = np.array(valid_range[1], var_dtype) + else: + if safe_validmin: + validmin = np.array(valid_min, var_dtype) + + if safe_validmax: + validmax = np.array(valid_max, var_dtype) + + if validmin is not None and scale and is_unsigned_int: + validmin = validmin.view(dtype_unsigned_int) + + if validmax is not None and scale and is_unsigned_int: + validmax = validmax.view(dtype_unsigned_int) + + # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). + # "If the data type is byte and _FillValue is not explicitly + # defined, then the valid range should include all possible + # values. Otherwise, the valid range should exclude the + # _FillValue (whether defined explicitly or by default) as + # follows. If the _FillValue is positive then it defines a + # valid maximum, otherwise it defines a valid minimum." + if safe_fillval: + fval = np.array(_FillValue, dtype) + else: + k = dtype.str[1:] + if k in ("u1", "i1"): + fval = None + else: + fval = np.array(default_fillvals[k], dtype) + + if var_dtype.kind != "S": + # Don't set mask for character data + + # Setting valid_min/valid_max to the _FillVaue is too + # surprising for many users (despite the netcdf docs + # attribute best practices suggesting clients should do + # this). + if validmin is not None: + totalmask += data < validmin + + if validmax is not None: + totalmask += data > validmax + + if fill_value is None and fval is not None: + fill_value = fval + + # If all else fails, use default _FillValue as fill_value for + # masked array. + if fill_value is None: + fill_value = default_fillvals[dtype.str[1:]] + + # Create masked array with computed mask + masked_values = bool(totalmask.any()) + if masked_values: + data = np.ma.masked_array( + data, mask=totalmask, fill_value=fill_value + ) + else: + # Always return masked array, if no values masked. + data = np.ma.masked_array(data) + + # Scalar array with mask=True should be converted to + # np.ma.MaskedConstant to be consistent with slicing + # behavior of masked arrays. + if data.shape == () and data.mask.all(): + # Return a scalar numpy masked constant not a 0-d masked + # array, so that data == np.ma.masked. 
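+            # Illustrative example of this behaviour (assuming
+            # numpy as np):
+            #     np.ma.masked_array(0, mask=True)[()] is np.ma.masked
+            #     # -> True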
+ data = data[()] + + elif not always_mask and not masked_values: + # Return a regular numpy array if requested and there are + # no missing values + data = np.array(data, copy=False) + + return data + def _scale(self, data): """TODOHDF.""" # If variable has scale_factor and add_offset attributes, diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index 39ad04240..bd8648c50 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -1,6 +1,5 @@ from urllib.parse import urlparse -import h5netcdf from s3fs import S3FileSystem from ...functions import abspath @@ -13,6 +12,14 @@ class FileArrayMixin: """ + def __repr__(self): + """Called by the `repr` built-in function. + + x.__repr__() <==> repr(x) + + """ + return f"<{self.__class__.__name__}{self.shape}: {self}>" + def __str__(self): """Called by the `str` built-in function. @@ -66,8 +73,11 @@ def get_address(self, default=AttributeError()): """ addresses = self.get_addresses() - if len(addresses) == 1: + n = len(addresses) + if n == 1: return addresses[0] + elif n > 1: + return if default is None: return @@ -205,7 +215,6 @@ def open(self, func, *args, **kwargs): """ # Loop round the files, returning as soon as we find one that # works. - s3 = None filenames = self.get_filenames() for i, (filename, address) in enumerate( zip(filenames, self.get_addresses()) @@ -216,8 +225,9 @@ def open(self, func, *args, **kwargs): filename = url.path elif url.scheme == "s3": # Create an openable S3 file object - if s3 is None: - s3 = self.get_s3() + s3 = self.get_s3() + if not s3: + s3["anon"] = True if "endpoint_url" not in s3: # Derive endpoint_url from filename @@ -226,12 +236,6 @@ def open(self, func, *args, **kwargs): fs = S3FileSystem(**s3) filename = fs.open(url.path[1:], "rb") - # Always use h5netcdf to access an S3 file - if func != h5netcdf.File: - func = h5netcdf.File - args NO = () - kwargs = {'decode_vlen_strings': True} - try: nc = func(filename, *args, **kwargs) except FileNotFoundError: diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py index 48588caf3..f7a35c1a5 100644 --- a/cfdm/data/mixin/netcdffilemixin.py +++ b/cfdm/data/mixin/netcdffilemixin.py @@ -11,21 +11,13 @@ class NetCDFFileMixin: """ - def __repr__(self): - """Called by the `repr` built-in function. - - x.__repr__() <==> repr(x) - - """ - return f"<{self.__class__.__name__}{self.shape}: {self}>" - - def __str__(self): - """Called by the `str` built-in function. - - x.__str__() <==> str(x) - - """ - return f"{self.get_filename(None)}, {self.get_address()}" + # def __repr__(self): + # """Called by the `repr` built-in function. + # + # x.__repr__() <==> repr(x) + # + # """ + # return f"<{self.__class__.__name__}{self.shape}: {self}>" def _get_attr(self, var, attr): """TODOHDF. @@ -39,7 +31,8 @@ def _get_attr(self, var, attr): "Must implement {self.__class__.__name__}._get_attr" ) # pragma: no cover - def _process_string_and_char(self, array): + @classmethod + def _process_string_and_char(cls, array): """TODOHDF.""" string_type = isinstance(array, str) kind = array.dtype.kind @@ -194,9 +187,10 @@ def get_s3(self): :Returns: `dict` - Keyword parameters to be passed to - `s3fs.S3FileSystem`. If there is no ``'endpoint_url'`` - key then `open` will automatically derive one from the + The `s3fs.S3FileSystem` options for accessing S3 + files. 
If there are no options then ``anon=True`` is + assumed, and if there is no ``'endpoint_url'`` key + then one will automatically be derived one for each S3 filename. """ diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 22bbb42d6..70d967261 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -535,6 +535,7 @@ def file_open(self, filename, flatten=True, verbose=None): # nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) try: if g["no_HDF"]: + print(99999999999999) raise OSError("Requested to not use HDF to open file") nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) From c28e36d7797495586f71d963936bb1181c952f5b Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sat, 27 Jan 2024 17:30:23 +0000 Subject: [PATCH 18/88] dev --- cfdm/data/mask.py | 641 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 641 insertions(+) create mode 100644 cfdm/data/mask.py diff --git a/cfdm/data/mask.py b/cfdm/data/mask.py new file mode 100644 index 000000000..1baaf1432 --- /dev/null +++ b/cfdm/data/mask.py @@ -0,0 +1,641 @@ +import logging + +import netCDF4 +import numpy as np + +_safecast = netCDF4.utils._safecast +default_fillvals = netCDF4.default_fillvals + +logger = logging.getLogger(__name__) + + +class Mask: + +# variable = dataset.variables[address] +# self.variable = variable +# array = variable[indices] +# print(11) +# if mask: +# print(22) +# self.scale = True +# self.always_mask = False +# self._isvlen = variable.dtype == np.dtype("O") +# isvlen = variable.dtype == np.dtype("O") +# if not self._isvlen: +# array = self._mask2( +# array, +# variable.dtype, +# variable.attrs, +# isvlen, +# self.scale, +# self.always_mask, +# ) +# # array = self._mask(array) +# array = self._scale(array) +# +# string_type = isinstance(array, str) +# if string_type: +# # -------------------------------------------------------- +# # A netCDF string type scalar variable comes out as Python +# # str object, so convert it to a numpy array. +# # -------------------------------------------------------- +# array = np.array(array, dtype=f"U{len(array)}") +# +# if not self.ndim: +# # Hmm netCDF4 has a thing for making scalar size 1, 1d +# array = array.squeeze() +# +# array = self._process_string_and_char(array) +# return array + + @classmethod + def _check_safecast(cls, attname, var_dtype, attrs): + """TODOHDF. + + Check to see that variable attribute exists can can be safely + cast to variable data type. + + """ + # attrs = self.variable.attrs + if attname in attrs: + attvalue = attrs[attname] + att = np.array(attvalue) + else: + return False, None + + is_safe = True + try: + atta = np.array(att, var_dtype) + except ValueError: + is_safe = False + else: + is_safe = _safecast(att, atta) + + if not is_safe: + logger.warn( + f"WARNING: {attname} not used since it cannot " + "be safely cast to variable data type" + ) # pragma: no cover + + return is_safe, attvalue + +# def _mask(self, data): +# """TODOHDF.""" +# print("MASK", data.shape) +# # Private function for creating a masked array, masking +# # missing_values and/or _FillValues. +# +# attrs = self.variable.attrs +# is_unsigned = attrs.get("_Unsigned", False) in ("true", "True") +# is_unsigned_int = is_unsigned and data.dtype.kind == "i" +# +# dtype = data.dtype +# if self.scale and is_unsigned_int: +# # Only do this if autoscale option is on. 
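+#            # Illustrative example of this reinterpretation
+#            # (assuming numpy as np): a signed byte of -1 carries
+#            # the unsigned value 255:
+#            #     np.array(-1, "i1").view("u1")  # -> 255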
+# dtype_unsigned_int = f"{dtype.byteorder}u{dtype.itemsize}" +# data = data.view(dtype_unsigned_int) +# +# totalmask = np.zeros(data.shape, np.bool_) +# fill_value = None +# safe_missval = self._check_safecast("missing_value") +# if safe_missval: +# mval = np.array(self.missing_value, self.dtype) +# if self.scale and is_unsigned_int: +# mval = mval.view(dtype_unsigned_int) +# +# # create mask from missing values. +# mvalmask = np.zeros(data.shape, np.bool_) +# if mval.shape == (): # mval a scalar. +# mval = (mval,) # make into iterable. +# +# for m in mval: +# # is scalar missing value a NaN? +# try: +# mvalisnan = np.isnan(m) +# except TypeError: +# # isnan fails on some dtypes +# mvalisnan = False +# +# if mvalisnan: +# mvalmask += np.isnan(data) +# else: +# mvalmask += data == m +# +# if mvalmask.any(): +# # Set fill_value for masked array to missing_value (or +# # 1st element if missing_value is a vector). +# fill_value = mval[0] +# totalmask += mvalmask +# +# # set mask=True for data == fill value +# safe_fillval = self._check_safecast("_FillValue") +# if safe_fillval: +# fval = np.array(self._FillValue, self.dtype) +# if self.scale and is_unsigned_int: +# fval = fval.view(dtype_unsigned_int) +# +# # is _FillValue a NaN? +# try: +# fvalisnan = np.isnan(fval) +# except Exception: +# # isnan fails on some dtypes +# fvalisnan = False +# +# if fvalisnan: +# mask = np.isnan(data) +# elif (data == fval).any(): +# mask = data == fval +# else: +# mask = None +# +# if mask is not None: +# if fill_value is None: +# fill_value = fval +# +# totalmask += mask +# else: +# # Don't return masked array if variable filling is disabled. +# no_fill = 0 +# # with nogil: +# # ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) +# # _ensure_nc_success(ierr) +# +# # if no_fill is not 1, and not a byte variable, then use +# # default fill value. from +# # http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c/Fill-Values.html#Fill-Values +# # "If you need a fill value for a byte variable, it is +# # recommended that you explicitly define an appropriate +# # _FillValue attribute, as generic utilities such as +# # ncdump will not assume a default fill value for byte +# # variables." Explained here too: +# # http://www.unidata.ucar.edu/software/netcdf/docs/known_problems.html#ncdump_ubyte_fill +# # "There should be no default fill values when reading any +# # byte type, signed or unsigned, because the byte ranges +# # are too small to assume one of the values should appear +# # as a missing value unless a _FillValue attribute is set +# # explicitly." (do this only for non-vlens, since vlens +# # don't have a default _FillValue) +# if not self._isvlen and ( +# no_fill != 1 or dtype.str[1:] not in ("u1", "i1") +# ): +# fillval = np.array(default_fillvals[dtype.str[1:]], dtype) +# has_fillval = data == fillval +# # if data is an array scalar, has_fillval will be a +# # boolean. in that case convert to an array. +# # if type(has_fillval) == bool: +# if isinstance(has_fillval, bool): +# has_fillval = np.asarray(has_fillval) +# +# if has_fillval.any(): +# if fill_value is None: +# fill_value = fillval +# +# mask = data == fillval +# totalmask += mask +# +# # Set mask=True for data outside valid_min, valid_max. +# validmin = None +# validmax = None +# # If valid_range exists use that, otherwise look for +# # valid_min, valid_max. No special treatment of byte data as +# # described at +# # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). 
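+#        # Illustrative CF example: valid_range = [0, 100] masks
+#        # values outside [0, 100]; valid_min and valid_max behave
+#        # the same individually when valid_range is absent.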
+# safe_validrange = self._check_safecast("valid_range") +# safe_validmin = self._check_safecast("valid_min") +# safe_validmax = self._check_safecast("valid_max") +# if safe_validrange and self.valid_range.size == 2: +# validmin = np.array(self.valid_range[0], self.dtype) +# validmax = np.array(self.valid_range[1], self.dtype) +# else: +# if safe_validmin: +# validmin = np.array(self.valid_min, self.dtype) +# +# if safe_validmax: +# validmax = np.array(self.valid_max, self.dtype) +# +# if validmin is not None and self.scale and is_unsigned_int: +# validmin = validmin.view(dtype_unsigned_int) +# +# if validmax is not None and self.scale and is_unsigned_int: +# validmax = validmax.view(dtype_unsigned_int) +# +# # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). +# # "If the data type is byte and _FillValue is not explicitly +# # defined, then the valid range should include all possible +# # values. Otherwise, the valid range should exclude the +# # _FillValue (whether defined explicitly or by default) as +# # follows. If the _FillValue is positive then it defines a +# # valid maximum, otherwise it defines a valid minimum." +# if safe_fillval: +# fval = np.array(self._FillValue, dtype) +# else: +# k = dtype.str[1:] +# if k in ("u1", "i1"): +# fval = None +# else: +# fval = np.array(default_fillvals[k], dtype) +# +# if self.dtype.kind != "S": +# # Don't set mask for character data +# +# # Setting valid_min/valid_max to the _FillVaue is too +# # surprising for many users (despite the netcdf docs +# # attribute best practices suggesting clients should do +# # this). +# if validmin is not None: +# totalmask += data < validmin +# +# if validmax is not None: +# totalmask += data > validmax +# +# if fill_value is None and fval is not None: +# fill_value = fval +# +# # If all else fails, use default _FillValue as fill_value for +# # masked array. +# if fill_value is None: +# fill_value = default_fillvals[dtype.str[1:]] +# +# # Create masked array with computed mask +# masked_values = bool(totalmask.any()) +# if masked_values: +# data = np.ma.masked_array( +# data, mask=totalmask, fill_value=fill_value +# ) +# else: +# # Always return masked array, if no values masked. +# data = np.ma.masked_array(data) +# +# # Scalar array with mask=True should be converted to +# # np.ma.MaskedConstant to be consistent with slicing +# # behavior of masked arrays. +# if data.shape == () and data.mask.all(): +# # Return a scalar numpy masked constant not a 0-d masked +# # array, so that data == np.ma.masked. +# data = data[()] +# +# elif not self.always_mask and not masked_values: +# # Return a regular numpy array if requested and there are +# # no missing values +# data = np.array(data, copy=False) +# +# +# return data + + @classmethod + def _process_string_and_char(cls, array): + """TODOHDF.""" + string_type = isinstance(array, str) + kind = array.dtype.kind + if not string_type and kind in "SU": + # Collapse by concatenation the outermost (fastest + # varying) dimension of char array into + # memory. E.g. [['a','b','c']] becomes ['abc'] + if kind == "U": + array = array.astype("S", copy=False) + + array = netCDF4.chartostring(array) + shape = array.shape + array = np.array([x.rstrip() for x in array.flat], dtype="U") + array = np.reshape(array, shape) + array = np.ma.masked_where(array == "", array) + elif not string_type and kind == "O": + # An N-d (N>=1) string variable comes out as a numpy + # object array, so convert it to numpy string array. 
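+            # Illustrative example (assuming numpy as np):
+            #     np.array(["ab", ""], dtype="O").astype("U")
+            #     # -> array(['ab', ''], dtype='<U2')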
+ array = array.astype("U", copy=False) + + # Mask the VLEN variable + array = np.ma.where(array == "", np.ma.masked, array) + + return array + + @classmethod + def _process_char_array(cls, array, mask=True): + """TODOHDF.""" + # Collapse by concatenation the outermost (fastest + # varying) dimension of char array into + # memory. E.g. [['a','b','c']] becomes ['abc'] + if kind == "U": + array = array.astype("S", copy=False) + + array = netCDF4.chartostring(array) + shape = array.shape + array = np.array([x.rstrip() for x in array.flat], dtype="U") + array = np.reshape(array, shape) + if mask: + array = np.ma.masked_where(array == "", array) + if not np.ma.is_masked(data): + array = np.array(array, copy=False) + + return array + + @classmethod + def _process_string(cls, data, mask=True): + """TODOHDF.""" + if mask and data == "": + data = np.ma.masked_all((), dtype=f"U{len(data)}") + else: + data = np.array(data, dtype="U") + + return data + + @classmethod + def _process_object_array(cls, array, mask=True): + """TODOHDF.""" + array = array.astype("U", copy=False) + if mask: + array = np.ma.where(array == "", np.ma.masked, array) + if not np.ma.is_masked(data): + array = np.array(array, copy=False) + + return array + + def _is_char(cls, data): + return data.dtype.kind in "SU" # isinstance(data.item(0), (str, bytes)) + + def _is_string(cls, data): + return data.dtype.kind in "O" + + @classmethod + def mask_and_scale(cls, mask=True, scale=True): + """ + """ + if isinstance(data, str): + return cls._process_string(data, mask=mask) + + if _is_string(data): + return cls._process_object_array(data, mask=mask) + + if _is_char(data): + return cls._process_char_array(data, mask=mask) + + if mask or scale: + is_unsigned_int = attrs.get("_Unsigned", False) in ("true", "True") + if is_unsigned_int: + dtype = data.dtype + dtype_unsigned_int = f"{dtype.byteorder}u{dtype.itemsize}" + data = data.view(dtype_unsigned_int) + + if mask: + data = cls._mask(data.scale=scale, always_mask=False) + + if scale: + data = cls._scale(data, attrs) + + return data + + @classmethod + def _mask( + cls, data, var_dtype, attrs, scale=True, always_mask=False + ): + """TODOHDF.""" + print("MASK", data.shape) + + if isinstance(data, str): + return cls._process_string(data) + + if _is_string(data): + return cls._process_object_array(data) + + if _is_char(data): + return cls._process_char_array(data) + + dtype = data.dtype +# is_unsigned = attrs.get("_Unsigned", False) in ("true", "True") +# is_unsigned_int = is_unsigned and data.dtype.kind == "i" +# +# dtype = data.dtype +# if scale and is_unsigned_int: +# # Only do this if autoscale option is on. +# dtype_unsigned_int = f"{dtype.byteorder}u{dtype.itemsize}" +# data = data.view(dtype_unsigned_int) + + totalmask = np.zeros(data.shape, np.bool_) + fill_value = None + safe_missval, missing_value = cls._check_safecast( + "missing_value", var_dtype, attrs + ) + if safe_missval: + mval = np.array(missing_value, var_dtype) + if scale and is_unsigned_int: + mval = mval.view(dtype_unsigned_int) + + # create mask from missing values. + mvalmask = np.zeros(data.shape, np.bool_) + if mval.shape == (): # mval a scalar. + mval = (mval,) # make into iterable. + + for m in mval: + # is scalar missing value a NaN? 
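+            # Illustrative example: np.isnan(np.nan) is True, but
+            # np.isnan raises TypeError for a non-numeric missing
+            # value such as b"none", hence the try/except below.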
+ try: + mvalisnan = np.isnan(m) + except TypeError: + # isnan fails on some dtypes + mvalisnan = False + + if mvalisnan: + mvalmask += np.isnan(data) + else: + mvalmask += data == m + + if mvalmask.any(): + # Set fill_value for masked array to missing_value (or + # 1st element if missing_value is a vector). + fill_value = mval[0] + totalmask += mvalmask + + # set mask=True for data == fill value + safe_fillval, _FillValue = cls._check_safecast( + "_FillValue", dtype, attrs + ) + if safe_fillval: + fval = np.array(_FillValue, var_dtype) + if scale and is_unsigned_int: + fval = fval.view(dtype_unsigned_int) + + # is _FillValue a NaN? + try: + fvalisnan = np.isnan(fval) + except Exception: + # isnan fails on some dtypes + fvalisnan = False + + if fvalisnan: + mask = np.isnan(data) + elif (data == fval).any(): + mask = data == fval + else: + mask = None + + if mask is not None: + if fill_value is None: + fill_value = fval + + totalmask += mask + else: + # Don't return masked array if variable filling is disabled. + no_fill = 0 + # with nogil: + # ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) + # _ensure_nc_success(ierr) + + # if no_fill is not 1, and not a byte variable, then use + # default fill value. from + # http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c/Fill-Values.html#Fill-Values + # "If you need a fill value for a byte variable, it is + # recommended that you explicitly define an appropriate + # _FillValue attribute, as generic utilities such as + # ncdump will not assume a default fill value for byte + # variables." Explained here too: + # http://www.unidata.ucar.edu/software/netcdf/docs/known_problems.html#ncdump_ubyte_fill + # "There should be no default fill values when reading any + # byte type, signed or unsigned, because the byte ranges + # are too small to assume one of the values should appear + # as a missing value unless a _FillValue attribute is set + # explicitly." (do this only for non-vlens, since vlens + # don't have a default _FillValue) + if ( + no_fill != 1 or dtype.str[1:] not in ("u1", "i1") + ): + fillval = np.array(default_fillvals[dtype.str[1:]], dtype) + has_fillval = data == fillval + # if data is an array scalar, has_fillval will be a + # boolean. in that case convert to an array. + # if type(has_fillval) == bool: + if isinstance(has_fillval, bool): + has_fillval = np.asarray(has_fillval) + + if has_fillval.any(): + if fill_value is None: + fill_value = fillval + + mask = data == fillval + totalmask += mask + + # Set mask=True for data outside valid_min, valid_max. + validmin = None + validmax = None + # If valid_range exists use that, otherwise look for + # valid_min, valid_max. No special treatment of byte data as + # described at + # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). 
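+        # Illustrative precedence example: if a variable defines both
+        # valid_range = [0, 100] and valid_min = 10, then valid_range
+        # is used and valid_min is ignored.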
+ safe_validrange, valid_range = cls._check_safecast( + "valid_range", var_dtype, attrs + ) + safe_validmin, valid_min = cls._check_safecast( + "valid_min", var_dtype, attrs + ) + safe_validmax, valid_max = cls._check_safecast( + "valid_max", var_dtype, attrs + ) + if safe_validrange and valid_range.size == 2: + validmin = np.array(valid_range[0], var_dtype) + validmax = np.array(valid_range[1], var_dtype) + else: + if safe_validmin: + validmin = np.array(valid_min, var_dtype) + + if safe_validmax: + validmax = np.array(valid_max, var_dtype) + + if validmin is not None and scale and is_unsigned_int: + validmin = validmin.view(dtype_unsigned_int) + + if validmax is not None and scale and is_unsigned_int: + validmax = validmax.view(dtype_unsigned_int) + + # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). + # "If the data type is byte and _FillValue is not explicitly + # defined, then the valid range should include all possible + # values. Otherwise, the valid range should exclude the + # _FillValue (whether defined explicitly or by default) as + # follows. If the _FillValue is positive then it defines a + # valid maximum, otherwise it defines a valid minimum." + if safe_fillval: + fval = np.array(_FillValue, dtype) + else: + k = dtype.str[1:] + if k in ("u1", "i1"): + fval = None + else: + fval = np.array(default_fillvals[k], dtype) + + if var_dtype.kind != "S": + # Don't set mask for character data + + # Setting valid_min/valid_max to the _FillVaue is too + # surprising for many users (despite the netcdf docs + # attribute best practices suggesting clients should do + # this). + if validmin is not None: + totalmask += data < validmin + + if validmax is not None: + totalmask += data > validmax + + if fill_value is None and fval is not None: + fill_value = fval + + # If all else fails, use default _FillValue as fill_value for + # masked array. + if fill_value is None: + fill_value = default_fillvals[dtype.str[1:]] + + # Create masked array with computed mask + masked_values = bool(totalmask.any()) + if masked_values: + data = np.ma.masked_array( + data, mask=totalmask, fill_value=fill_value + ) + else: + # Always return masked array, if no values masked. + data = np.ma.masked_array(data) + + # Scalar array with mask=True should be converted to + # np.ma.MaskedConstant to be consistent with slicing + # behavior of masked arrays. + if data.shape == () and data.mask.all(): + # Return a scalar numpy masked constant not a 0-d masked + # array, so that data == np.ma.masked. + data = data[()] + + elif not always_mask and not masked_values: + # Return a regular numpy array if requested and there are + # no missing values + data = np.array(data, copy=False) + + return data + + @classmethod + def _scale(cls, data, attrs): + """TODOHDF.""" + # If variable has scale_factor and add_offset attributes, + # apply them. + scale_factor = attrs.get("scale_factor") + add_offset = attrs.get("add_offset") + try: + if scale_factor is not None: + float(scale_factor) + + if add_offset is not None: + float(add_offset) + except ValueError: + logging.warn( + "invalid scale_factor or add_offset attribute, " + "no unpacking done..." + ) + return data + + if scale_factor is not None and add_offset is not None: + if add_offset != 0.0 or scale_factor != 1.0: + data = data * scale_factor + add_offset + else: + data = data.astype(scale_factor.dtype) + elif scale_factor is not None and scale_factor != 1.0: + # If variable has only scale_factor attribute, rescale. 
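+            # Illustrative example: a packed value of 1234 with
+            # scale_factor = 0.01 unpacks to 12.34.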
+ data = data * scale_factor + elif add_offset is not None and add_offset != 0.0: + # If variable has only add_offset attribute, add offset. + data = data + add_offset + + return data From 03733fad9b7791e498bcee7e31fcb4ee1e0d2401 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 30 Jan 2024 08:04:36 +0000 Subject: [PATCH 19/88] dev --- cfdm/__init__.py | 3 +- cfdm/cfdmimplementation.py | 26 +- cfdm/data/__init__.py | 3 +- cfdm/data/h5netcdfarray.py | 309 ++++++++++ cfdm/data/hdfarray.py | 851 --------------------------- cfdm/data/mask.py | 641 -------------------- cfdm/data/maskscale.py | 355 +++++++++++ cfdm/data/mixin/filearraymixin.py | 12 + cfdm/data/mixin/netcdffilemixin.py | 12 - cfdm/read_write/netcdf/netcdfread.py | 5 +- cfdm/read_write/read.py | 2 +- 11 files changed, 691 insertions(+), 1528 deletions(-) create mode 100644 cfdm/data/h5netcdfarray.py delete mode 100644 cfdm/data/hdfarray.py delete mode 100644 cfdm/data/mask.py create mode 100644 cfdm/data/maskscale.py diff --git a/cfdm/__init__.py b/cfdm/__init__.py index 5b210d04f..46636a582 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -139,7 +139,8 @@ CompressedArray, Data, GatheredArray, - HDFArray, + H5netcdfArray, + MaskScale, NetCDFArray, NumpyArray, PointTopologyArray, diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index 4a9437cbb..3300039f5 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -30,7 +30,7 @@ CellConnectivityArray, Data, GatheredArray, - HDFArray, + H5netcdfArray, NetCDFArray, PointTopologyArray, RaggedContiguousArray, @@ -2354,20 +2354,8 @@ def initialise_NetCDFArray( missing_values=missing_values, ) - def initialise_HDFArray( - self, - **kwargs - # filename=None, - # address=None, - # dtype=None, - # shape=None, - # mask=True, - # units=False, - # calendar=None, - # missing_values=None, - # s3=None, - ): - """Return a HDF array instance. + def initialise_H5netcdfArray(self, **kwargs): + """Return a `H5netcdfArray` instance. :Parameters: @@ -2399,10 +2387,10 @@ def initialise_HDFArray( :Returns: - `HDFArray` + `H5netcdfArray` """ - cls = self.get_class("HDFArray") + cls = self.get_class("H5netcdfArray") return cls(**kwargs) def initialise_BoundsFromNodesArray(self, **kwargs): @@ -3759,7 +3747,7 @@ def squeeze(self, construct, axes=None): Data=Data, BoundsFromNodesArray=BoundsFromNodesArray, GatheredArray=GatheredArray, - HDFArray=HDFArray, + H5netcdfArray=H5netcdfArray, NetCDFArray=NetCDFArray, PointTopologyArray=PointTopologyArray, RaggedContiguousArray=RaggedContiguousArray, @@ -3803,7 +3791,7 @@ def implementation(): 'Datum': , 'Data': , 'GatheredArray': , - 'HDFArray': , + 'H5netcdfArray': , 'NetCDFArray': , 'PointTopologyArray': , 'RaggedContiguousArray': , diff --git a/cfdm/data/__init__.py b/cfdm/data/__init__.py index 98e0b0a2e..19ffdd01d 100644 --- a/cfdm/data/__init__.py +++ b/cfdm/data/__init__.py @@ -18,7 +18,8 @@ from .boundsfromnodesarray import BoundsFromNodesArray from .cellconnectivityarray import CellConnectivityArray from .gatheredarray import GatheredArray -from .hdfarray import HDFArray +from .h5netcdfarray import H5netcdfArray +from .maskscale import MaskScale from .netcdfarray import NetCDFArray from .numpyarray import NumpyArray from .pointtopologyarray import PointTopologyArray diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py new file mode 100644 index 000000000..1e46e1885 --- /dev/null +++ b/cfdm/data/h5netcdfarray.py @@ -0,0 +1,309 @@ +import logging + +import h5netcdf +import netCDF4 + +from . 
import abstract +from .maskscale import MaskScale +from .mixin import FileArrayMixin, NetCDFFileMixin + +_safecast = netCDF4.utils._safecast +default_fillvals = netCDF4.default_fillvals.copy() +default_fillvals["O"] = default_fillvals["S1"] + +logger = logging.getLogger(__name__) + + +class H5netcdfArray(NetCDFFileMixin, FileArrayMixin, abstract.Array): + """An underlying array stored in an HDF file. + + .. versionadded:: (cfdm) TODOHDF + + """ + + def __init__( + self, + filename=None, + address=None, + dtype=None, + shape=None, + mask=True, + units=False, + calendar=False, + missing_values=None, + s3=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of) `str`, optional + The name of the file(s) containing the array. + + address: (sequence of) `str`, optional + The identity of the variable in each file defined by + *filename*. Must be a netCDF variable name. + + dtype: `numpy.dtype` + The data type of the array in the file. May be `None` + if the numpy data-type is not known (which can be the + case for string types, for example). + + shape: `tuple` + The array dimension sizes in the file. + + size: `int` + Number of elements in the array in the file. + + ndim: `int` + The number of array dimensions in the file. + + mask: `bool` + If True (the default) then mask by convention when + reading data from disk. + + A netCDF array is masked depending on the values of any of + the netCDF variable attributes ``valid_min``, + ``valid_max``, ``valid_range``, ``_FillValue`` and + ``missing_value``. + + units: `str` or `None`, optional + The units of the variable. Set to `None` to indicate + that there are no units. If unset then the units will + be set during the first `__getitem__` call. + + calendar: `str` or `None`, optional + The calendar of the variable. By default, or if set to + `None`, then the CF default calendar is assumed, if + applicable. If unset then the calendar will be set + during the first `__getitem__` call. + + missing_values: `dict`, optional + The missing value indicators defined by the variable + attributes. See `get_missing_values` for details. + + s3: `dict` or `None`, optional + The `s3fs.S3FileSystem` options for accessing S3 + files. If there are no options then ``anon=True`` is + assumed, and if there is no ``'endpoint_url'`` key + then one will automatically be derived one for each S3 + filename. + + .. 
versionadded:: (cfdm) HDFVER + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__(source=source, copy=copy) + + if source is not None: + try: + shape = source._get_component("shape", None) + except AttributeError: + shape = None + + try: + filename = source._get_component("filename", None) + except AttributeError: + filename = None + + try: + address = source._get_component("address", None) + except AttributeError: + address = None + + try: + dtype = source._get_component("dtype", None) + except AttributeError: + dtype = None + + try: + mask = source._get_component("mask", True) + except AttributeError: + mask = True + + try: + units = source._get_component("units", False) + except AttributeError: + units = False + + try: + calendar = source._get_component("calendar", False) + except AttributeError: + calendar = False + + try: + missing_values = source._get_component("missing_values", None) + except AttributeError: + missing_values = None + + try: + s3 = source._get_component("s3", None) + except AttributeError: + s3 = None + + if shape is not None: + self._set_component("shape", shape, copy=False) + + if filename is not None: + if isinstance(filename, str): + filename = (filename,) + else: + filename = tuple(filename) + + self._set_component("filename", filename, copy=False) + + if address is not None: + if isinstance(address, (str, int)): + address = (address,) + else: + address = tuple(address) + + self._set_component("address", address, copy=False) + + if missing_values is not None: + self._set_component( + "missing_values", missing_values.copy(), copy=False + ) + + self._set_component("dtype", dtype, copy=False) + self._set_component("mask", mask, copy=False) + self._set_component("units", units, copy=False) + self._set_component("calendar", calendar, copy=False) + self._set_component("s3", s3, copy=False) + + # By default, close the file after data array access + self._set_component("close", True, copy=False) + + def __getitem__(self, indices): + """Returns a subspace of the array as a numpy array. + + x.__getitem__(indices) <==> x[indices] + + The indices that define the subspace must be either `Ellipsis` or + a sequence that contains an index for each dimension. In the + latter case, each dimension's index must either be a `slice` + object or a sequence of two or more integers. + + Indexing is similar to numpy indexing. The only difference to + numpy indexing (given the restrictions on the type of indices + allowed) is: + + * When two or more dimension's indices are sequences of integers + then these indices work independently along each dimension + (similar to the way vector subscripts work in Fortran). + + .. versionadded:: (cfdm) 1.7.0 + + """ + dataset, address = self.open() + dataset0 = dataset + + mask = self.get_mask() + groups, address = self.get_groups(address) + + if groups: + dataset = self._uuu(dataset, groups) + + # Get the variable by netCDF name + variable = dataset.variables[address] + array = variable[indices] + array = MaskScale.apply( + variable, array, mask=mask, scale=mask, always_mask=False + ) + + # Set the units, if they haven't been set already. + self._set_units(variable) + + self.close(dataset0) + del dataset, dataset0 + + if not self.ndim: + # Hmm netCDF4 has a thing for making scalar size 1, 1d + array = array.squeeze() + + return array + + def close(self, dataset): + """Close the dataset containing the data. + + .. 
versionadded:: (cfdm) HDFVER + + :Parameters: + + dataset: `h5netcdf.File` + The netCDF dataset to be be closed. + + :Returns: + + `None` + + """ + if self._get_component("close"): + dataset.close() + + def get_groups(self, address): + """The netCDF4 group structure of a netCDF variable. + + .. versionadded:: (cfdm) 1.8.6.0 + + :Parameters: + + address: `str` or `int` + The netCDF variable name, or integer varid, from which + to get the groups. + + .. versionadded:: (cfdm) 1.10.1.0 + + :Returns: + + (`list`, `str`) or (`list`, `int`) + The group structure and the name within the group. If + *address* is a varid then an empty list and the varid + are returned. + + **Examples** + + >>> n.get_groups('tas') + ([], 'tas') + + >>> n.get_groups('/tas') + ([], 'tas') + + >>> n.get_groups('/data/model/tas') + (['data', 'model'], 'tas') + + >>> n.get_groups(9) + ([], 9) + + """ + try: + if "/" not in address: + return [], address + except TypeError: + return [], address + + out = address.split("/")[1:] + return out[:-1], out[-1] + + def open(self, **kwargs): + """Return a dataset file object and address. + + When multiple files have been provided an attempt is made to + open each one, in the order stored, and a file object is + returned from the first file that exists. + + :Returns: + + (`h5netcdf.File`, `str`) + The open file object, and the address of the data + within the file. + + """ + return super().open( + h5netcdf.File, mode="r", decode_vlen_strings=True, **kwargs + ) diff --git a/cfdm/data/hdfarray.py b/cfdm/data/hdfarray.py deleted file mode 100644 index cfb0664a5..000000000 --- a/cfdm/data/hdfarray.py +++ /dev/null @@ -1,851 +0,0 @@ -import logging - -import h5netcdf -import netCDF4 -import numpy as np - -from . import abstract -from .mixin import FileArrayMixin, NetCDFFileMixin - -_safecast = netCDF4.utils._safecast -default_fillvals = netCDF4.default_fillvals - -logger = logging.getLogger(__name__) - - -class HDFArray(NetCDFFileMixin, FileArrayMixin, abstract.Array): - """An underlying array stored in an HDF file. - - .. versionadded:: (cfdm) TODOHDF - - """ - - def __init__( - self, - filename=None, - address=None, - dtype=None, - shape=None, - mask=True, - units=False, - calendar=False, - missing_values=None, - s3=None, - source=None, - copy=True, - ): - """**Initialisation** - - :Parameters: - - filename: (sequence of) `str`, optional - The name of the file(s) containing the array. - - address: (sequence of) `str`, optional - The identity of the variable in each file defined by - *filename*. Must be a netCDF variable name. - - dtype: `numpy.dtype` - The data type of the array in the file. May be `None` - if the numpy data-type is not known (which can be the - case for string types, for example). - - shape: `tuple` - The array dimension sizes in the file. - - size: `int` - Number of elements in the array in the file. - - ndim: `int` - The number of array dimensions in the file. - - mask: `bool` - If True (the default) then mask by convention when - reading data from disk. - - A netCDF array is masked depending on the values of any of - the netCDF variable attributes ``valid_min``, - ``valid_max``, ``valid_range``, ``_FillValue`` and - ``missing_value``. - - units: `str` or `None`, optional - The units of the variable. Set to `None` to indicate - that there are no units. If unset then the units will - be set during the first `__getitem__` call. - - calendar: `str` or `None`, optional - The calendar of the variable. 
By default, or if set to - `None`, then the CF default calendar is assumed, if - applicable. If unset then the calendar will be set - during the first `__getitem__` call. - - missing_values: `dict`, optional - The missing value indicators defined by the variable - attributes. See `get_missing_values` for details. - - s3: `dict` or `None`, optional - The `s3fs.S3FileSystem` options for accessing S3 - files. If there are no options then ``anon=True`` is - assumed, and if there is no ``'endpoint_url'`` key - then one will automatically be derived one for each S3 - filename. - - .. versionadded:: (cfdm) HDFVER - - {{init source: optional}} - - {{init copy: `bool`, optional}} - - """ - super().__init__(source=source, copy=copy) - - if source is not None: - try: - shape = source._get_component("shape", None) - except AttributeError: - shape = None - - try: - filename = source._get_component("filename", None) - except AttributeError: - filename = None - - try: - address = source._get_component("address", None) - except AttributeError: - address = None - - try: - dtype = source._get_component("dtype", None) - except AttributeError: - dtype = None - - try: - mask = source._get_component("mask", True) - except AttributeError: - mask = True - - try: - units = source._get_component("units", False) - except AttributeError: - units = False - - try: - calendar = source._get_component("calendar", False) - except AttributeError: - calendar = False - - try: - missing_values = source._get_component("missing_values", None) - except AttributeError: - missing_values = None - - try: - s3 = source._get_component("s3", None) - except AttributeError: - s3 = None - - if shape is not None: - self._set_component("shape", shape, copy=False) - - if filename is not None: - if isinstance(filename, str): - filename = (filename,) - else: - filename = tuple(filename) - - self._set_component("filename", filename, copy=False) - - if address is not None: - if isinstance(address, (str, int)): - address = (address,) - else: - address = tuple(address) - - self._set_component("address", address, copy=False) - - if missing_values is not None: - self._set_component( - "missing_values", missing_values.copy(), copy=False - ) - - self._set_component("dtype", dtype, copy=False) - self._set_component("mask", mask, copy=False) - self._set_component("units", units, copy=False) - self._set_component("calendar", calendar, copy=False) - self._set_component("s3", s3, copy=False) - - # By default, close the file after data array access - self._set_component("close", True, copy=False) - - def __getitem__(self, indices): - """Returns a subspace of the array as a numpy array. - - x.__getitem__(indices) <==> x[indices] - - The indices that define the subspace must be either `Ellipsis` or - a sequence that contains an index for each dimension. In the - latter case, each dimension's index must either be a `slice` - object or a sequence of two or more integers. - - Indexing is similar to numpy indexing. The only difference to - numpy indexing (given the restrictions on the type of indices - allowed) is: - - * When two or more dimension's indices are sequences of integers - then these indices work independently along each dimension - (similar to the way vector subscripts work in Fortran). - - .. 
versionadded:: (cfdm) 1.7.0 - - """ - dataset, address = self.open() - dataset0 = dataset - - mask = self.get_mask() - groups, address = self.get_groups(address) - - if groups: - dataset = self._uuu(dataset, groups) - - # Get the variable by netCDF name - variable = dataset.variables[address] - self.variable = variable - array = variable[indices] - print(11) - if mask: - print(22) - self.scale = True - self.always_mask = False - self._isvlen = variable.dtype == np.dtype("O") - isvlen = variable.dtype == np.dtype("O") - if not self._isvlen: - array = self._mask2( - array, - variable.dtype, - variable.attrs, - isvlen, - self.scale, - self.always_mask, - ) - # array = self._mask(array) - array = self._scale(array) - - # Set the units, if they haven't been set already. - self._set_units(variable) - - self.close(dataset0) - del dataset, dataset0 - del self.variable - - string_type = isinstance(array, str) - if string_type: - # -------------------------------------------------------- - # A netCDF string type scalar variable comes out as Python - # str object, so convert it to a numpy array. - # -------------------------------------------------------- - array = np.array(array, dtype=f"U{len(array)}") - - if not self.ndim: - # Hmm netCDF4 has a thing for making scalar size 1, 1d - array = array.squeeze() - - array = self._process_string_and_char(array) - return array - - @classmethod - def _check_safecast2(cls, attname, var_dtype, attrs): - """TODOHDF. - - Check to see that variable attribute exists can can be safely - cast to variable data type. - - """ - # attrs = self.variable.attrs - if attname in attrs: - attvalue = attrs[attname] - att = np.array(attvalue) - else: - return False, None - - is_safe = True - try: - atta = np.array(att, var_dtype) - except ValueError: - is_safe = False - else: - is_safe = _safecast(att, atta) - - if not is_safe: - logger.warn( - f"WARNING: {attname} not used since it cannot " - "be safely cast to variable data type" - ) # pragma: no cover - - return is_safe, attvalue - - def _check_safecast(self, attname): - """TODOHDF. - - Check to see that variable attribute exists can can be safely - cast to variable data type. - - """ - attrs = self.variable.attrs - if attname in attrs: - attvalue = attrs[attname] - att = np.array(attvalue) - setattr(self, attname, attvalue) - else: - return False - - is_safe = True - try: - atta = np.array(att, self.dtype) - except ValueError: - is_safe = False - else: - is_safe = _safecast(att, atta) - - if not is_safe: - logger.warn( - f"WARNING: {attname} not used since it cannot " - "be safely cast to variable data type" - ) # pragma: no cover - - return is_safe - - def _mask(self, data): - """TODOHDF.""" - print("MASK", data.shape) - # Private function for creating a masked array, masking - # missing_values and/or _FillValues. - - attrs = self.variable.attrs - is_unsigned = attrs.get("_Unsigned", False) in ("true", "True") - is_unsigned_int = is_unsigned and data.dtype.kind == "i" - - dtype = data.dtype - if self.scale and is_unsigned_int: - # Only do this if autoscale option is on. - dtype_unsigned_int = f"{dtype.byteorder}u{dtype.itemsize}" - data = data.view(dtype_unsigned_int) - - totalmask = np.zeros(data.shape, np.bool_) - fill_value = None - safe_missval = self._check_safecast("missing_value") - if safe_missval: - mval = np.array(self.missing_value, self.dtype) - if self.scale and is_unsigned_int: - mval = mval.view(dtype_unsigned_int) - - # create mask from missing values. 
- mvalmask = np.zeros(data.shape, np.bool_) - if mval.shape == (): # mval a scalar. - mval = (mval,) # make into iterable. - - for m in mval: - # is scalar missing value a NaN? - try: - mvalisnan = np.isnan(m) - except TypeError: - # isnan fails on some dtypes - mvalisnan = False - - if mvalisnan: - mvalmask += np.isnan(data) - else: - mvalmask += data == m - - if mvalmask.any(): - # Set fill_value for masked array to missing_value (or - # 1st element if missing_value is a vector). - fill_value = mval[0] - totalmask += mvalmask - - # set mask=True for data == fill value - safe_fillval = self._check_safecast("_FillValue") - if safe_fillval: - fval = np.array(self._FillValue, self.dtype) - if self.scale and is_unsigned_int: - fval = fval.view(dtype_unsigned_int) - - # is _FillValue a NaN? - try: - fvalisnan = np.isnan(fval) - except Exception: - # isnan fails on some dtypes - fvalisnan = False - - if fvalisnan: - mask = np.isnan(data) - elif (data == fval).any(): - mask = data == fval - else: - mask = None - - if mask is not None: - if fill_value is None: - fill_value = fval - - totalmask += mask - else: - # Don't return masked array if variable filling is disabled. - no_fill = 0 - # with nogil: - # ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) - # _ensure_nc_success(ierr) - - # if no_fill is not 1, and not a byte variable, then use - # default fill value. from - # http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c/Fill-Values.html#Fill-Values - # "If you need a fill value for a byte variable, it is - # recommended that you explicitly define an appropriate - # _FillValue attribute, as generic utilities such as - # ncdump will not assume a default fill value for byte - # variables." Explained here too: - # http://www.unidata.ucar.edu/software/netcdf/docs/known_problems.html#ncdump_ubyte_fill - # "There should be no default fill values when reading any - # byte type, signed or unsigned, because the byte ranges - # are too small to assume one of the values should appear - # as a missing value unless a _FillValue attribute is set - # explicitly." (do this only for non-vlens, since vlens - # don't have a default _FillValue) - if not self._isvlen and ( - no_fill != 1 or dtype.str[1:] not in ("u1", "i1") - ): - fillval = np.array(default_fillvals[dtype.str[1:]], dtype) - has_fillval = data == fillval - # if data is an array scalar, has_fillval will be a - # boolean. in that case convert to an array. - # if type(has_fillval) == bool: - if isinstance(has_fillval, bool): - has_fillval = np.asarray(has_fillval) - - if has_fillval.any(): - if fill_value is None: - fill_value = fillval - - mask = data == fillval - totalmask += mask - - # Set mask=True for data outside valid_min, valid_max. - validmin = None - validmax = None - # If valid_range exists use that, otherwise look for - # valid_min, valid_max. No special treatment of byte data as - # described at - # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). 
- safe_validrange = self._check_safecast("valid_range") - safe_validmin = self._check_safecast("valid_min") - safe_validmax = self._check_safecast("valid_max") - if safe_validrange and self.valid_range.size == 2: - validmin = np.array(self.valid_range[0], self.dtype) - validmax = np.array(self.valid_range[1], self.dtype) - else: - if safe_validmin: - validmin = np.array(self.valid_min, self.dtype) - - if safe_validmax: - validmax = np.array(self.valid_max, self.dtype) - - if validmin is not None and self.scale and is_unsigned_int: - validmin = validmin.view(dtype_unsigned_int) - - if validmax is not None and self.scale and is_unsigned_int: - validmax = validmax.view(dtype_unsigned_int) - - # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). - # "If the data type is byte and _FillValue is not explicitly - # defined, then the valid range should include all possible - # values. Otherwise, the valid range should exclude the - # _FillValue (whether defined explicitly or by default) as - # follows. If the _FillValue is positive then it defines a - # valid maximum, otherwise it defines a valid minimum." - if safe_fillval: - fval = np.array(self._FillValue, dtype) - else: - k = dtype.str[1:] - if k in ("u1", "i1"): - fval = None - else: - fval = np.array(default_fillvals[k], dtype) - - if self.dtype.kind != "S": - # Don't set mask for character data - - # Setting valid_min/valid_max to the _FillVaue is too - # surprising for many users (despite the netcdf docs - # attribute best practices suggesting clients should do - # this). - if validmin is not None: - totalmask += data < validmin - - if validmax is not None: - totalmask += data > validmax - - if fill_value is None and fval is not None: - fill_value = fval - - # If all else fails, use default _FillValue as fill_value for - # masked array. - if fill_value is None: - fill_value = default_fillvals[dtype.str[1:]] - - # Create masked array with computed mask - masked_values = bool(totalmask.any()) - if masked_values: - data = np.ma.masked_array( - data, mask=totalmask, fill_value=fill_value - ) - else: - # Always return masked array, if no values masked. - data = np.ma.masked_array(data) - - # Scalar array with mask=True should be converted to - # np.ma.MaskedConstant to be consistent with slicing - # behavior of masked arrays. - if data.shape == () and data.mask.all(): - # Return a scalar numpy masked constant not a 0-d masked - # array, so that data == np.ma.masked. - data = data[()] - - elif not self.always_mask and not masked_values: - # Return a regular numpy array if requested and there are - # no missing values - data = np.array(data, copy=False) - - return data - - @classmethod - def _mask2( - cls, data, var_dtype, attrs, isvlen, scale=False, always_mask=False - ): - """TODOHDF.""" - print("MASK", data.shape) - - if isvlen: - return data - - # Private function for creating a masked array, masking - # missing_values and/or _FillValues. - - # attrs = self.variable.attrs - is_unsigned = attrs.get("_Unsigned", False) in ("true", "True") - is_unsigned_int = is_unsigned and data.dtype.kind == "i" - - dtype = data.dtype - if scale and is_unsigned_int: - # Only do this if autoscale option is on. 
- dtype_unsigned_int = f"{dtype.byteorder}u{dtype.itemsize}" - data = data.view(dtype_unsigned_int) - - totalmask = np.zeros(data.shape, np.bool_) - fill_value = None - safe_missval, missing_value = cls._check_safecast2( - "missing_value", var_dtype, attrs - ) - if safe_missval: - mval = np.array(missing_value, var_dtype) - if scale and is_unsigned_int: - mval = mval.view(dtype_unsigned_int) - - # create mask from missing values. - mvalmask = np.zeros(data.shape, np.bool_) - if mval.shape == (): # mval a scalar. - mval = (mval,) # make into iterable. - - for m in mval: - # is scalar missing value a NaN? - try: - mvalisnan = np.isnan(m) - except TypeError: - # isnan fails on some dtypes - mvalisnan = False - - if mvalisnan: - mvalmask += np.isnan(data) - else: - mvalmask += data == m - - if mvalmask.any(): - # Set fill_value for masked array to missing_value (or - # 1st element if missing_value is a vector). - fill_value = mval[0] - totalmask += mvalmask - - # set mask=True for data == fill value - safe_fillval, _FillValue = cls._check_safecast2( - "_FillValue", dtype, attrs - ) - if safe_fillval: - fval = np.array(_FillValue, var_dtype) - if scale and is_unsigned_int: - fval = fval.view(dtype_unsigned_int) - - # is _FillValue a NaN? - try: - fvalisnan = np.isnan(fval) - except Exception: - # isnan fails on some dtypes - fvalisnan = False - - if fvalisnan: - mask = np.isnan(data) - elif (data == fval).any(): - mask = data == fval - else: - mask = None - - if mask is not None: - if fill_value is None: - fill_value = fval - - totalmask += mask - else: - # Don't return masked array if variable filling is disabled. - no_fill = 0 - # with nogil: - # ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) - # _ensure_nc_success(ierr) - - # if no_fill is not 1, and not a byte variable, then use - # default fill value. from - # http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c/Fill-Values.html#Fill-Values - # "If you need a fill value for a byte variable, it is - # recommended that you explicitly define an appropriate - # _FillValue attribute, as generic utilities such as - # ncdump will not assume a default fill value for byte - # variables." Explained here too: - # http://www.unidata.ucar.edu/software/netcdf/docs/known_problems.html#ncdump_ubyte_fill - # "There should be no default fill values when reading any - # byte type, signed or unsigned, because the byte ranges - # are too small to assume one of the values should appear - # as a missing value unless a _FillValue attribute is set - # explicitly." (do this only for non-vlens, since vlens - # don't have a default _FillValue) - if not isvlen and ( - no_fill != 1 or dtype.str[1:] not in ("u1", "i1") - ): - fillval = np.array(default_fillvals[dtype.str[1:]], dtype) - has_fillval = data == fillval - # if data is an array scalar, has_fillval will be a - # boolean. in that case convert to an array. - # if type(has_fillval) == bool: - if isinstance(has_fillval, bool): - has_fillval = np.asarray(has_fillval) - - if has_fillval.any(): - if fill_value is None: - fill_value = fillval - - mask = data == fillval - totalmask += mask - - # Set mask=True for data outside valid_min, valid_max. - validmin = None - validmax = None - # If valid_range exists use that, otherwise look for - # valid_min, valid_max. No special treatment of byte data as - # described at - # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). 
- safe_validrange, valid_range = cls._check_safecast2( - "valid_range", var_dtype, attrs - ) - safe_validmin, valid_min = cls._check_safecast2( - "valid_min", var_dtype, attrs - ) - safe_validmax, valid_max = cls._check_safecast2( - "valid_max", var_dtype, attrs - ) - if safe_validrange and valid_range.size == 2: - validmin = np.array(valid_range[0], var_dtype) - validmax = np.array(valid_range[1], var_dtype) - else: - if safe_validmin: - validmin = np.array(valid_min, var_dtype) - - if safe_validmax: - validmax = np.array(valid_max, var_dtype) - - if validmin is not None and scale and is_unsigned_int: - validmin = validmin.view(dtype_unsigned_int) - - if validmax is not None and scale and is_unsigned_int: - validmax = validmax.view(dtype_unsigned_int) - - # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). - # "If the data type is byte and _FillValue is not explicitly - # defined, then the valid range should include all possible - # values. Otherwise, the valid range should exclude the - # _FillValue (whether defined explicitly or by default) as - # follows. If the _FillValue is positive then it defines a - # valid maximum, otherwise it defines a valid minimum." - if safe_fillval: - fval = np.array(_FillValue, dtype) - else: - k = dtype.str[1:] - if k in ("u1", "i1"): - fval = None - else: - fval = np.array(default_fillvals[k], dtype) - - if var_dtype.kind != "S": - # Don't set mask for character data - - # Setting valid_min/valid_max to the _FillVaue is too - # surprising for many users (despite the netcdf docs - # attribute best practices suggesting clients should do - # this). - if validmin is not None: - totalmask += data < validmin - - if validmax is not None: - totalmask += data > validmax - - if fill_value is None and fval is not None: - fill_value = fval - - # If all else fails, use default _FillValue as fill_value for - # masked array. - if fill_value is None: - fill_value = default_fillvals[dtype.str[1:]] - - # Create masked array with computed mask - masked_values = bool(totalmask.any()) - if masked_values: - data = np.ma.masked_array( - data, mask=totalmask, fill_value=fill_value - ) - else: - # Always return masked array, if no values masked. - data = np.ma.masked_array(data) - - # Scalar array with mask=True should be converted to - # np.ma.MaskedConstant to be consistent with slicing - # behavior of masked arrays. - if data.shape == () and data.mask.all(): - # Return a scalar numpy masked constant not a 0-d masked - # array, so that data == np.ma.masked. - data = data[()] - - elif not always_mask and not masked_values: - # Return a regular numpy array if requested and there are - # no missing values - data = np.array(data, copy=False) - - return data - - def _scale(self, data): - """TODOHDF.""" - # If variable has scale_factor and add_offset attributes, - # apply them. - attrs = self.variable.attrs - scale_factor = attrs.get("scale_factor") - add_offset = attrs.get("add_offset") - try: - if scale_factor is not None: - float(scale_factor) - - if add_offset is not None: - float(add_offset) - except ValueError: - logging.warn( - "invalid scale_factor or add_offset attribute, " - "no unpacking done..." - ) - return data - - if scale_factor is not None and add_offset is not None: - if add_offset != 0.0 or scale_factor != 1.0: - data = data * scale_factor + add_offset - else: - data = data.astype(scale_factor.dtype) - elif scale_factor is not None and scale_factor != 1.0: - # If variable has only scale_factor attribute, rescale. 
- data = data * scale_factor - elif add_offset is not None and add_offset != 0.0: - # If variable has only add_offset attribute, add offset. - data = data + add_offset - - return data - - def close(self, dataset): - """Close the dataset containing the data. - - .. versionadded:: (cfdm) HDFVER - - :Parameters: - - dataset: `h5netcdf.File` - The netCDF dataset to be be closed. - - :Returns: - - `None` - - """ - if self._get_component("close"): - dataset.close() - - def get_groups(self, address): - """The netCDF4 group structure of a netCDF variable. - - .. versionadded:: (cfdm) 1.8.6.0 - - :Parameters: - - address: `str` or `int` - The netCDF variable name, or integer varid, from which - to get the groups. - - .. versionadded:: (cfdm) 1.10.1.0 - - :Returns: - - (`list`, `str`) or (`list`, `int`) - The group structure and the name within the group. If - *address* is a varid then an empty list and the varid - are returned. - - **Examples** - - >>> n.get_groups('tas') - ([], 'tas') - - >>> n.get_groups('/tas') - ([], 'tas') - - >>> n.get_groups('/data/model/tas') - (['data', 'model'], 'tas') - - >>> n.get_groups(9) - ([], 9) - - """ - try: - if "/" not in address: - return [], address - except TypeError: - return [], address - - out = address.split("/")[1:] - return out[:-1], out[-1] - - def open(self, **kwargs): - """Return a dataset file object and address. - - When multiple files have been provided an attempt is made to - open each one, in the order stored, and a file object is - returned from the first file that exists. - - :Returns: - - (`h5netcdf.File`, `str`) - The open file object, and the address of the data - within the file. - - """ - return super().open( - h5netcdf.File, mode="r", decode_vlen_strings=True, **kwargs - ) diff --git a/cfdm/data/mask.py b/cfdm/data/mask.py deleted file mode 100644 index 1baaf1432..000000000 --- a/cfdm/data/mask.py +++ /dev/null @@ -1,641 +0,0 @@ -import logging - -import netCDF4 -import numpy as np - -_safecast = netCDF4.utils._safecast -default_fillvals = netCDF4.default_fillvals - -logger = logging.getLogger(__name__) - - -class Mask: - -# variable = dataset.variables[address] -# self.variable = variable -# array = variable[indices] -# print(11) -# if mask: -# print(22) -# self.scale = True -# self.always_mask = False -# self._isvlen = variable.dtype == np.dtype("O") -# isvlen = variable.dtype == np.dtype("O") -# if not self._isvlen: -# array = self._mask2( -# array, -# variable.dtype, -# variable.attrs, -# isvlen, -# self.scale, -# self.always_mask, -# ) -# # array = self._mask(array) -# array = self._scale(array) -# -# string_type = isinstance(array, str) -# if string_type: -# # -------------------------------------------------------- -# # A netCDF string type scalar variable comes out as Python -# # str object, so convert it to a numpy array. -# # -------------------------------------------------------- -# array = np.array(array, dtype=f"U{len(array)}") -# -# if not self.ndim: -# # Hmm netCDF4 has a thing for making scalar size 1, 1d -# array = array.squeeze() -# -# array = self._process_string_and_char(array) -# return array - - @classmethod - def _check_safecast(cls, attname, var_dtype, attrs): - """TODOHDF. - - Check to see that variable attribute exists can can be safely - cast to variable data type. 
- - """ - # attrs = self.variable.attrs - if attname in attrs: - attvalue = attrs[attname] - att = np.array(attvalue) - else: - return False, None - - is_safe = True - try: - atta = np.array(att, var_dtype) - except ValueError: - is_safe = False - else: - is_safe = _safecast(att, atta) - - if not is_safe: - logger.warn( - f"WARNING: {attname} not used since it cannot " - "be safely cast to variable data type" - ) # pragma: no cover - - return is_safe, attvalue - -# def _mask(self, data): -# """TODOHDF.""" -# print("MASK", data.shape) -# # Private function for creating a masked array, masking -# # missing_values and/or _FillValues. -# -# attrs = self.variable.attrs -# is_unsigned = attrs.get("_Unsigned", False) in ("true", "True") -# is_unsigned_int = is_unsigned and data.dtype.kind == "i" -# -# dtype = data.dtype -# if self.scale and is_unsigned_int: -# # Only do this if autoscale option is on. -# dtype_unsigned_int = f"{dtype.byteorder}u{dtype.itemsize}" -# data = data.view(dtype_unsigned_int) -# -# totalmask = np.zeros(data.shape, np.bool_) -# fill_value = None -# safe_missval = self._check_safecast("missing_value") -# if safe_missval: -# mval = np.array(self.missing_value, self.dtype) -# if self.scale and is_unsigned_int: -# mval = mval.view(dtype_unsigned_int) -# -# # create mask from missing values. -# mvalmask = np.zeros(data.shape, np.bool_) -# if mval.shape == (): # mval a scalar. -# mval = (mval,) # make into iterable. -# -# for m in mval: -# # is scalar missing value a NaN? -# try: -# mvalisnan = np.isnan(m) -# except TypeError: -# # isnan fails on some dtypes -# mvalisnan = False -# -# if mvalisnan: -# mvalmask += np.isnan(data) -# else: -# mvalmask += data == m -# -# if mvalmask.any(): -# # Set fill_value for masked array to missing_value (or -# # 1st element if missing_value is a vector). -# fill_value = mval[0] -# totalmask += mvalmask -# -# # set mask=True for data == fill value -# safe_fillval = self._check_safecast("_FillValue") -# if safe_fillval: -# fval = np.array(self._FillValue, self.dtype) -# if self.scale and is_unsigned_int: -# fval = fval.view(dtype_unsigned_int) -# -# # is _FillValue a NaN? -# try: -# fvalisnan = np.isnan(fval) -# except Exception: -# # isnan fails on some dtypes -# fvalisnan = False -# -# if fvalisnan: -# mask = np.isnan(data) -# elif (data == fval).any(): -# mask = data == fval -# else: -# mask = None -# -# if mask is not None: -# if fill_value is None: -# fill_value = fval -# -# totalmask += mask -# else: -# # Don't return masked array if variable filling is disabled. -# no_fill = 0 -# # with nogil: -# # ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) -# # _ensure_nc_success(ierr) -# -# # if no_fill is not 1, and not a byte variable, then use -# # default fill value. from -# # http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c/Fill-Values.html#Fill-Values -# # "If you need a fill value for a byte variable, it is -# # recommended that you explicitly define an appropriate -# # _FillValue attribute, as generic utilities such as -# # ncdump will not assume a default fill value for byte -# # variables." Explained here too: -# # http://www.unidata.ucar.edu/software/netcdf/docs/known_problems.html#ncdump_ubyte_fill -# # "There should be no default fill values when reading any -# # byte type, signed or unsigned, because the byte ranges -# # are too small to assume one of the values should appear -# # as a missing value unless a _FillValue attribute is set -# # explicitly." 
(do this only for non-vlens, since vlens -# # don't have a default _FillValue) -# if not self._isvlen and ( -# no_fill != 1 or dtype.str[1:] not in ("u1", "i1") -# ): -# fillval = np.array(default_fillvals[dtype.str[1:]], dtype) -# has_fillval = data == fillval -# # if data is an array scalar, has_fillval will be a -# # boolean. in that case convert to an array. -# # if type(has_fillval) == bool: -# if isinstance(has_fillval, bool): -# has_fillval = np.asarray(has_fillval) -# -# if has_fillval.any(): -# if fill_value is None: -# fill_value = fillval -# -# mask = data == fillval -# totalmask += mask -# -# # Set mask=True for data outside valid_min, valid_max. -# validmin = None -# validmax = None -# # If valid_range exists use that, otherwise look for -# # valid_min, valid_max. No special treatment of byte data as -# # described at -# # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). -# safe_validrange = self._check_safecast("valid_range") -# safe_validmin = self._check_safecast("valid_min") -# safe_validmax = self._check_safecast("valid_max") -# if safe_validrange and self.valid_range.size == 2: -# validmin = np.array(self.valid_range[0], self.dtype) -# validmax = np.array(self.valid_range[1], self.dtype) -# else: -# if safe_validmin: -# validmin = np.array(self.valid_min, self.dtype) -# -# if safe_validmax: -# validmax = np.array(self.valid_max, self.dtype) -# -# if validmin is not None and self.scale and is_unsigned_int: -# validmin = validmin.view(dtype_unsigned_int) -# -# if validmax is not None and self.scale and is_unsigned_int: -# validmax = validmax.view(dtype_unsigned_int) -# -# # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). -# # "If the data type is byte and _FillValue is not explicitly -# # defined, then the valid range should include all possible -# # values. Otherwise, the valid range should exclude the -# # _FillValue (whether defined explicitly or by default) as -# # follows. If the _FillValue is positive then it defines a -# # valid maximum, otherwise it defines a valid minimum." -# if safe_fillval: -# fval = np.array(self._FillValue, dtype) -# else: -# k = dtype.str[1:] -# if k in ("u1", "i1"): -# fval = None -# else: -# fval = np.array(default_fillvals[k], dtype) -# -# if self.dtype.kind != "S": -# # Don't set mask for character data -# -# # Setting valid_min/valid_max to the _FillVaue is too -# # surprising for many users (despite the netcdf docs -# # attribute best practices suggesting clients should do -# # this). -# if validmin is not None: -# totalmask += data < validmin -# -# if validmax is not None: -# totalmask += data > validmax -# -# if fill_value is None and fval is not None: -# fill_value = fval -# -# # If all else fails, use default _FillValue as fill_value for -# # masked array. -# if fill_value is None: -# fill_value = default_fillvals[dtype.str[1:]] -# -# # Create masked array with computed mask -# masked_values = bool(totalmask.any()) -# if masked_values: -# data = np.ma.masked_array( -# data, mask=totalmask, fill_value=fill_value -# ) -# else: -# # Always return masked array, if no values masked. -# data = np.ma.masked_array(data) -# -# # Scalar array with mask=True should be converted to -# # np.ma.MaskedConstant to be consistent with slicing -# # behavior of masked arrays. -# if data.shape == () and data.mask.all(): -# # Return a scalar numpy masked constant not a 0-d masked -# # array, so that data == np.ma.masked. 
-# data = data[()] -# -# elif not self.always_mask and not masked_values: -# # Return a regular numpy array if requested and there are -# # no missing values -# data = np.array(data, copy=False) -# -# -# return data - - @classmethod - def _process_string_and_char(cls, array): - """TODOHDF.""" - string_type = isinstance(array, str) - kind = array.dtype.kind - if not string_type and kind in "SU": - # Collapse by concatenation the outermost (fastest - # varying) dimension of char array into - # memory. E.g. [['a','b','c']] becomes ['abc'] - if kind == "U": - array = array.astype("S", copy=False) - - array = netCDF4.chartostring(array) - shape = array.shape - array = np.array([x.rstrip() for x in array.flat], dtype="U") - array = np.reshape(array, shape) - array = np.ma.masked_where(array == "", array) - elif not string_type and kind == "O": - # An N-d (N>=1) string variable comes out as a numpy - # object array, so convert it to numpy string array. - array = array.astype("U", copy=False) - - # Mask the VLEN variable - array = np.ma.where(array == "", np.ma.masked, array) - - return array - - @classmethod - def _process_char_array(cls, array, mask=True): - """TODOHDF.""" - # Collapse by concatenation the outermost (fastest - # varying) dimension of char array into - # memory. E.g. [['a','b','c']] becomes ['abc'] - if kind == "U": - array = array.astype("S", copy=False) - - array = netCDF4.chartostring(array) - shape = array.shape - array = np.array([x.rstrip() for x in array.flat], dtype="U") - array = np.reshape(array, shape) - if mask: - array = np.ma.masked_where(array == "", array) - if not np.ma.is_masked(data): - array = np.array(array, copy=False) - - return array - - @classmethod - def _process_string(cls, data, mask=True): - """TODOHDF.""" - if mask and data == "": - data = np.ma.masked_all((), dtype=f"U{len(data)}") - else: - data = np.array(data, dtype="U") - - return data - - @classmethod - def _process_object_array(cls, array, mask=True): - """TODOHDF.""" - array = array.astype("U", copy=False) - if mask: - array = np.ma.where(array == "", np.ma.masked, array) - if not np.ma.is_masked(data): - array = np.array(array, copy=False) - - return array - - def _is_char(cls, data): - return data.dtype.kind in "SU" # isinstance(data.item(0), (str, bytes)) - - def _is_string(cls, data): - return data.dtype.kind in "O" - - @classmethod - def mask_and_scale(cls, mask=True, scale=True): - """ - """ - if isinstance(data, str): - return cls._process_string(data, mask=mask) - - if _is_string(data): - return cls._process_object_array(data, mask=mask) - - if _is_char(data): - return cls._process_char_array(data, mask=mask) - - if mask or scale: - is_unsigned_int = attrs.get("_Unsigned", False) in ("true", "True") - if is_unsigned_int: - dtype = data.dtype - dtype_unsigned_int = f"{dtype.byteorder}u{dtype.itemsize}" - data = data.view(dtype_unsigned_int) - - if mask: - data = cls._mask(data.scale=scale, always_mask=False) - - if scale: - data = cls._scale(data, attrs) - - return data - - @classmethod - def _mask( - cls, data, var_dtype, attrs, scale=True, always_mask=False - ): - """TODOHDF.""" - print("MASK", data.shape) - - if isinstance(data, str): - return cls._process_string(data) - - if _is_string(data): - return cls._process_object_array(data) - - if _is_char(data): - return cls._process_char_array(data) - - dtype = data.dtype -# is_unsigned = attrs.get("_Unsigned", False) in ("true", "True") -# is_unsigned_int = is_unsigned and data.dtype.kind == "i" -# -# dtype = data.dtype -# if scale and 
is_unsigned_int: -# # Only do this if autoscale option is on. -# dtype_unsigned_int = f"{dtype.byteorder}u{dtype.itemsize}" -# data = data.view(dtype_unsigned_int) - - totalmask = np.zeros(data.shape, np.bool_) - fill_value = None - safe_missval, missing_value = cls._check_safecast( - "missing_value", var_dtype, attrs - ) - if safe_missval: - mval = np.array(missing_value, var_dtype) - if scale and is_unsigned_int: - mval = mval.view(dtype_unsigned_int) - - # create mask from missing values. - mvalmask = np.zeros(data.shape, np.bool_) - if mval.shape == (): # mval a scalar. - mval = (mval,) # make into iterable. - - for m in mval: - # is scalar missing value a NaN? - try: - mvalisnan = np.isnan(m) - except TypeError: - # isnan fails on some dtypes - mvalisnan = False - - if mvalisnan: - mvalmask += np.isnan(data) - else: - mvalmask += data == m - - if mvalmask.any(): - # Set fill_value for masked array to missing_value (or - # 1st element if missing_value is a vector). - fill_value = mval[0] - totalmask += mvalmask - - # set mask=True for data == fill value - safe_fillval, _FillValue = cls._check_safecast( - "_FillValue", dtype, attrs - ) - if safe_fillval: - fval = np.array(_FillValue, var_dtype) - if scale and is_unsigned_int: - fval = fval.view(dtype_unsigned_int) - - # is _FillValue a NaN? - try: - fvalisnan = np.isnan(fval) - except Exception: - # isnan fails on some dtypes - fvalisnan = False - - if fvalisnan: - mask = np.isnan(data) - elif (data == fval).any(): - mask = data == fval - else: - mask = None - - if mask is not None: - if fill_value is None: - fill_value = fval - - totalmask += mask - else: - # Don't return masked array if variable filling is disabled. - no_fill = 0 - # with nogil: - # ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) - # _ensure_nc_success(ierr) - - # if no_fill is not 1, and not a byte variable, then use - # default fill value. from - # http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c/Fill-Values.html#Fill-Values - # "If you need a fill value for a byte variable, it is - # recommended that you explicitly define an appropriate - # _FillValue attribute, as generic utilities such as - # ncdump will not assume a default fill value for byte - # variables." Explained here too: - # http://www.unidata.ucar.edu/software/netcdf/docs/known_problems.html#ncdump_ubyte_fill - # "There should be no default fill values when reading any - # byte type, signed or unsigned, because the byte ranges - # are too small to assume one of the values should appear - # as a missing value unless a _FillValue attribute is set - # explicitly." (do this only for non-vlens, since vlens - # don't have a default _FillValue) - if ( - no_fill != 1 or dtype.str[1:] not in ("u1", "i1") - ): - fillval = np.array(default_fillvals[dtype.str[1:]], dtype) - has_fillval = data == fillval - # if data is an array scalar, has_fillval will be a - # boolean. in that case convert to an array. - # if type(has_fillval) == bool: - if isinstance(has_fillval, bool): - has_fillval = np.asarray(has_fillval) - - if has_fillval.any(): - if fill_value is None: - fill_value = fillval - - mask = data == fillval - totalmask += mask - - # Set mask=True for data outside valid_min, valid_max. - validmin = None - validmax = None - # If valid_range exists use that, otherwise look for - # valid_min, valid_max. No special treatment of byte data as - # described at - # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). 
- safe_validrange, valid_range = cls._check_safecast( - "valid_range", var_dtype, attrs - ) - safe_validmin, valid_min = cls._check_safecast( - "valid_min", var_dtype, attrs - ) - safe_validmax, valid_max = cls._check_safecast( - "valid_max", var_dtype, attrs - ) - if safe_validrange and valid_range.size == 2: - validmin = np.array(valid_range[0], var_dtype) - validmax = np.array(valid_range[1], var_dtype) - else: - if safe_validmin: - validmin = np.array(valid_min, var_dtype) - - if safe_validmax: - validmax = np.array(valid_max, var_dtype) - - if validmin is not None and scale and is_unsigned_int: - validmin = validmin.view(dtype_unsigned_int) - - if validmax is not None and scale and is_unsigned_int: - validmax = validmax.view(dtype_unsigned_int) - - # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). - # "If the data type is byte and _FillValue is not explicitly - # defined, then the valid range should include all possible - # values. Otherwise, the valid range should exclude the - # _FillValue (whether defined explicitly or by default) as - # follows. If the _FillValue is positive then it defines a - # valid maximum, otherwise it defines a valid minimum." - if safe_fillval: - fval = np.array(_FillValue, dtype) - else: - k = dtype.str[1:] - if k in ("u1", "i1"): - fval = None - else: - fval = np.array(default_fillvals[k], dtype) - - if var_dtype.kind != "S": - # Don't set mask for character data - - # Setting valid_min/valid_max to the _FillVaue is too - # surprising for many users (despite the netcdf docs - # attribute best practices suggesting clients should do - # this). - if validmin is not None: - totalmask += data < validmin - - if validmax is not None: - totalmask += data > validmax - - if fill_value is None and fval is not None: - fill_value = fval - - # If all else fails, use default _FillValue as fill_value for - # masked array. - if fill_value is None: - fill_value = default_fillvals[dtype.str[1:]] - - # Create masked array with computed mask - masked_values = bool(totalmask.any()) - if masked_values: - data = np.ma.masked_array( - data, mask=totalmask, fill_value=fill_value - ) - else: - # Always return masked array, if no values masked. - data = np.ma.masked_array(data) - - # Scalar array with mask=True should be converted to - # np.ma.MaskedConstant to be consistent with slicing - # behavior of masked arrays. - if data.shape == () and data.mask.all(): - # Return a scalar numpy masked constant not a 0-d masked - # array, so that data == np.ma.masked. - data = data[()] - - elif not always_mask and not masked_values: - # Return a regular numpy array if requested and there are - # no missing values - data = np.array(data, copy=False) - - return data - - @classmethod - def _scale(cls, data, attrs): - """TODOHDF.""" - # If variable has scale_factor and add_offset attributes, - # apply them. - scale_factor = attrs.get("scale_factor") - add_offset = attrs.get("add_offset") - try: - if scale_factor is not None: - float(scale_factor) - - if add_offset is not None: - float(add_offset) - except ValueError: - logging.warn( - "invalid scale_factor or add_offset attribute, " - "no unpacking done..." - ) - return data - - if scale_factor is not None and add_offset is not None: - if add_offset != 0.0 or scale_factor != 1.0: - data = data * scale_factor + add_offset - else: - data = data.astype(scale_factor.dtype) - elif scale_factor is not None and scale_factor != 1.0: - # If variable has only scale_factor attribute, rescale. 
-            data = data * scale_factor
-        elif add_offset is not None and add_offset != 0.0:
-            # If variable has only add_offset attribute, add offset.
-            data = data + add_offset
-
-        return data
diff --git a/cfdm/data/maskscale.py b/cfdm/data/maskscale.py
new file mode 100644
index 000000000..7a62b9442
--- /dev/null
+++ b/cfdm/data/maskscale.py
@@ -0,0 +1,355 @@
+import logging
+
+import netCDF4
+import numpy as np
+
+_safecast = netCDF4.utils._safecast
+default_fillvals = netCDF4.default_fillvals
+
+logger = logging.getLogger(__name__)
+
+
+class MaskScale:
+    """TODO."""
+
+    @classmethod
+    def _check_safecast(cls, attname, dtype, attrs):
+        """TODOHDF.
+
+        Check that a variable attribute exists and can be safely
+        cast to the variable data type.
+
+        """
+        # attrs = self.variable.attrs
+        if attname in attrs:
+            attvalue = attrs[attname]
+            att = np.array(attvalue)
+        else:
+            return False, None
+
+        is_safe = True
+        try:
+            atta = np.array(att, dtype)
+        except ValueError:
+            is_safe = False
+        else:
+            is_safe = _safecast(att, atta)
+
+        if not is_safe:
+            logger.warning(
+                f"WARNING: {attname} not used since it cannot "
+                "be safely cast to variable data type"
+            )  # pragma: no cover
+
+        return is_safe, attvalue
+
+    @classmethod
+    def _FillValue(cls, attrs, variable):
+        """TODO."""
+        if "_FillValue" not in attrs:
+            fillvalue = getattr(variable._h5ds, "fillvalue", None)
+            if fillvalue is not None:
+                attrs["_FillValue"] = fillvalue
+            elif variable.dtype.kind == "O":
+                attrs["_FillValue"] = default_fillvals["S1"]
+
+        return attrs
+
+    @classmethod
+    def _attrs(cls, variable):
+        """TODO."""
+        try:
+            return dict(variable.attrs)
+        except AttributeError:
+            return {
+                attr: variable.getncattr(attr) for attr in variable.ncattrs()
+            }
+
+    @classmethod
+    def _mask(
+        cls,
+        data,
+        dtype,
+        attrs,
+        scale=True,
+        always_mask=False,
+        dtype_unsigned_int=None,
+    ):
+        """TODOHDF."""
+        totalmask = np.zeros(data.shape, np.bool_)
+        fill_value = None
+
+        safe_missval, missing_value = cls._check_safecast(
+            "missing_value", dtype, attrs
+        )
+        if safe_missval:
+            mval = np.array(missing_value, dtype)
+            if scale and dtype_unsigned_int is not None:
+                mval = mval.view(dtype_unsigned_int)
+
+            # create mask from missing values.
+            mvalmask = np.zeros(data.shape, np.bool_)
+            if not mval.ndim:  # mval a scalar.
+                mval = (mval,)  # make into iterable.
+
+            for m in mval:
+                # is scalar missing value a NaN?
+                try:
+                    mvalisnan = np.isnan(m)
+                except TypeError:
+                    # isnan fails on some dtypes
+                    mvalisnan = False
+
+                if mvalisnan:
+                    mvalmask += np.isnan(data)
+                else:
+                    mvalmask += data == m
+
+            if mvalmask.any():
+                # Set fill_value for masked array to missing_value (or
+                # 1st element if missing_value is a vector).
+                fill_value = mval[0]
+                totalmask += mvalmask
+
+        # set mask=True for data == fill value
+        safe_fillval, _FillValue = cls._check_safecast(
+            "_FillValue", dtype, attrs
+        )
+        if safe_fillval:
+            fval = np.array(_FillValue, dtype)
+            if scale and dtype_unsigned_int is not None:
+                fval = fval.view(dtype_unsigned_int)
+
+            # is _FillValue a NaN?
+            try:
+                fvalisnan = np.isnan(fval)
+            except Exception:
+                # isnan fails on some dtypes
+                fvalisnan = False
+
+            if fvalisnan:
+                mask = np.isnan(data)
+            elif (data == fval).any():
+                mask = data == fval
+            else:
+                mask = None
+
+            if mask is not None:
+                if fill_value is None:
+                    fill_value = fval
+
+                totalmask += mask
+        else:
+            # Don't return masked array if variable filling is disabled.
+            no_fill = 0
+            # with nogil:
+            #    ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL)
+            # _ensure_nc_success(ierr)
+
+            # if no_fill is not 1, and not a byte variable, then use
+            # default fill value. from
+            # http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c/Fill-Values.html#Fill-Values
+            # "If you need a fill value for a byte variable, it is
+            # recommended that you explicitly define an appropriate
+            # _FillValue attribute, as generic utilities such as
+            # ncdump will not assume a default fill value for byte
+            # variables." Explained here too:
+            # http://www.unidata.ucar.edu/software/netcdf/docs/known_problems.html#ncdump_ubyte_fill
+            # "There should be no default fill values when reading any
+            # byte type, signed or unsigned, because the byte ranges
+            # are too small to assume one of the values should appear
+            # as a missing value unless a _FillValue attribute is set
+            # explicitly." (do this only for non-vlens, since vlens
+            # don't have a default _FillValue)
+            if no_fill != 1 or dtype.str[1:] not in ("u1", "i1"):
+                fillval = np.array(default_fillvals[dtype.str[1:]], dtype)
+                has_fillval = data == fillval
+                # if data is an array scalar, has_fillval will be a
+                # boolean. in that case convert to an array.
+                # if type(has_fillval) == bool:
+                if isinstance(has_fillval, bool):
+                    has_fillval = np.asarray(has_fillval)
+
+                if has_fillval.any():
+                    if fill_value is None:
+                        fill_value = fillval
+
+                    mask = data == fillval
+                    totalmask += mask
+
+        # Set mask=True for data outside valid_min, valid_max.
+        validmin = None
+        validmax = None
+        # If valid_range exists use that, otherwise look for
+        # valid_min, valid_max. No special treatment of byte data as
+        # described at
+        # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html).
+        safe_validrange, valid_range = cls._check_safecast(
+            "valid_range", dtype, attrs
+        )
+        safe_validmin, valid_min = cls._check_safecast(
+            "valid_min", dtype, attrs
+        )
+        safe_validmax, valid_max = cls._check_safecast(
+            "valid_max", dtype, attrs
+        )
+        if safe_validrange and valid_range.size == 2:
+            validmin = np.array(valid_range[0], dtype)
+            validmax = np.array(valid_range[1], dtype)
+        else:
+            if safe_validmin:
+                validmin = np.array(valid_min, dtype)
+
+            if safe_validmax:
+                validmax = np.array(valid_max, dtype)
+
+        if scale:
+            if validmin is not None and dtype_unsigned_int is not None:
+                validmin = validmin.view(dtype_unsigned_int)
+
+            if validmax is not None and dtype_unsigned_int is not None:
+                validmax = validmax.view(dtype_unsigned_int)
+
+        # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html).
+        # "If the data type is byte and _FillValue is not explicitly
+        # defined, then the valid range should include all possible
+        # values. Otherwise, the valid range should exclude the
+        # _FillValue (whether defined explicitly or by default) as
+        # follows. If the _FillValue is positive then it defines a
+        # valid maximum, otherwise it defines a valid minimum."
+        if safe_fillval:
+            fval = np.array(_FillValue, dtype)
+        else:
+            k = dtype.str[1:]
+            if k in ("u1", "i1"):
+                fval = None
+            else:
+                fval = np.array(default_fillvals[k], dtype)
+
+        if dtype.kind != "S":
+            # Don't set mask for character data
+
+            # Setting valid_min/valid_max to the _FillValue is too
+            # surprising for many users (despite the netcdf docs
+            # attribute best practices suggesting clients should do
+            # this).
+ if validmin is not None: + totalmask += data < validmin + + if validmax is not None: + totalmask += data > validmax + + if fill_value is None and fval is not None: + fill_value = fval + + # If all else fails, use default _FillValue as fill_value for + # masked array. + if fill_value is None: + fill_value = default_fillvals[dtype.str[1:]] + + # Create masked array with computed mask + masked_values = totalmask.any() + if masked_values: + data = np.ma.masked_array( + data, mask=totalmask, fill_value=fill_value + ) + else: + # Always return masked array, if no values masked. + data = np.ma.masked_array(data) + + # Scalar array with mask=True should be converted to + # np.ma.MaskedConstant to be consistent with slicing + # behavior of masked arrays. + if data.shape == () and data.mask.all(): + # Return a scalar numpy masked constant not a 0-d masked + # array, so that data == np.ma.masked. + data = data[()] + + elif not always_mask and not masked_values: + # Return a regular numpy array if requested and there are + # no missing values + data = np.array(data, copy=False) + + return data + + @classmethod + def _scale(cls, data, attrs): + """TODOHDF.""" + # If variable has scale_factor and add_offset attributes, + # apply them. + scale_factor = attrs.get("scale_factor") + add_offset = attrs.get("add_offset") + try: + if scale_factor is not None: + float(scale_factor) + + if add_offset is not None: + float(add_offset) + except ValueError: + logging.warn( + "invalid scale_factor or add_offset attribute, " + "no unpacking done..." + ) + return data + + if scale_factor is not None and add_offset is not None: + if add_offset != 0.0 or scale_factor != 1.0: + data = data * scale_factor + add_offset + else: + data = data.astype(scale_factor.dtype) + elif scale_factor is not None and scale_factor != 1.0: + # If variable has only scale_factor attribute, rescale. + data = data * scale_factor + elif add_offset is not None and add_offset != 0.0: + # If variable has only add_offset attribute, add offset. + data = data + add_offset + + return data + + @classmethod + def apply(cls, variable, data, mask=True, scale=True, always_mask=False): + """TODO.""" + attrs = cls._attrs(variable) + + dtype = variable.dtype + data_dtype = data.dtype + kind = data_dtype.kind + if isinstance(data, str): + data = np.array(data, dtype="S") + elif kind in "OSU": + if kind == "S": + data = netCDF4.chartostring(data) + + # Assume that object arrays are arrays of strings + data = data.astype("S", copy=False) + if kind == "O": + dtype = data.dtype + + if mask or scale: + dtype_unsigned_int = None + is_unsigned_int = attrs.get("_Unsigned", False) in ("true", "True") + if is_unsigned_int: + dtype_unsigned_int = ( + f"{data_dtype.byteorder}u{data_dtype.itemsize}" + ) + data = data.view(dtype_unsigned_int) + + if mask: + attrs = cls._FillValue(attrs, variable) + data = cls._mask( + data, + dtype, + attrs, + scale=scale, + always_mask=always_mask, + dtype_unsigned_int=dtype_unsigned_int, + ) + + if scale: + data = cls._scale(data, attrs) + + if data.dtype.kind == "S": + # Assume that object arrays contain strings + data = data.astype("U", copy=False) + + return data diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index bd8648c50..5b93ea076 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -44,6 +44,18 @@ def shape(self): """Shape of the array.""" return self._get_component("shape") + def _get_attr(self, var, attr): + """TODOHDF. + + .. 
versionadded:: (cfdm) HDFVER + + :Parameters: + + """ + raise NotImplementedError( + "Must implement {self.__class__.__name__}._get_attr" + ) # pragma: no cover + def close(self, dataset): """Close the dataset containing the data.""" raise NotImplementedError( diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py index f7a35c1a5..9ef65f9e9 100644 --- a/cfdm/data/mixin/netcdffilemixin.py +++ b/cfdm/data/mixin/netcdffilemixin.py @@ -19,18 +19,6 @@ class NetCDFFileMixin: # """ # return f"<{self.__class__.__name__}{self.shape}: {self}>" - def _get_attr(self, var, attr): - """TODOHDF. - - .. versionadded:: (cfdm) HDFVER - - :Parameters: - - """ - raise NotImplementedError( - "Must implement {self.__class__.__name__}._get_attr" - ) # pragma: no cover - @classmethod def _process_string_and_char(cls, array): """TODOHDF.""" diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 70d967261..3830c1590 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -535,7 +535,7 @@ def file_open(self, filename, flatten=True, verbose=None): # nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) try: if g["no_HDF"]: - print(99999999999999) + print("using netCDF4") raise OSError("Requested to not use HDF to open file") nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) @@ -6174,10 +6174,11 @@ def _create_netcdfarray( return kwargs if g["original_netCDF"]: + # netCDF4 array = self.implementation.initialise_NetCDFArray(**kwargs) else: # h5netcdf - array = self.implementation.initialise_HDFArray(**kwargs) + array = self.implementation.initialise_H5netcdfArray(**kwargs) return array, kwargs diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index c3cb381bc..59c895634 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -276,7 +276,7 @@ def read( ``'s3://object-store/data/file.nc'``, then an ``'endpoint_url'`` key with value ``'https://object-store'`` would be created. - + .. versionadded:: (cfdm) HDFVER _no_HDF: `bool`, optional From 2e7f76f07d83de4c37375b2a593425cebec43512 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 30 Jan 2024 22:57:57 +0000 Subject: [PATCH 20/88] dev --- cfdm/cfdmimplementation.py | 8 +- cfdm/data/data.py | 24 +- cfdm/data/h5netcdfarray.py | 22 +- cfdm/data/maskscale.py | 70 ++++-- cfdm/data/mixin/filearraymixin.py | 2 +- cfdm/data/mixin/netcdffilemixin.py | 72 +++--- cfdm/data/netcdfarray.py | 34 ++- cfdm/flatten.py | 343 +++++--------------------- cfdm/read_write/netcdf/netcdfread.py | 152 ++++++------ cfdm/read_write/netcdf/netcdfwrite.py | 3 +- cfdm/read_write/read.py | 6 +- cfdm/test/test_groups.py | 78 +++++- cfdm/test/test_mask_scale.py | 115 +++++++++ cfdm/test/test_read_write.py | 21 +- 14 files changed, 489 insertions(+), 461 deletions(-) create mode 100644 cfdm/test/test_mask_scale.py diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index 3300039f5..ee098744f 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -1359,10 +1359,10 @@ def get_data_maximum(self, parent): :Returns: - Data instance + Scalar `Data` instance """ - return parent.data.maximum() + return parent.data.maximum(squeeze=True) def get_data_sum(self, parent): """Return the sum of the data. 
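The `squeeze=True` calls in the two `cfdmimplementation.py` hunks here
depend on the new `squeeze` keyword that the `cfdm/data/data.py` hunks
below add to `Data.maximum` and `Data.sum`. As a minimal numpy sketch
of the distinction, assuming only that `squeeze` negates numpy's
`keepdims` (as the `keepdims=not squeeze` lines below do):

import numpy as np

a = np.arange(6).reshape(2, 3)

# squeeze=False (keepdims=True): reduced axes are kept with size 1,
# so the result still broadcasts against the original array.
assert np.amax(a, axis=1, keepdims=True).shape == (2, 1)

# squeeze=True (keepdims=False): reduced axes are dropped, so a
# full reduction yields a genuinely scalar result.
assert np.amax(a, keepdims=False).shape == ()

This is why `get_data_maximum` and `get_data_sum` can now document a
scalar `Data` instance as their return value.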
@@ -1373,10 +1373,10 @@ def get_data_sum(self, parent):
 
         :Returns:
 
-            Data instance
+            Scalar `Data` instance
 
         """
-        return parent.data.sum()
+        return parent.data.sum(squeeze=True)
 
     def get_count(self, construct):
         """Return the count variable of compressed data.
diff --git a/cfdm/data/data.py b/cfdm/data/data.py
index 4f3a935d3..3f3315096 100644
--- a/cfdm/data/data.py
+++ b/cfdm/data/data.py
@@ -2194,7 +2194,7 @@ def _parse_indices(self, indices):
 
         return parsed_indices
 
-    def maximum(self, axes=None):
+    def maximum(self, axes=None, squeeze=False):
         """Return the maximum of an array or the maximum along axes.
 
         Missing data array elements are omitted from the calculation.
@@ -2211,6 +2211,14 @@ def maximum(self, axes=None):
 
             {{axes int examples}}
 
+            squeeze: `bool`, optional
+                If this is set to False, the default, the axes which
+                are reduced are left in the result as dimensions with
+                size one. With this option, the result will broadcast
+                correctly against the original data.
+
+                .. versionadded:: (cfdm) HDFVER
+
         :Returns:
 
             `{{class}}`
@@ -2255,7 +2263,7 @@ def maximum(self, axes=None):
             raise ValueError(f"Can't find maximum of data: {error}")
 
         array = self.array
-        array = np.amax(array, axis=axes, keepdims=True)
+        array = np.amax(array, axis=axes, keepdims=not squeeze)
 
         out = self.copy(array=False)
         out._set_Array(array, copy=False)
@@ -2414,7 +2422,7 @@ def squeeze(self, axes=None, inplace=False):
 
         return d
 
-    def sum(self, axes=None):
+    def sum(self, axes=None, squeeze=False):
         """Return the sum of an array or the sum along axes.
 
         Missing data array elements are omitted from the calculation.
@@ -2429,6 +2437,14 @@ def sum(self, axes=None):
 
             {{axes int examples}}
 
+            squeeze: `bool`, optional
+                If this is set to False, the default, the axes which
+                are reduced are left in the result as dimensions with
+                size one. With this option, the result will broadcast
+                correctly against the original data.
+
+                .. versionadded:: (cfdm) HDFVER
+
         :Returns:
 
             `{{class}}`
@@ -2472,7 +2488,7 @@ def sum(self, axes=None):
         except ValueError as error:
             raise ValueError(f"Can't sum data: {error}")
 
         array = self.array
-        array = np.sum(array, axis=axes, keepdims=True)
+        array = np.sum(array, axis=axes, keepdims=not squeeze)
 
         d = self.copy(array=False)
         d._set_Array(array, copy=False)
diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py
index 1e46e1885..7604167f9 100644
--- a/cfdm/data/h5netcdfarray.py
+++ b/cfdm/data/h5netcdfarray.py
@@ -191,13 +191,13 @@ def __getitem__(self, indices):
 
         Indexing is similar to numpy indexing. The only difference to
         numpy indexing (given the restrictions on the type of indices
-        allowed) is:
+        allowed) is: TODOHDF
 
           * When two or more dimension's indices are sequences of
             integers then these indices work independently along each
             dimension (similar to the way vector subscripts work in
             Fortran).
 
-        .. versionadded:: (cfdm) 1.7.0
+        .. versionadded:: (cfdm) HDFVER
 
         """
         dataset, address = self.open()
@@ -207,11 +207,13 @@ def __getitem__(self, indices):
             groups, address = self.get_groups(address)
             if groups:
-                dataset = self._uuu(dataset, groups)
+                dataset = self._group(dataset, groups)
 
         # Get the variable by netCDF name
         variable = dataset.variables[address]
         array = variable[indices]
+
+        # Apply masking and scaling
         array = MaskScale.apply(
             variable, array, mask=mask, scale=mask, always_mask=False
         )
@@ -222,12 +224,18 @@ def __getitem__(self, indices):
         self.close(dataset0)
         del dataset, dataset0
 
-        if not self.ndim:
-            # Hmm netCDF4 has a thing for making scalar size 1, 1d
-            array = array.squeeze()
-
         return array
 
+    def _get_attr(self, var, attr):
+        """TODOHDF.
+
+        .. versionadded:: (cfdm) HDFVER
+
+        :Parameters:
+
+        """
+        return var.attrs[attr]
+
     def close(self, dataset):
         """Close the dataset containing the data.
 
diff --git a/cfdm/data/maskscale.py b/cfdm/data/maskscale.py
index 7a62b9442..5fb08ff6c 100644
--- a/cfdm/data/maskscale.py
+++ b/cfdm/data/maskscale.py
@@ -47,11 +47,15 @@ def _check_safecast(cls, attname, dtype, attrs):
     def _FillValue(cls, attrs, variable):
         """TODO."""
         if "_FillValue" not in attrs:
-            fillvalue = getattr(variable._h5ds, "fillvalue", None)
-            if fillvalue is not None:
-                attrs["_FillValue"] = fillvalue
-            elif variable.dtype.kind == "O":
-                attrs["_FillValue"] = default_fillvals["S1"]
+            try:
+                fillvalue = getattr(variable._h5ds, "fillvalue", None)
+            except AttributeError:
+                pass
+            else:
+                if fillvalue is not None:
+                    attrs["_FillValue"] = fillvalue
+                elif variable.dtype.kind == "O":
+                    attrs["_FillValue"] = default_fillvals["S1"]
 
         return attrs
 
@@ -162,7 +166,12 @@ def _mask(
             # explicitly." (do this only for non-vlens, since vlens
             # don't have a default _FillValue)
             if no_fill != 1 or dtype.str[1:] not in ("u1", "i1"):
-                fillval = np.array(default_fillvals[dtype.str[1:]], dtype)
+                if dtype.kind == "S":
+                    default_fillval = default_fillvals["S1"]
+                else:
+                    default_fillval = default_fillvals[dtype.str[1:]]
+
+                fillval = np.array(default_fillval, dtype)
                 has_fillval = data == fillval
                 # if data is an array scalar, has_fillval will be a
                 # boolean. in that case convert to an array.
@@ -177,7 +186,7 @@ def _mask(
                     mask = data == fillval
                     totalmask += mask
 
-        # Set mask=True for data outside valid_min, valid_max.
+        # Set mask=True for data outside [valid_min, valid_max]
         validmin = None
         validmax = None
         # If valid_range exists use that, otherwise look for
@@ -224,10 +233,15 @@ def _mask(
             if k in ("u1", "i1"):
                 fval = None
             else:
-                fval = np.array(default_fillvals[k], dtype)
+                if dtype.kind == "S":
+                    default_fillval = default_fillvals["S1"]
+                else:
+                    default_fillval = default_fillvals[k]
+
+                fval = np.array(default_fillval, dtype)
 
         if dtype.kind != "S":
-            # Don't set mask for character data
+            # Don't set validmin/validmax mask for character data
 
             # Setting valid_min/valid_max to the _FillValue is too
             # surprising for many users (despite the netcdf docs
@@ -245,7 +259,10 @@ def _mask(
         # If all else fails, use default _FillValue as fill_value for
         # masked array.
         if fill_value is None:
-            fill_value = default_fillvals[dtype.str[1:]]
+            if dtype.kind == "S":
+                fill_value = default_fillvals["S1"]
+            else:
+                fill_value = default_fillvals[dtype.str[1:]]
 
         # Create masked array with computed mask
         masked_values = totalmask.any()
@@ -308,15 +325,32 @@ def _scale(cls, data, attrs):
 
     @classmethod
     def apply(cls, variable, data, mask=True, scale=True, always_mask=False):
-        """TODO."""
+        """TODO.
+
+        :Parameters:
+
+            variable: `h5netcdf.Variable` or `netCDF4.Variable`
+
+            data: `numpy.ndarray`
+
+            mask: `bool`
+
+            scale: `bool`
+
+            always_mask: `bool`
+
+        :Returns:
+
+            `numpy.ndarray`
+
+        """
+        attrs = cls._attrs(variable)
         dtype = variable.dtype
-        data_dtype = data.dtype
-        kind = data_dtype.kind
+
         if isinstance(data, str):
             data = np.array(data, dtype="S")
-        elif kind in "OSU":
+        elif data.dtype.kind in "OSU":
+            kind = data.dtype.kind
             if kind == "S":
                 data = netCDF4.chartostring(data)
 
             # Assume that object arrays are arrays of strings
             data = data.astype("S", copy=False)
             if kind == "O":
                 dtype = data.dtype
 
-        if mask or scale:
+        if dtype is str:  # isinstance(dtype, str):
+            dtype = data.dtype
+
+        if scale:
             dtype_unsigned_int = None
             is_unsigned_int = attrs.get("_Unsigned", False) in ("true", "True")
             if is_unsigned_int:
+                data_dtype = data.dtype
                 dtype_unsigned_int = (
                     f"{data_dtype.byteorder}u{data_dtype.itemsize}"
                 )
                 data = data.view(dtype_unsigned_int)
 
         if mask:
+            attrs = cls._FillValue(attrs, variable)
             data = cls._mask(
                 data,
                 dtype,
                 attrs,
                 scale=scale,
                 always_mask=always_mask,
                 dtype_unsigned_int=dtype_unsigned_int,
             )
 
         if scale:
             data = cls._scale(data, attrs)
 
         if data.dtype.kind == "S":
             # Assume that object arrays contain strings
             data = data.astype("U", copy=False)
 
         return data
diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py
index 5b93ea076..db5cec47d 100644
--- a/cfdm/data/mixin/filearraymixin.py
+++ b/cfdm/data/mixin/filearraymixin.py
@@ -53,7 +53,7 @@ def _get_attr(self, var, attr):
 
         """
         raise NotImplementedError(
-            "Must implement {self.__class__.__name__}._get_attr"
+            f"Must implement {self.__class__.__name__}._get_attr"
         )  # pragma: no cover
 
     def close(self, dataset):
diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py
index 9ef65f9e9..73d7c149f 100644
--- a/cfdm/data/mixin/netcdffilemixin.py
+++ b/cfdm/data/mixin/netcdffilemixin.py
@@ -1,6 +1,3 @@
-import netCDF4
-import numpy as np
-
 from ..numpyarray import NumpyArray
 
 
@@ -11,40 +8,33 @@ class NetCDFFileMixin:
 
     """
 
-    # def __repr__(self):
-    #     """Called by the `repr` built-in function.
-    #
-    #     x.__repr__() <==> repr(x)
-    #
-    #     """
-    #     return f"<{self.__class__.__name__}{self.shape}: {self}>"
-
-    @classmethod
-    def _process_string_and_char(cls, array):
-        """TODOHDF."""
-        string_type = isinstance(array, str)
-        kind = array.dtype.kind
-        if not string_type and kind in "SU":
-            # Collapse by concatenation the outermost (fastest
-            # varying) dimension of char array into
-            # memory. E.g. [['a','b','c']] becomes ['abc']
-            if kind == "U":
-                array = array.astype("S", copy=False)
-
-            array = netCDF4.chartostring(array)
-            shape = array.shape
-            array = np.array([x.rstrip() for x in array.flat], dtype="U")
-            array = np.reshape(array, shape)
-            array = np.ma.masked_where(array == "", array)
-        elif not string_type and kind == "O":
-            # An N-d (N>=1) string variable comes out as a numpy
-            # object array, so convert it to numpy string array.
-            array = array.astype("U", copy=False)
-
-            # Mask the VLEN variable
-            array = np.ma.where(array == "", np.ma.masked, array)
-
-        return array
+    def _group(self, dataset, groups):
+        """Return the group object containing a variable.
+
+        .. versionadded:: (cfdm) HDFVER
+
+        :Parameters:
+
+            dataset: `netCDF4.Dataset` or `h5netcdf.File`
+                The dataset containing the variable.
+
+            groups: sequence of `str`
+                The definition of which group the variable is in. For
+                instance, if the variable is in group
+                ``/forecast/model`` then *groups* would be
+                ``['forecast', 'model']``.
+
+        :Returns:
+
+            `netCDF4.Dataset` or `netCDF4.Group`
+            or `h5netcdf.File` or `h5netcdf.Group`
+                The group object, which might be the root group.
+
+        """
+        for g in groups:
+            dataset = dataset.groups[g]
+
+        return dataset
 
     def _set_units(self, var):
         """The units and calendar properties.
@@ -57,7 +47,7 @@ def _set_units(self, var): :Parameters: - var: `netCDF4.Variable` + var: `netCDF4.Variable` or `h5netcdf.Variable` The variable containing the units and calendar definitions. @@ -91,12 +81,6 @@ def _set_units(self, var): return units, calendar - def _uuu(self, dataset, groups): - for g in groups: # [:-1]: - dataset = dataset.groups[g] - - return dataset # dataset = dataset.groups[groups[-1]] - @property def array(self): """Return an independent numpy array containing the data. diff --git a/cfdm/data/netcdfarray.py b/cfdm/data/netcdfarray.py index 47980399d..c0e4b1aa0 100644 --- a/cfdm/data/netcdfarray.py +++ b/cfdm/data/netcdfarray.py @@ -208,11 +208,7 @@ def __getitem__(self, indices): if groups: # Traverse the group structure, if there is one (CF>=1.8). - netcdf = self._uuu(netcdf, groups) - # for g in groups[:-1]: - # netcdf = netcdf.groups[g] - # - # netcdf = netcdf.groups[groups[-1]] + netcdf = self._group(netcdf, groups) if isinstance(address, str): # Get the variable by netCDF name @@ -245,7 +241,33 @@ def __getitem__(self, indices): # Hmm netCDF4 has a thing for making scalar size 1, 1d array = array.squeeze() - array = self._process_string_and_char(array) + kind = array.dtype.kind + if not string_type and kind in "SU": + # -------------------------------------------------------- + # Collapse (by concatenation) the outermost (fastest + # varying) dimension of char array into + # memory. E.g. [['a','b','c']] becomes ['abc'] + # -------------------------------------------------------- + if kind == "U": + array = array.astype("S", copy=False) + + array = netCDF4.chartostring(array) + shape = array.shape + array = np.array([x.rstrip() for x in array.flat], dtype="U") + array = np.reshape(array, shape) + array = np.ma.masked_where(array == "", array) + elif not string_type and kind == "O": + # -------------------------------------------------------- + # A netCDF string type N-d (N>=1) variable comes out as a + # numpy object array, so convert it to numpy string array. + # -------------------------------------------------------- + array = array.astype("U", copy=False) + + # -------------------------------------------------------- + # netCDF4 does not auto-mask VLEN variable, so do it here. 
+ # -------------------------------------------------------- + array = np.ma.where(array == "", np.ma.masked, array) + return array def __repr__(self): diff --git a/cfdm/flatten.py b/cfdm/flatten.py index 8ddaa2c3f..120c098ec 100644 --- a/cfdm/flatten.py +++ b/cfdm/flatten.py @@ -155,23 +155,29 @@ class _AttributeProperties(Enum): """ - ancillary_variables = (0, (False, True, True, False, False, False, False)) - bounds = (1, (False, True, True, False, False, False, False)) - cell_measures = (2, (False, True, False, True, False, False, False)) - climatology = (3, (False, True, True, False, False, False, False)) - coordinates = (4, (False, True, True, False, True, False, False)) + # Coordinates + coordinates = (0, (False, True, True, False, True, False, False)) + ancillary_variables = (1, (False, True, True, False, False, False, False)) + climatology = (2, (False, True, True, False, False, False, False)) + bounds = (3, (False, True, True, False, False, False, False)) + # Cell measures + cell_measures = (4, (False, True, False, True, False, False, False)) + # Coordinate references formula_terms = (5, (False, True, False, True, False, False, False)) - geometry = (6, (False, True, True, False, False, False, False)) - grid_mapping = (7, (False, True, True, True, False, False, False)) + grid_mapping = (6, (False, True, True, True, False, False, False)) # Geometry variables + geometry = (7, (False, True, True, False, False, False, False)) interior_ring = (8, (False, True, True, False, False, False, False)) node_coordinates = (9, (False, True, True, False, False, False, False)) node_count = (10, (False, True, True, False, False, False, False)) nodes = (11, (False, True, True, False, False, False, False)) part_node_count = (12, (False, True, True, False, False, False, False)) + # Compression by gathering compress = (13, (True, False, True, False, False, False, False)) + # Discrete sampling geometries instance_dimension = (14, (True, False, True, False, False, False, False)) sample_dimension = (15, (True, False, True, False, False, False, False)) + # Cell methods cell_methods = (16, (2, 1, True, False, False, True, True)) # Domain variable dimensions dimensions = (17, (True, False, True, False, False, False, False)) @@ -345,14 +351,6 @@ def __init__(self, input_ds, lax_mode, _copy_data=True, copy_slices=None): # 'ncattrs', 'path'): # setattr(self, method, getattr(self, f"_{method}_{dataset_type}")) - # def _attrs_netCDF4(self, variable): - # return { - # attr: variable.getncattr(attr) for attr in variable.ncattrs() - # } - # - # def _attrs_h5netcdf(self, variable): - # return variable.attrs - def attrs(self, variable): try: # h5netcdf @@ -363,56 +361,24 @@ def attrs(self, variable): attr: variable.getncattr(attr) for attr in variable.ncattrs() } - # def _chunksizes_h5netcdf(self, variable): - # return variable.chunks - # - # def _chunksizes_netCDF4(self, variable): - # chunking = variable.chunking() - # if chunking == "contiguous": - # return None - def chunksizes(self, variable): + """TODO.""" try: # netCDF4 chunking = variable.chunking() if chunking == "contiguous": return None + + return chunking except AttributeError: # h5netcdf return variable.chunks - # def _contiguous_h5netcdf(self, variable): - # """Whether or not the variable data is contiguous on disk. - # - # See `_contiguous_netCDF4` for details. - # """ - # return variable.chunks is None - # - # def _contiguous_netCDF4(self, variable): - # """Whether or not the variable data is contiguous on disk. 
- # - # :Parameters: - # - # variable: - # The variable. - # - # :Returns: - # - # `bool` - # `True` if the variable data is contiguous on disk, - # otherwise `False`. - # - # **Examples** - # - # >>> f.contiguous(variable) - # False - # - # """ - # return variable.chunking() == "contiguous" - def contiguous(self, variable): """Whether or not the variable data is contiguous on disk. + .. versionadded:: (cfdm) HDFVER + :Parameters: variable: `netCDF4.Variable` or `h5netcdf.Variable` @@ -432,43 +398,16 @@ def contiguous(self, variable): """ try: # netCDF4 - return (variable.chunking() == "contiguous",) + return variable.chunking() == "contiguous" except AttributeError: # h5netcdf return variable.chunks is None - # def data_model(self, dataset): - # """Return the netCDF data model version of the dataset. - # - # :Parameters: - # - # dataset: `netCDF4.Dataset` or `h5netcdf.File` - # The dataset. - # - # :Returns: - # - # `str` - # The data model version, one of ``'NETCDF4'``, - # ``'NETCDF4_CLASSIC'``, ``'NETCDF3_CLASSIC'``, - # ``'NETCDF3_64BIT_OFFSET'``, or - # ``'NETCDF3_64BIT_DATA'``. - # - # **Examples** - # - # >>> f.data_model(dataset) - # 'NETCDF4' - # - # """ - # try: - # # netCDF4 - # return dataset.data_model - # except AttributeError: - # # h5netcdf - # return "NETCDF4" - def dtype(self, variable): """Return the data type of a variable. + .. versionadded:: (cfdm) HDFVER + :Parameters: variable: @@ -476,7 +415,7 @@ def dtype(self, variable): :Returns: - `numpy.dtype` + `numpy.dtype` or `str` The data type. **Examples** @@ -484,6 +423,9 @@ def dtype(self, variable): >>> f.dtype(variable) dtype('>> f.dtype(variable) + str + """ out = variable.dtype if out == "O": @@ -491,38 +433,11 @@ def dtype(self, variable): return out - # def _endian_netCDF4(self, variable): - # """Return the endian-ness of a variable. - # - # :Parameters: - # - # variable: - # The variable. - # - # :Returns: - # - # `str` - # The endian-ness (``'little'``, ``'big'``, or - # ``'native'``) of the variable. - # - # **Examples** - # - # >>> f.endian(variable) - # 'native' - # - # """ - # return variable.endian() - # - # def _endian_h5netcdf(self, variable): - # """Return the endian-ness of a variable. - # - # """ - # dtype = variable.dtype - # return self._dtype_endian_lookup[getattr(dtype, "byteorder", None)] - def endian(self, variable): """Return the endian-ness of a variable. + .. versionadded:: (cfdm) HDFVER + :Parameters: variable: `netCDF4.Variable` or `h5netcdf.Variable` @@ -548,37 +463,11 @@ def endian(self, variable): dtype = variable.dtype return self._dtype_endian_lookup[getattr(dtype, "byteorder", None)] - # def _filepath_netCDF4(self, dataset): - # """Return the file path for the dataset. - # - # :Parameters: - # - # dataset: - # The dataset. - # - # :Returns: - # - # `str` - # The file system path, or the opendap URL, for the - # dataset. - # - # **Examples** - # - # >>> f.filepath(dataset) - # '/home/data/file.nc' - # - # """ - # return dataset.filepath() - # - # def _filepath_h5netcdf(self, dataset): - # """Return the file path for the dataset. - # - # """ - # return dataset.filename - def filepath(self, dataset): """Return the file path for the dataset. + .. versionadded:: (cfdm) HDFVER + :Parameters: dataset: `netCDF4.Dataset` or `h5netcdf.File` @@ -603,89 +492,45 @@ def filepath(self, dataset): # h5netcdf return dataset.filename - # def _get_dims_netCDF4(self, variable): - # """Return. 
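All of these `flatten.py` accessors (`attrs`, `chunksizes`,
`contiguous`, `endian`, `filepath`, and `get_dims` below) follow the
same duck-typing idiom: try the `netCDF4` spelling first and fall back
to the `h5netcdf` equivalent when an `AttributeError` is raised. A
minimal sketch of the idiom, mirroring the `attrs` accessor (the
function name `variable_attrs` is illustrative only, not part of
either library):

def variable_attrs(variable):
    """Return the attributes of a netCDF4 or h5netcdf variable."""
    try:
        # netCDF4.Variable exposes attributes via ncattrs()/getncattr()
        return {a: variable.getncattr(a) for a in variable.ncattrs()}
    except AttributeError:
        # h5netcdf.Variable exposes attributes as the `attrs` mapping
        return dict(variable.attrs)

Keeping the dispatch inside each accessor means the rest of the
flattener never needs to know which library opened the dataset.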
- # - # :Returns: - # - # `str` - # - # """ - # return variable.get_dims() - # - # def _get_dims_h5netcdf(self, variable): - # """Return. - # - # :Returns: - # - # `str` - # - # """ - # out = [] - # dimension_names = list(variable.dimensions) - # group = variable._parent - # while dimension_names: - # for name in dimension_names[:]: - # if name in group.dims: - # out.append(group.dims[name]) - # dimension_names.remove(name) - # - # group = group.parent - # if group is None: - # break - # - # return out - def get_dims(self, variable): """Return. + .. versionadded:: (cfdm) HDFVER + :Returns: - `str` + `list` """ try: + # netCDF4 return variable.get_dims() except AttributeError: - out = [] + # h5netcdf + dims = {} dimension_names = list(variable.dimensions) group = variable._parent - while dimension_names: - for name in dimension_names[:]: - if name in group.dims: - out.append(group.dims[name]) + for name, dim in group.dims.items(): + if name in dimension_names: + dims[name] = dim + dimension_names.remove(name) + + group = group.parent + while group is not None and dimension_names: + for name, dim in group.dims.items(): + if name in dimension_names: + dims[name] = dim dimension_names.remove(name) group = group.parent - if group is None: - break - return out - - # def _getncattr_netCDF4(self, x, attr): - # """Retrieve a netCDF attribute. - # - # :Parameters: - # - # x: variable, group, or dataset - # - # attr: `str` - # - # :Returns: - # - # """ - # return getattr(x, attr) - # - # def _getncattr_h5netcdf(self, x, attr): - # """Retrieve a netCDF attribute. - # - # - # """ - # return x.attrs[attr] + return [dims[name] for name in variable.dimensions] def getncattr(self, x, attr): """Retrieve a netCDF attribute. + .. versionadded:: (cfdm) HDFVER + :Parameters: x: variable, group, or dataset @@ -702,25 +547,11 @@ def getncattr(self, x, attr): # h5netcdf return x.attrs[attr] - # def _group_netCDF4(self, x): - # """Return a. - # - # :Returns: - # - # `Group` - # - # """ - # return x.group() - # - # def _group_h5netcdf(self, x): - # """Return a. - # - # """ - # return x._parent - def group(self, x): """Return a. + .. versionadded:: (cfdm) HDFVER + :Returns: `Group` @@ -733,31 +564,11 @@ def group(self, x): # h5netcdf return x._parent - # def _name_netCDF4(self, x): - # """Return the netCDF name, without its groups. - # - # :Returns: - # - # """ - # return x.name - # - # def _name_h5netcdf(self, x): - # """Return the netCDF name, without its groups. - # - # :Returns: - # - # `str` - # - # """ - # out = x.name - # if "/" in out: - # out = x.name.split("/")[-1] - # - # return out - def name(self, x): """Return the netCDF name, without its groups. + .. versionadded:: (cfdm) HDFVER + :Returns: `str` @@ -770,29 +581,11 @@ def name(self, x): return out - # def _ncattrs_netCDF4(self, x): - # """Return netCDF attribute names. - # - # :Parameters: - # - # x: variable, group, or dataset - # - # :Returns: - # - # `list` - # - # """ - # return x.ncattrs() - # - # def _ncattrs_h5netcdf(self, x): - # """Return netCDF attribute names. - # - # """ - # return list(x.attrs) - def ncattrs(self, x): """Return netCDF attribute names. + .. versionadded:: (cfdm) HDFVER + :Parameters: x: variable, group, or dataset @@ -812,6 +605,8 @@ def ncattrs(self, x): def parent(self, group): """Return a simulated unix directory path to a group. + .. 
versionadded:: (cfdm) HDFVER + :Returns: `str` @@ -822,32 +617,11 @@ def parent(self, group): except AttributeError: return - # def _path_netCDF4(self, group): - # """Return a simulated unix directory path to a group. - # - # :Returns: - # - # `str` - # - # """ - # return group.path - # - # def _path_h5netcdf(self, group): - # """Return a simulated unix directory path to a group. - # - # :Returns: - # - # `str` - # - # """ - # try: - # return group.name - # except AttributeError: - # return "/" - def path(self, group): """Return a simulated unix directory path to a group. + .. versionadded:: (cfdm) HDFVER + :Returns: `str` @@ -1008,7 +782,6 @@ def flatten_variable(self, var): # Replace old by new dimension names # new_dims = list(map(lambda x: self.__dim_map[self.pathname(x.group(), x.name)], var.get_dims())) - new_dims = list( map( lambda x: self.__dim_map[ diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 3830c1590..e3c66cdc5 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -16,13 +16,13 @@ import h5netcdf import netCDF4 -import netcdf_flattener import numpy as np from packaging.version import Version from s3fs import S3FileSystem from ...decorators import _manage_log_level_via_verbosity -from ...flatten import flatten as flatten2 +from ...flatten import _Flattener +from ...flatten import flatten as netcdf_flatten from ...functions import is_log_level_debug, is_log_level_detail from .. import IORead @@ -30,7 +30,8 @@ _cached_temporary_files = {} -_flattener_separator = netcdf_flattener._Flattener._Flattener__new_separator +# _flattener_separator = netcdf_flattener._Flattener._Flattener__new_separator +_flattener_separator = _Flattener._Flattener__new_separator @dataclass() @@ -503,6 +504,7 @@ def file_open(self, filename, flatten=True, verbose=None): netCDF = False HDF = False + library = g["library"] # Deal with an file in an S3 object store u = urlparse(filename) @@ -532,25 +534,35 @@ def file_open(self, filename, flatten=True, verbose=None): f" s3: s3fs.S3FileSystem options: {s3}\n" ) # pragma: no cover - # nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) - try: - if g["no_HDF"]: - print("using netCDF4") - raise OSError("Requested to not use HDF to open file") - - nc = h5netcdf.File(filename, "r", decode_vlen_strings=True) - HDF = True - except OSError: - # File could not be read by h5netcdf, or we've insisted - # that we don't use h5netcdf, so try to open it with - # netCDF4. 
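As a minimal, hedged sketch of the backend dispatch that the hunk below implements: the helper name `open_dataset` is hypothetical, but both constructor calls are the ones used by this patch, and with `library=None` the reader tries netCDF4 first and falls back to h5netcdf.

    import h5netcdf
    import netCDF4

    def open_dataset(filename, library=None):
        # An explicit choice of backend is honoured as-is
        if library == "netCDF4":
            return netCDF4.Dataset(filename, "r")

        if library == "h5netcdf":
            return h5netcdf.File(filename, "r", decode_vlen_strings=True)

        # Default: netCDF4 first, with h5netcdf as the fallback
        try:
            return netCDF4.Dataset(filename, "r")
        except Exception:
            return h5netcdf.File(filename, "r", decode_vlen_strings=True)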
+ if library is None:
+ try:
+ nc = self._open_netCDF4(filename)
+ netCDF = True
+ except Exception:
+ # File could not be read by netCDF4, so try to open it
+ # with h5netcdf
+ try:
+ nc = self._open_h5netcdf(filename)
+ HDF = True
+ except Exception as error:
+ raise error
+
+ elif library == "netCDF4":
+ try:
+ nc = self._open_netCDF4(filename)
+ netCDF = True
+ except Exception as error:
+ raise error
+
+ elif library == "h5netcdf":
+ try:
+ nc = self._open_h5netcdf(filename)
+ HDF = True
+ except Exception as error:
+ raise error
+
+ else:
+ raise ValueError(
+ f"Unknown library={library!r}. Valid values are "
+ "None, 'netCDF4', and 'h5netcdf'"
+ )

 g["original_HDF"] = HDF
 g["original_netCDF"] = netCDF
@@ -559,20 +571,6 @@ def file_open(self, filename, flatten=True, verbose=None):
 # If the file has a group structure then flatten it (CF>=1.8)
 # ------------------------------------------------------------
 if flatten and nc.groups:
- # if HDF:
- # # TODOHDF: Can't yet use HDF access to process groups
- # logger.warning(
- # "WARNING: Using netCDF4 (rather than h5netcdf) "
- # f"to access file {filename} containing groups"
- # ) # pragma: no cover
- # nc.close()
- # HDF = False
- # try:
- # nc = netCDF4.Dataset(filename, "r")
- # netCDF = True
- # except RuntimeError as error:
- # raise RuntimeError(f"{error}: {filename}")
-
 # Create a diskless, non-persistent container for the
 # flattened file
 flat_file = tempfile.NamedTemporaryFile(
@@ -590,7 +588,7 @@ def file_open(self, filename, flatten=True, verbose=None):

 # Flatten the file
 # netcdf_flattener.flatten(
- flatten2(nc, flat_nc, lax_mode=True, _copy_data=False)
+ netcdf_flatten(nc, flat_nc, lax_mode=True, _copy_data=False)

 # Store the original grouped file. This is primarily
 # because the unlimited dimensions in the flattened
@@ -611,6 +609,14 @@ def file_open(self, filename, flatten=True, verbose=None):
 g["nc"] = nc
 return nc

+ def _open_netCDF4(self, filename):
+ """Open a netCDF dataset file with the netCDF4 library."""
+ return netCDF4.Dataset(filename, "r")
+
+ def _open_h5netcdf(self, filename):
+ """Open a netCDF dataset file with the h5netcdf library."""
+ return h5netcdf.File(filename, "r", decode_vlen_strings=True)
+
 @classmethod
 def cdl_to_netcdf(cls, filename):
 """Create a temporary netCDF-4 file from a CDL text file.
@@ -866,7 +872,7 @@ def read(
 domain=False,
 s3=None,
 _s3_file_systems=None,
- _no_HDF=False,
+ library=None,
 ):
 """Reads a netCDF dataset from file or OPeNDAP URL.
@@ -922,7 +928,7 @@ def read(

 .. versionadded:: (cfdm) HDFVER

- _no_HDF: `bool`, optional
+ library: `None` or `str`, optional
 See `cfdm.read` for details

 ..
versionadded:: (cfdm) HDFVER @@ -1027,10 +1033,10 @@ def read( # "cfa": False, # -------------------------------------------------------- - # HDF + # Library # -------------------------------------------------------- # - "no_HDF": _no_HDF, + "library": library, # -------------------------------------------------------- # S3 # -------------------------------------------------------- @@ -1126,16 +1132,23 @@ def read( # for attr in map(str,nc.ncattrs()): for attr, value in self._file_global_attributes().items(): attr = str(attr) - try: - if isinstance(value, str): - try: - global_attributes[attr] = str(value) - except UnicodeEncodeError: - global_attributes[attr] = value.encode(errors="ignore") - else: - global_attributes[attr] = value - except UnicodeDecodeError: - pass + if isinstance(value, bytes): + value = value.decode(errors="ignore") + + global_attributes[attr] = value + # print (attr, value, type(value)) + + # var + # try: + # if isinstance(value, str): + # try: + # global_attributes[attr] = str(value) + # except UnicodeEncodeError: + # global_attributes[attr] = value.encode(errors="ignore") + # else: + # global_attributes[attr] = value.decode('utf-8') + # except UnicodeDecodeError: + # pass g["global_attributes"] = global_attributes if is_log_level_debug(logger): @@ -2448,6 +2461,7 @@ def _parse_indexed_contiguous_compression( elements_per_profile = contiguous["count_variable"] instance_dimension_size = indexed["instance_dimension_size"] + element_dimension_1_size = int(profiles_per_instance.max()) element_dimension_2_size = int( self.implementation.get_data_maximum(elements_per_profile) @@ -2710,7 +2724,11 @@ def _parse_geometry(self, parent_ncvar, attributes): for cell_no in range( self.implementation.get_data_size(nodes_per_geometry) ): - n_nodes_in_this_cell = int(nodes_per_geometry_data[cell_no]) + n_nodes_in_this_cell = int( + self.implementation.get_array( + nodes_per_geometry_data[cell_no] + )[0] + ) # Initialise partial_node_count, a running count of # how many nodes there are in this geometry @@ -2718,7 +2736,9 @@ def _parse_geometry(self, parent_ncvar, attributes): for k in range(i, total_number_of_parts): index.data[k] = instance_index - n_nodes += int(parts_data[k]) + n_nodes += int( + self.implementation.get_array(parts_data[k])[0] + ) if n_nodes >= n_nodes_in_this_cell: instance_index += 1 i += k + 1 @@ -4769,7 +4789,7 @@ def _is_char_or_string(self, ncvar): """ datatype = self.read_vars["variables"][ncvar].dtype - return datatype == str or datatype.kind in "SU" + return datatype == str or datatype.kind in "OSU" def _is_char(self, ncvar): """Return True if the netCDf variable has char datatype. @@ -6093,7 +6113,7 @@ def _create_netcdfarray( return None dtype = variable.dtype - if dtype is str: + if dtype is str or dtype.kind == "O": # netCDF string types have a dtype of `str`, which needs # to be reset as a numpy.dtype, but we don't know what # without reading the data, so set it to None for now. 
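To illustrate that string special case standalone: netCDF4 reports a variable-length string variable's dtype as the Python `str` type, whereas h5netcdf reports a numpy object dtype (kind ``"O"``). Below is a hedged sketch of the normalisation applied here; `normalise_dtype` is an illustrative name, not part of the patch.

    import numpy as np

    def normalise_dtype(variable):
        # `str` (netCDF4) or kind "O" (h5netcdf) marks a string
        # variable whose real dtype is unknown until the data have
        # been read, so defer by returning None for now
        dtype = variable.dtype
        if dtype is str or getattr(dtype, "kind", None) == "O":
            return None

        return np.dtype(dtype)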
@@ -6104,7 +6124,6 @@ def _create_netcdfarray( ndim = variable.ndim shape = variable.shape - # size = variable.size size = self._file_variable_size(variable) if size < 2: @@ -7328,14 +7347,9 @@ def _create_ragged_contiguous_array( `RaggedContiguousArray` """ - # uncompressed_ndim = len(uncompressed_shape) - # uncompressed_size = int(reduce(operator.mul, uncompressed_shape, 1)) - return self.implementation.initialise_RaggedContiguousArray( compressed_array=ragged_contiguous_array, - # ndim=uncompressed_ndim, shape=uncompressed_shape, - # size=uncompressed_size, count_variable=count_variable, ) @@ -7354,14 +7368,9 @@ def _create_ragged_indexed_array( `RaggedIndexedArray` """ - # uncompressed_ndim = len(uncompressed_shape) - # uncompressed_size = int(reduce(operator.mul, uncompressed_shape, 1)) - return self.implementation.initialise_RaggedIndexedArray( compressed_array=ragged_indexed_array, - # ndim=uncompressed_ndim, shape=uncompressed_shape, - # size=uncompressed_size, index_variable=index_variable, ) @@ -7381,14 +7390,9 @@ def _create_ragged_indexed_contiguous_array( `RaggedIndexedContiguousArray` """ - # uncompressed_ndim = len(uncompressed_shape) - # uncompressed_size = int(reduce(operator.mul, uncompressed_shape, 1)) - return self.implementation.initialise_RaggedIndexedContiguousArray( compressed_array=ragged_indexed_contiguous_array, - # ndim=uncompressed_ndim, shape=uncompressed_shape, - # size=uncompressed_size, count_variable=count_variable, index_variable=index_variable, ) @@ -10029,6 +10033,7 @@ def _ugrid_check_connectivity_variable( return ok def _file_global_attributes(self): + """TODOHDF.""" g = self.read_vars nc = g["nc"] if g["netCDF"]: @@ -10039,17 +10044,20 @@ def _file_global_attributes(self): return nc.attrs def _file_dimensions(self): + """TODOHDF.""" g = self.read_vars return g["nc"].dimensions def _file_dimension(self, dim_name): + """TODOHDF.""" return self._file_dimensions()[dim_name] def _file_dimension_isunlimited(self, dim_name): return self._file_dimension(dim_name).isunlimited() def _file_dimension_size(self, dim_name): - return self._file_dimensions(dim_name).size + """TODOHDF.""" + return self._file_dimension(dim_name).size def _file_variables(self): """TOODHDF.""" @@ -10057,9 +10065,11 @@ def _file_variables(self): return g["nc"].variables def _file_variable(self, var_name): + """TODOHDF.""" return self._file_variables()[var_name] def _file_variable_attributes(self, var, names_only=False): + """TODOHDF.""" g = self.read_vars if not names_only: if g["netCDF"]: @@ -10077,9 +10087,11 @@ def _file_variable_attributes(self, var, names_only=False): return list(var.attrs) def _file_variable_dimensions(self, var): + """TODOHDF.""" return var.dimensions def _file_variable_size(self, var): + """TODOHDF.""" try: # netCDF4 return var.size diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 5669ccced..e39fed2a4 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -2661,7 +2661,8 @@ def _write_netcdf_variable( if g["dry_run"]: return - logger.info(f" Writing {cfvar!r}") # pragma: no cover + # print (ncvar, repr(cfvar.properties())) + # logger.info(f" Writing {cfvar!r}") # pragma: no cover # Set 'construct_type' if not construct_type: diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 59c895634..e6cd8b5f4 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -18,7 +18,7 @@ def read( mask=True, domain=False, s3=None, - _no_HDF=False, + library=None, 
_implementation=_implementation,
 ):
 """Read field or domain constructs from a dataset.
@@ -279,7 +279,7 @@ def read(

 .. versionadded:: (cfdm) HDFVER

- _no_HDF: `bool`, optional
+ library: `None` or `str`, optional
 The library with which to open the dataset: ``'netCDF4'``
 or ``'h5netcdf'``. If `None` (the default) then the file
 is opened with netCDF4, falling back to h5netcdf if that
 fails.

 .. versionadded:: (cfdm) HDFVER
@@ -357,7 +357,7 @@ def read(
 mask=mask,
 domain=domain,
 s3=s3,
- _no_HDF=_no_HDF,
+ library=library,
 extra_read_vars=None,
 )
 except MaskError:
diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py
index 3610a5439..313c25411 100644
--- a/cfdm/test/test_groups.py
+++ b/cfdm/test/test_groups.py
@@ -46,6 +46,30 @@ def _remove_tmpfiles():
 class GroupsTest(unittest.TestCase):
 """Test treatment of netCDF4 files with hierarchical groups."""

+ def _check_h5netcdf_groups(self, h5, nc):
+ """Check that an h5netcdf read gives the same results as netCDF4.
+
+ :Parameters:
+
+ h5: `Field`
+
+ nc: `Field`
+
+ :Returns:
+
+ `None`
+
+ """
+ self.assertTrue(h5.equals(nc, verbose=3))
+ self.assertEqual(h5.nc_variable_groups(), nc.nc_variable_groups())
+ for key, ch5 in h5.constructs.items():
+ if hasattr(ch5, "nc_variable_groups"):
+ self.assertEqual(
+ ch5.nc_variable_groups(),
+ nc.constructs[key].nc_variable_groups(),
+ key,
+ )
+
 def setUp(self):
 """Preparations called immediately before each test method."""
 # Disable log messages to silence expected warnings
@@ -65,7 +89,7 @@ def test_groups(self):
 ungrouped_file = ungrouped_file1
 grouped_file = grouped_file1

- grouped_file = "delme_grouped.nc"
+ # grouped_file = "delme_grouped.nc"

 # Add a second grid mapping
 datum = cfdm.Datum(parameters={"earth_radius": 7000000})
@@ -108,7 +132,8 @@ def test_groups(self):

 h = cfdm.read(grouped_file, verbose=1)
 self.assertEqual(len(h), 1, repr(h))
- self.assertTrue(f.equals(h[0], verbose=2))
+ h = h[0]
+ self.assertTrue(f.equals(h))

 # ------------------------------------------------------------
 # Move constructs one by one to the /forecast group. 
The order @@ -138,7 +163,7 @@ def test_groups(self): # Check that the field construct hasn't changed h = cfdm.read(grouped_file, verbose=1) - self.assertEqual(len(h), 1, repr(h)) + self.assertEqual(len(h), 1) self.assertTrue(f.equals(h[0], verbose=2), name) # ------------------------------------------------------------ @@ -155,9 +180,15 @@ def test_groups(self): ) nc.close() - h = cfdm.read(grouped_file, verbose="WARNING") - self.assertEqual(len(h), 1, repr(h)) - self.assertTrue(f.equals(h[0], verbose=2)) + h = cfdm.read(grouped_file, library="netCDF4", verbose="WARNING") + self.assertEqual(len(h), 1) + h = h[0] + self.assertTrue(f.equals(h, verbose=2)) + + # Check that h5netcdf reads the file correctly + h5 = cfdm.read(grouped_file, library="h5netcdf") + self.assertEqual(len(h5), 1) + self._check_h5netcdf_groups(h5[0], h) def test_groups_geometry(self): """Test that geometries are considered in the correct groups.""" @@ -284,20 +315,26 @@ def test_groups_geometry(self): # Check that the field construct hasn't changed h = cfdm.read(grouped_file, verbose=1) self.assertEqual(len(h), 1, repr(h)) - self.assertTrue(f.equals(h[0], verbose=2)) + h = h[0] + self.assertTrue(f.equals(h, verbose=2)) + + # Check that h5netcdf reads the file correctly + h5 = cfdm.read(grouped_file, library="h5netcdf") + self.assertEqual(len(h5), 1) + self._check_h5netcdf_groups(h5[0], h) def test_groups_compression(self): """Test the compression of hierarchical groups.""" f = cfdm.example_field(4) - ungrouped_file = ungrouped_file3 + ungrouped_file = "ungrouped_file3.nc" grouped_file = grouped_file3 f.compress("indexed_contiguous", inplace=True) f.data.get_count().nc_set_variable("count") f.data.get_index().nc_set_variable("index") - cfdm.write(f, ungrouped_file, verbose=1) + cfdm.write(f, ungrouped_file) g = cfdm.read(ungrouped_file)[0] self.assertTrue(f.equals(g, verbose=2)) @@ -351,7 +388,13 @@ def test_groups_compression(self): h = cfdm.read(grouped_file, verbose=1) self.assertEqual(len(h), 1, repr(h)) - self.assertTrue(f.equals(h[0], verbose=2)) + h = h[0] + self.assertTrue(f.equals(h, verbose=2)) + + # Check that h5netcdf reads the file correctly + h5 = cfdm.read(grouped_file, library="h5netcdf") + self.assertEqual(len(h5), 1) + self._check_h5netcdf_groups(h5[0], h) def test_groups_dimension(self): """Test the dimensions of hierarchical groups.""" @@ -421,6 +464,11 @@ def test_groups_dimension(self): h = h[0] self.assertTrue(f.equals(h, verbose=3)) + # Check that h5netcdf reads the file correctly + h5 = cfdm.read(grouped_file, library="h5netcdf") + self.assertEqual(len(h5), 1) + self._check_h5netcdf_groups(h5[0], h) + def test_groups_unlimited_dimension(self): """Test the group behaviour of an unlimited dimension.""" f = cfdm.example_field(0) @@ -451,12 +499,18 @@ def test_groups_unlimited_dimension(self): f.nc_set_variable_groups(["forecast", "model"]) grouped_file = grouped_file5 + cfdm.write(f, grouped_file5, verbose=1) - h = cfdm.read(grouped_file, verbose=1) + h = cfdm.read(grouped_file, library="netCDF4") self.assertEqual(len(h), 1) h = h[0] - self.assertTrue(f.equals(h, verbose=3)) + self.assertTrue(f.equals(h)) + + # Check that h5netcdf reads the file correctly + h5 = cfdm.read(grouped_file, library="h5netcdf") + self.assertEqual(len(h5), 1) + self._check_h5netcdf_groups(h5[0], h) def test_groups_identical_coordinates(self): """Test for identical coordinates in different groups.""" diff --git a/cfdm/test/test_mask_scale.py b/cfdm/test/test_mask_scale.py new file mode 100644 index 000000000..2139cc668 --- 
/dev/null +++ b/cfdm/test/test_mask_scale.py @@ -0,0 +1,115 @@ +import atexit +import datetime +import faulthandler +import os +import tempfile +import unittest + +import netCDF4 +import numpy as np + +import cfdm + +faulthandler.enable() # to debug seg faults and timeouts + +n_tmpfiles = 1 +tmpfiles = [ + tempfile.mkstemp("_test_geometry.nc", dir=os.getcwd())[1] + for i in range(n_tmpfiles) +] +(tempfile,) = tmpfiles + + +def _remove_tmpfiles(): + """Remove temporary files created during tests.""" + for f in tmpfiles: + try: + os.remove(f) + except OSError: + pass + + +atexit.register(_remove_tmpfiles) + + +class MaskScaleTest(unittest.TestCase): + """Test the masking and scaling of netCDF data.""" + + def test_mask(self): + """Test CF masking.""" + f0 = cfdm.example_field(0) + f0.del_property("missing_value", None) + f0.del_property("_FillValue", None) + fields = [f0.copy()] + + f0.data[1, :] = np.ma.masked + fields.append(f0) + + f = f0.copy() + f.set_property("missing_value", 999) + fields.append(f) + + f = f0.copy() + f.set_property("_FillValue", 999) + fields.append(f) + + f = f0.copy() + valid_min = f.array.min() * 1.1 + f.set_property("valid_min", valid_min) + fields.append(f) + + f = f0.copy() + valid_max = f.array.max() * 0.9 + f.set_property("valid_max", valid_max) + fields.append(f) + + f = f0.copy() + f.set_property("valid_range", [valid_min, valid_max]) + fields.append(f) + + cfdm.write(fields, tempfile, warn_valid=False) + + fh5 = cfdm.read(tempfile, library="h5netcdf") + fnc = cfdm.read(tempfile, library="netCDF4") + for h, n in zip(fh5, fnc): + self.assertTrue(h.data.mask.equals(n.data.mask)) + + def test_scale(self): + """Test CF scaling.""" + f = cfdm.example_field(0) + + array = np.ma.arange(40, dtype="int32").reshape(f.shape) + array[1, :] = np.ma.masked + + data = cfdm.Data(array, units=f.get_property("units")) + f.set_data(data) + scale_factor = 0.5 + add_offset = 10.0 + f.set_property("scale_factor", scale_factor) + f.set_property("add_offset", add_offset) + f.set_property("missing_value", 999) + + cfdm.write(f, tempfile) + x = cfdm.read(tempfile)[0] + + nc = netCDF4.Dataset(tempfile, "r") + q = nc.variables["q"] + q.set_auto_maskandscale(False) + + raw = (array - add_offset) / scale_factor + raw[1, :] = 999 + raw = raw.astype(array.dtype) + self.assertEqual(q.dtype, raw.dtype) + self.assertTrue((q[...] == raw).all()) + nc.close() + + x = x.array + self.assertTrue((x.mask == array.mask).all()) + self.assertTrue((x == array).all()) + + +if __name__ == "__main__": + print("Run date:", datetime.datetime.now()) + cfdm.environment() + print() + unittest.main(verbosity=2) diff --git a/cfdm/test/test_read_write.py b/cfdm/test/test_read_write.py index 5e9c0ab8a..0c6210212 100644 --- a/cfdm/test/test_read_write.py +++ b/cfdm/test/test_read_write.py @@ -671,7 +671,8 @@ def test_read_CDL(self): def test_read_write_string(self): """Test the `string` keyword argument to `read` and `write`.""" - f = cfdm.read(self.string_filename) + f = cfdm.read(self.string_filename, library="netCDF4") + fh = cfdm.read(self.string_filename, library="h5netcdf") n = int(len(f) / 2) @@ -684,6 +685,10 @@ def test_read_write_string(self): f[j].data.equals(f[i].data, verbose=3), f"{f[j]!r} {f[i]!r}" ) + # Check that netCDF4 and h5netcdf give the same results + for i, j in zip(f, fh): + self.assertTrue(i.data.equals(j.data)) + # Note: Don't loop round all netCDF formats for better # performance. 
Just one netCDF3 and one netCDF4 format # is sufficient to test the functionality @@ -926,8 +931,8 @@ def test_write_omit_data(self): g = g[0] # Check that the data are missing - self.assertFalse(g.array.count()) - self.assertFalse(g.construct("grid_latitude").array.count()) + self.assertFalse(np.ma.count(g.array)) + self.assertFalse(np.ma.count(g.construct("grid_latitude").array)) # Check that a dump works g.dump(display=False) @@ -937,16 +942,16 @@ def test_write_omit_data(self): # Check that only the field and dimension coordinate data are # missing - self.assertFalse(g.array.count()) - self.assertFalse(g.construct("grid_latitude").array.count()) - self.assertTrue(g.construct("latitude").array.count()) + self.assertFalse(np.ma.count(g.array)) + self.assertFalse(np.ma.count(g.construct("grid_latitude").array)) + self.assertTrue(np.ma.count(g.construct("latitude").array)) cfdm.write(f, tmpfile, omit_data="field") g = cfdm.read(tmpfile)[0] # Check that only the field data are missing - self.assertFalse(g.array.count()) - self.assertTrue(g.construct("grid_latitude").array.count()) + self.assertFalse(np.ma.count(g.array)) + self.assertTrue(np.ma.count(g.construct("grid_latitude").array)) def test_read_write_domain_ancillary(self): """Test when domain ancillary equals dimension coordinate.""" From f36c14386c75001702059ce9245b4fd35363a88d Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 31 Jan 2024 13:09:12 +0000 Subject: [PATCH 21/88] dev --- cfdm/flatten.py | 1498 -------------------------- cfdm/read_write/netcdf/netcdfread.py | 5 +- 2 files changed, 3 insertions(+), 1500 deletions(-) delete mode 100644 cfdm/flatten.py diff --git a/cfdm/flatten.py b/cfdm/flatten.py deleted file mode 100644 index 120c098ec..000000000 --- a/cfdm/flatten.py +++ /dev/null @@ -1,1498 +0,0 @@ -"""Project: NetCDF Flattener - -Copyright (c) 2020 EUMETSAT -License: Apache License 2.0 - -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an -"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -KIND, either express or implied. See the License for the -specific language governing permissions and limitations -under the License. - -This code has been modified from the original found in the -netcdf_flattener package. - -""" - -import hashlib -import logging -import re -import warnings -from enum import Enum - - -def flatten( - input_ds, output_ds, lax_mode=False, _copy_data=True, copy_slices=None -): - """Create a flattened version of a netCDF dataset. - - For variable that are too big to fit in memory, the optional - "copy_slices" input allows to copy some or all of the variables in - slices. - - :param input_ds: input netcdf4 dataset - :param output_ds: output netcdf4 dataset - :param lax_mode: if false (default), not resolving a reference halts the execution. If true, continue with warning. - :param _copy_data: if true (default), then all data arrays are copied from the input to the output dataset. - If false, then this does not happen. 
- Use this option *only* if the data arrays of the flattened dataset are never to be accessed.
- If false, then consider setting the fill mode for the output netCDF4 dataset to "off" for improved performance.
- :param copy_slices: dictionary containing variable_name: shape pairs, where variable_name is the path to the
- variable name in the original Dataset (for instance /group1/group2/my_variable), and shape is either None for
- using the default slice value, or a custom slicing shape in the form of a tuple of the same dimension as the variable
- (for instance (1000,2000,1500,) for a 3-dimensional variable). If a variable from the Dataset is not contained
- in the dict, it will not be sliced and will be copied normally.
-
- """
- _Flattener(
- input_ds, lax_mode, _copy_data=_copy_data, copy_slices=copy_slices
- ).flatten(output_ds)
-
-
- def parse_var_attr(input_str):
- """Parse variable attribute of any form into a dict:
-
- * 'time' -> OrderedDict([('time', [])])
- * 'lat lon' -> OrderedDict([('lat', []), ('lon', [])])
- * 'area: time volume: lat lon' -> OrderedDict([('area', ['time']), ('volume', ['lat', 'lon'])])
-
- :param input_str: string to parse
- :return: parsed string in an OrderedDict
-
- """
-
- def subst(s):
- """Substitute tokens for WORD and SEP."""
- return s.replace("WORD", r"[A-Za-z0-9_#/.\(\)]+").replace(
- "SEP", r"(\s+|$)"
- )
-
- # Regex for 'dict form': "k1: v1 v2 k2: v3"
- pat_value = subst("(?P<value>WORD)SEP")
- pat_values = "({})*".format(pat_value)
- pat_mapping = subst(
- "(?P<mapping_name>WORD):SEP(?P<values>{})".format(pat_values)
- )
- pat_mapping_list = "({})+".format(pat_mapping)
-
- # Regex for 'list form': "v1 v2 v3" (including single-item form)
- pat_list_item = subst("(?P<list_item>WORD)SEP")
- pat_list = "({})+".format(pat_list_item)
-
- # Regex for any form:
- pat_all = subst(
- "((?P<list>{})|(?P<mapping_list>{}))$".format(
- pat_list, pat_mapping_list
- )
- )
-
- m = re.match(pat_all, input_str)
-
- # Output is always a dict. If input form is a list, dict values are set as empty lists
- out = {} # collections.OrderedDict()
-
- if m is not None:
- list_match = m.group("list")
- # Parse as a list
- if list_match:
- for mapping in re.finditer(pat_list_item, list_match):
- item = mapping.group("list_item")
- out[item] = None
- # Parse as a dict:
- else:
- mapping_list = m.group("mapping_list")
- for mapping in re.finditer(pat_mapping, mapping_list):
- term = mapping.group("mapping_name")
- values = [
- value.group("value")
- for value in re.finditer(
- pat_value, mapping.group("values")
- )
- ]
- out[term] = values
- else:
- raise ReferenceException(
- "Error while parsing attribute value: '{}'".format(input_str)
- )
-
- return out
-
-
- def generate_var_attr_str(d):
- """Re-generate the attribute string from a dictionary.
-
- :param d: dictionary
- :return: valid attribute string
-
- """
- parsed_list = []
- for k, v in d.items():
- if v is None:
- parsed_list.append(k)
- elif not v:
- parsed_list.append("{}:".format(k))
- else:
- parsed_list.append(k + ": " + (" ".join(v)))
- return " ".join(parsed_list)
-
-
- class _AttributeProperties(Enum):
- """Utility class containing the properties for each attribute.
-
- For each variable attribute, defines how contained references to
- dimensions and variables should be parsed and processed. 
- - """ - - # Coordinates - coordinates = (0, (False, True, True, False, True, False, False)) - ancillary_variables = (1, (False, True, True, False, False, False, False)) - climatology = (2, (False, True, True, False, False, False, False)) - bounds = (3, (False, True, True, False, False, False, False)) - # Cell measures - cell_measures = (4, (False, True, False, True, False, False, False)) - # Coordinate references - formula_terms = (5, (False, True, False, True, False, False, False)) - grid_mapping = (6, (False, True, True, True, False, False, False)) - # Geometry variables - geometry = (7, (False, True, True, False, False, False, False)) - interior_ring = (8, (False, True, True, False, False, False, False)) - node_coordinates = (9, (False, True, True, False, False, False, False)) - node_count = (10, (False, True, True, False, False, False, False)) - nodes = (11, (False, True, True, False, False, False, False)) - part_node_count = (12, (False, True, True, False, False, False, False)) - # Compression by gathering - compress = (13, (True, False, True, False, False, False, False)) - # Discrete sampling geometries - instance_dimension = (14, (True, False, True, False, False, False, False)) - sample_dimension = (15, (True, False, True, False, False, False, False)) - # Cell methods - cell_methods = (16, (2, 1, True, False, False, True, True)) - # Domain variable dimensions - dimensions = (17, (True, False, True, False, False, False, False)) - # CFA instructsions - aggregated_dimensions = ( - 18, - (True, False, True, False, False, False, False), - ) - aggregated_data = (19, (False, True, False, True, False, False, False)) - # UGRID variables - # - # * node_coordinates has already been assigned under Geometry - # variables - # * IDs 20, 23, 29, 30, 31, 32, 35, 36, 37 are reserved for potential - # further UGRID usage - edge_coordinates = (21, (False, True, True, False, False, False, False)) - face_coordinates = (22, (False, True, True, False, False, False, False)) - edge_node_connectivity = ( - 24, - (False, True, True, False, False, False, False), - ) - face_node_connectivity = ( - 25, - (False, True, True, False, False, False, False), - ) - face_face_connectivity = ( - 26, - (False, True, True, False, False, False, False), - ) - edge_face_connectivity = ( - 27, - (False, True, True, False, False, False, False), - ) - face_edge_connectivity = ( - 28, - (False, True, True, False, False, False, False), - ) - edge_dimension = (33, (True, False, True, False, False, False, False)) - face_dimension = (34, (True, False, True, False, False, False, False)) - mesh = (38, (False, True, True, False, False, False, False)) - - def __init__(self, n, props): - """_AttributeProperties enum constructor. - - :Parameters: - - n: `int` - Enum id. - - props: `tuple` - A sequence containing the attribute's properties - (ref_to_dim, ref_to_var, resolve_key, resolve_value, - stop_at_local_apex, accept_standard_names, - limit_to_scalar_coordinates): - - 1. ref_to_dim: True or integer if contains references - to dimensions (highest int have - priority) - - 2. ref_to_var: True or integer if contains references - to variables (highest int have - priority) - - 3. resolve_key: True if 'keys' have to be resolved in - 'key1: value1 key2: value2 value3' or - 'key1 key2' - - 4. resolve_value: True if 'values' have to be resolved - in 'key1: value1 key2: value2 - value3' - - 5. stop_at_local_apex: True if upward research in the - hierarchy has to stop at local - apex - - 6. 
accept_standard_names: True if any standard name is - valid in place of references - (in which case no exception - is raised if a reference - cannot be resolved, and the - standard name is used in - place) - - 7. limit_to_scalar_coordinates: True if references to - variables are only - resolved if present as - well in the - 'coordinates' - attributes of the - variable, and they are - scalar. - - """ - self.id = n - self.ref_to_dim = props[0] - self.ref_to_var = props[1] - self.resolve_key = props[2] - self.resolve_value = props[3] - self.stop_at_local_apex = props[4] - self.accept_standard_names = props[5] - self.limit_to_scalar_coordinates = props[6] - - -class _Flattener: - """Information and methods needed to flatten a netCDF dataset. - - Contains the input file, the output file being flattened, and all - the logic of the flattening process. - - """ - - __max_name_len = 256 - __default_separator = "/" - __new_separator = "__" - __pathname_format = "{}/{}" - __mapping_str_format = "{}: {}" - __ref_not_found_error = "REF_NOT_FOUND" - __default_copy_slice_size = 134217728 # 128 MiB - - # name of the attributes used to store the mapping between original and flattened names - __attr_map_name = "__flattener_name_mapping_attributes" - __dim_map_name = "__flattener_name_mapping_dimensions" - __var_map_name = "__flattener_name_mapping_variables" - - # Mapping from numpy dtype endian format to what we expect - _dtype_endian_lookup = { - "=": "native", - ">": "big", - "<": "little", - "|": "native", - None: "native", - } - - def __init__(self, input_ds, lax_mode, _copy_data=True, copy_slices=None): - """**Initialisation** - - :param input_ds: input netcdf dataset - :param lax_mode: if false (default), not resolving a reference halts the execution. If true, continue with warning. - :param _copy_data: if true (default), then all data arrays are copied from the input to the output dataset - If false, then this does not happen. - Use this option *only* if the data arrays of the flattened dataset are never to be accessed. - :param copy_slices: dictionary containing variable_name: shape pairs, where variable_name is the path to the - variable name in the original Dataset (for instance /group1/group2/my_variable), and shape is either None - for using default slice value, or a custom slicing shape in the form of a tuple of the same dimension as the - variable (for instance (1000,2000,1500,) for a 3-dimensional variable). If a variable from the Dataset is - not contained in the dict, it will not be sliced and copied normally. 
-
- """
- self.__attr_map_value = []
- self.__dim_map_value = []
- self.__var_map_value = []
-
- self.__dim_map = {} # dict()
- self.__var_map = {} # dict()
-
- self.__lax_mode = lax_mode
-
- self.__copy_data = _copy_data
- self.__copy_slices = copy_slices
-
- self.__input_file = input_ds
- self.__output_file = None
-
- # if hasattr(input_ds, "_h5file"):
- # dataset_type = 'h5netcdf'
- # else:
- # dataset_type = 'netCDF4'
- #
- # for method in ('attrs', 'chunksizes', 'contiguous', 'endian',
- # 'filepath', 'get_dims', 'getncattr', 'group', 'name',
- # 'ncattrs', 'path'):
- # setattr(self, method, getattr(self, f"_{method}_{dataset_type}"))
-
- def attrs(self, variable):
- try:
- # h5netcdf
- return variable.attrs
- except AttributeError:
- # netCDF4
- return {
- attr: variable.getncattr(attr) for attr in variable.ncattrs()
- }
-
- def chunksizes(self, variable):
- """TODO."""
- try:
- # netCDF4
- chunking = variable.chunking()
- if chunking == "contiguous":
- return None
-
- return chunking
- except AttributeError:
- # h5netcdf
- return variable.chunks
-
- def contiguous(self, variable):
- """Whether or not the variable data is contiguous on disk.
-
- .. versionadded:: (cfdm) HDFVER
-
- :Parameters:
-
- variable: `netCDF4.Variable` or `h5netcdf.Variable`
- The variable.
-
- :Returns:
-
- `bool`
- `True` if the variable data is contiguous on disk,
- otherwise `False`.
-
- **Examples**
-
- >>> f.contiguous(variable)
- False
-
- """
- try:
- # netCDF4
- return variable.chunking() == "contiguous"
- except AttributeError:
- # h5netcdf
- return variable.chunks is None
-
- def dtype(self, variable):
- """Return the data type of a variable.
-
- .. versionadded:: (cfdm) HDFVER
-
- :Parameters:
-
- variable:
- The dataset variable.
-
- :Returns:
-
- `numpy.dtype` or `str`
- The data type.
-
- **Examples**
-
- >>> f.dtype(variable)
- dtype('<f8')
-
- >>> f.dtype(variable)
- str
-
- """
- out = variable.dtype
- if out == "O":
- out = str
-
- return out
-
- def endian(self, variable):
- """Return the endian-ness of a variable.
-
- .. versionadded:: (cfdm) HDFVER
-
- :Parameters:
-
- variable: `netCDF4.Variable` or `h5netcdf.Variable`
- The variable.
-
- :Returns:
-
- `str`
- The endian-ness (``'little'``, ``'big'``, or
- ``'native'``) of the variable.
-
- **Examples**
-
- >>> f.endian(variable)
- 'native'
-
- """
- try:
- # netCDF4
- return variable.endian()
- except AttributeError:
- # h5netcdf
- dtype = variable.dtype
- return self._dtype_endian_lookup[getattr(dtype, "byteorder", None)]
-
- def filepath(self, dataset):
- """Return the file path for the dataset.
-
- .. versionadded:: (cfdm) HDFVER
-
- :Parameters:
-
- dataset: `netCDF4.Dataset` or `h5netcdf.File`
- The dataset.
-
- :Returns:
-
- `str`
- The file system path, or the opendap URL, for the
- dataset.
-
- **Examples**
-
- >>> f.filepath(dataset)
- '/home/data/file.nc'
-
- """
- try:
- # netCDF4
- return dataset.filepath()
- except AttributeError:
- # h5netcdf
- return dataset.filename
-
- def get_dims(self, variable):
- """Return.
-
- ..
versionadded:: (cfdm) HDFVER - - :Returns: - - `list` - - """ - try: - # netCDF4 - return variable.get_dims() - except AttributeError: - # h5netcdf - dims = {} - dimension_names = list(variable.dimensions) - group = variable._parent - for name, dim in group.dims.items(): - if name in dimension_names: - dims[name] = dim - dimension_names.remove(name) - - group = group.parent - while group is not None and dimension_names: - for name, dim in group.dims.items(): - if name in dimension_names: - dims[name] = dim - dimension_names.remove(name) - - group = group.parent - - return [dims[name] for name in variable.dimensions] - - def getncattr(self, x, attr): - """Retrieve a netCDF attribute. - - .. versionadded:: (cfdm) HDFVER - - :Parameters: - - x: variable, group, or dataset - - attr: `str` - - :Returns: - - """ - try: - # netCDF4 - return getattr(x, attr) - except AttributeError: - # h5netcdf - return x.attrs[attr] - - def group(self, x): - """Return a. - - .. versionadded:: (cfdm) HDFVER - - :Returns: - - `Group` - - """ - try: - # netCDF4 - return x.group() - except AttributeError: - # h5netcdf - return x._parent - - def name(self, x): - """Return the netCDF name, without its groups. - - .. versionadded:: (cfdm) HDFVER - - :Returns: - - `str` - - """ - out = x.name - if "/" in out: - # h5netcdf - out = x.name.split("/")[-1] - - return out - - def ncattrs(self, x): - """Return netCDF attribute names. - - .. versionadded:: (cfdm) HDFVER - - :Parameters: - - x: variable, group, or dataset - - :Returns: - - `list` - - """ - try: - # netCDF4 - return x.ncattrs() - except AttributeError: - # h5netcdf - return list(x.attrs) - - def parent(self, group): - """Return a simulated unix directory path to a group. - - .. versionadded:: (cfdm) HDFVER - - :Returns: - - `str` - - """ - try: - return group.parent - except AttributeError: - return - - def path(self, group): - """Return a simulated unix directory path to a group. - - .. versionadded:: (cfdm) HDFVER - - :Returns: - - `str` - - """ - try: - # netCDF4 - return group.path - except AttributeError: - # h5netcdf - try: - return group.name - except AttributeError: - return "/" - - def flatten(self, output_ds): - """Flattens and write to output file. - - :param output_ds: The dataset in which to store the flattened result. - - """ - # or output_ds.filepath() == self.__input_file.filepath() \ - # or output_ds.data_model != 'NETCDF4': - if ( - output_ds == self.__input_file - or output_ds.filepath() == self.filepath(self.__input_file) - or output_ds.data_model != "NETCDF4" - ): - raise ValueError( - "Invalid inputs. Input and output datasets should be different, and output should be of " - "the 'NETCDF4' format." - ) - - self.__output_file = output_ds - - # Flatten product - self.process_group(self.__input_file) - - # Add name mapping attributes - self.__output_file.setncattr( - self.__attr_map_name, self.__attr_map_value - ) - self.__output_file.setncattr(self.__dim_map_name, self.__dim_map_value) - self.__output_file.setncattr(self.__var_map_name, self.__var_map_value) - - # Browse flattened variables to rename references: - logging.info( - "Browsing flattened variables to rename references in attributes:" - ) - for var in self.__output_file.variables.values(): - self.adapt_references(var) - - def process_group(self, input_group): - """Flattens a given group to the output file. 
- - :param input_group: group to flatten - - """ - # logging.info("Browsing group " + input_group.path) - logging.info("Browsing group " + self.path(input_group)) - # for attr_name in input_group.ncattrs(): - for attr_name in self.ncattrs(input_group): - self.flatten_attribute(input_group, attr_name) - - for dim in input_group.dimensions.values(): - self.flatten_dimension(dim) - - for var in input_group.variables.values(): - self.flatten_variable(var) - - for child_group in input_group.groups.values(): - self.process_group(child_group) - - def flatten_attribute(self, input_group, attr_name): - """Flattens a given attribute from a group to the output file. - - :param input_group: group containing the attribute to flatten - :param attr_name: name of the attribute to flatten - - """ - # logging.info(" Copying attribute {} from group {} to root".format(attr_name, input_group.path)) - logging.info( - " Copying attribute {} from group {} to root".format( - attr_name, self.path(input_group) - ) - ) - - # Create new name - new_attr_name = self.generate_flattened_name(input_group, attr_name) - - # Write attribute - # self.__output_file.setncattr(new_attr_name, input_group.getncattr(attr_name)) - self.__output_file.setncattr( - new_attr_name, self.getncattr(input_group, attr_name) - ) - - # Store new naming for later and in mapping attribute - self.__attr_map_value.append( - self.generate_mapping_str(input_group, attr_name, new_attr_name) - ) - - def flatten_dimension(self, dim): - """Flattens a given dimension to the output file. - - :param dim: dimension to flatten - - """ - # logging.info(" Copying dimension {} from group {} to root".format(dim.name, dim.group().path)) - logging.info( - " Copying dimension {} from group {} to root".format( - self.name(dim), self.path(self.group(dim)) - ) - ) - - # Create new name - # new_name = self.generate_flattened_name(dim.group(), dim.name) - new_name = self.generate_flattened_name( - self.group(dim), self.name(dim) - ) - - # Write dimension - self.__output_file.createDimension( - new_name, (len(dim), None)[dim.isunlimited()] - ) - - # Store new name in dict for resolving references later - # self.__dim_map[self.pathname(dim.group(), dim.name)] = new_name - self.__dim_map[ - self.pathname(self.group(dim), self.name(dim)) - ] = new_name - - # Add to name mapping attribute - # self.__dim_map_value.append(self.generate_mapping_str(dim.group(), dim.name, new_name)) - self.__dim_map_value.append( - self.generate_mapping_str( - self.group(dim), self.name(dim), new_name - ) - ) - - def flatten_variable(self, var): - """Flattens a given variable to the output file. 
- - :param var: variable to flatten - - """ - # logging.info(" Copying variable {} from group {} to root".format(var.name, var.group().path)) - logging.info( - " Copying variable {} from group {} to root".format( - self.name(var), self.path(self.group(var)) - ) - ) - - # Create new name - # new_name = self.generate_flattened_name(var.group(), self.name(var)) - new_name = self.generate_flattened_name( - self.group(var), self.name(var) - ) - - # Replace old by new dimension names - # new_dims = list(map(lambda x: self.__dim_map[self.pathname(x.group(), x.name)], var.get_dims())) - new_dims = list( - map( - lambda x: self.__dim_map[ - self.pathname(self.group(x), self.name(x)) - ], - self.get_dims(var), - ) - ) - - # Write variable - # fullname = self.pathname(var.group(), self.name(var)) - fullname = self.pathname(self.group(var), self.name(var)) - logging.info("create variable {} from {}".format(new_name, fullname)) - - new_var = self.__output_file.createVariable( - new_name, - self.dtype(var), - new_dims, - zlib=False, - complevel=4, - shuffle=True, - fletcher32=False, - contiguous=self.contiguous(var), - chunksizes=self.chunksizes(var), - endian=self.endian(var), - least_significant_digit=None, - fill_value=None, - ) - - if self.__copy_data: - # Find out slice method for variable and copy data - if ( - self.__copy_slices is None - or fullname not in self.__copy_slices - ): - # Copy data as a whole - new_var[:] = var[:] - elif self.__copy_slices[fullname] is None: - # Copy with default slice size - copy_slice = tuple( - self.__default_copy_slice_size // len(var.shape) - for _ in range(len(var.shape)) - ) - self.copy_var_by_slices(new_var, var, copy_slice) - else: - # Copy in slices - copy_slice = self.__copy_slices[fullname] - self.copy_var_by_slices(new_var, var, copy_slice) - - # Copy attributes - # new_var.setncatts(var.__dict__) - new_var.setncatts(self.attrs(var)) - - # Store new name in dict for resolving references later - # self.__var_map[self.pathname(var.group(), var.name)] = new_name - self.__var_map[ - self.pathname(self.group(var), self.name(var)) - ] = new_name - - # Add to name mapping attribute - # self.__var_map_value.append(self.generate_mapping_str(var.group(), var.name, new_name)) - self.__var_map_value.append( - self.generate_mapping_str( - self.group(var), self.name(var), new_name - ) - ) - - # Resolve references in variable attributes and replace by absolute path: - self.resolve_references(new_var, var) - - def increment_pos(self, pos, dim, copy_slice_shape, var_shape): - """TODOHDF. - - Increment position vector in a variable along a dimension by - the matching slice length along than dimension. If end of the - dimension is reached, recursively increment the next - dimensions until a valid position is found. - - :param pos: current position - :param dim: dimension to be incremented - :param copy_slice_shape: shape of the slice - :param var_shape: shape of the variable - :return True if a valid position is found within the variable, False otherwise - - """ - # Try to increment dimension - pos[dim] += copy_slice_shape[dim] - - # Test new position - dim_end_reached = pos[dim] > var_shape[dim] - var_end_reached = (dim + 1) >= len(copy_slice_shape) - - # End of this dimension not reached yet - if not dim_end_reached: - return True - # End of this dimension reached. 
Reset to 0 and try increment next one recursively - elif dim_end_reached and not var_end_reached: - pos[: dim + 1] = [0 for j in range(dim + 1)] - return self.increment_pos( - pos, dim + 1, copy_slice_shape, var_shape - ) - # End of this dimension reached, and no dimension to increment. Finish. - else: - return False - - def copy_var_by_slices(self, new_var, old_var, copy_slice_shape): - """Copy the data of a variable to a new one by slice. - - :param new_var: new variable where to copy data - :param old_var: variable where data should be copied from - :param copy_slice_shape: shape of the slice - - """ - # logging.info(" copying data of {} in {} slices".format(old_var.name, copy_slice_shape)) - logging.info( - " copying data of {} in {} slices".format( - self.name(old_var), copy_slice_shape - ) - ) - - # Initial position vector - pos = [0 for _ in range(len(copy_slice_shape))] - - # Copy in slices until end reached - var_end_reached = False - while not var_end_reached: - # Create current slice - current_slice = tuple( - slice( - pos[dim_i], min(old_var.shape[dim_i], pos[dim_i] + dim_l) - ) - for dim_i, dim_l in enumerate(copy_slice_shape) - ) - - # Copy data in slice - new_var[current_slice] = old_var[current_slice] - - # Get next position - var_end_reached = not self.increment_pos( - pos, 0, copy_slice_shape, old_var.shape - ) - - def resolve_reference(self, orig_ref, orig_var, attr): - """Resolve a refrence. - - Resolves the absolute path to a coordinate variable within the - group structure. - - :param orig_ref: reference to resolve - :param orig_var: variable originally containing the reference - :param attr: _AttributeProperties object enum item to know if ref to dim or var - :return: absolute path to the reference - - """ - ref = orig_ref - absolute_ref = None - ref_type = "" - - # Resolve first as dim (True), or var (False) - resolve_dim_or_var = attr.ref_to_dim > attr.ref_to_var - - # Resolve var (resp. dim) if resolving as dim (resp. 
var) failed - resolve_alt = attr.ref_to_dim and attr.ref_to_var - - # Reference is already given by absolute path - if ref.startswith(self.__default_separator): - method = "absolute" - absolute_ref = ref - - # Reference is given by relative path - elif self.__default_separator in ref: - method = " relative" - - # First tentative as dim OR var - # ref_type = "dimension" if resolve_dim_or_var else "variable" - if resolve_dim_or_var: - ref_type = "dimension" - else: - ref_type = "variable" - # absolute_ref = self.search_by_relative_path(orig_ref, orig_var.group(), resolve_dim_or_var) - absolute_ref = self.search_by_relative_path( - orig_ref, self.group(orig_var), resolve_dim_or_var - ) - - # If failed and alternative possible, second tentative - if absolute_ref is None and resolve_alt: - # ref_type = ( - # "dimension" if not resolve_dim_or_var else "variable" - # ) - if resolve_dim_or_var: - ref_type = "variable" - else: - ref_type = "dimension" - # absolute_ref = self.search_by_relative_path(orig_ref, orig_var.group(), not resolve_dim_or_var) - absolute_ref = self.search_by_relative_path( - orig_ref, self.groupp(orig_var), not resolve_dim_or_var - ) - - # Reference is to be searched by proximity - else: - method = " proximity" - absolute_ref, ref_type = self.resolve_reference_proximity( - ref, resolve_dim_or_var, resolve_alt, orig_var, attr - ) - - # Post-search checks and return result - return self.resolve_reference_post_processing( - absolute_ref, orig_ref, orig_var, attr, ref_type, method - ) - - def resolve_reference_proximity( - self, ref, resolve_dim_or_var, resolve_alt, orig_var, attr - ): - """Resolve reference: search by proximity.""" - # First tentative as dim OR var - # ref_type = "dimension" if resolve_dim_or_var else "variable" - if resolve_dim_or_var: - ref_type = "dimension" - else: - ref_type = "variable" - # resolved_var = self.search_by_proximity(ref, orig_var.group(), resolve_dim_or_var, False, - # attr.stop_at_local_apex) - resolved_var = self.search_by_proximity( - ref, - self.group(orig_var), - resolve_dim_or_var, - False, - attr.stop_at_local_apex, - ) - - # If failed and alternative possible, second tentative - if resolved_var is None and resolve_alt: - # ref_type = "dimension" if not resolve_dim_or_var else "variable" - if resolve_dim_or_var: - ref_type = "variable" - else: - ref_type = "dimension" - # resolved_var = self.search_by_proximity(ref, orig_var.group(), not resolve_dim_or_var, False, - # attr.stop_at_local_apex) - resolved_var = self.search_by_proximity( - ref, - self.group(orig_var), - not resolve_dim_or_var, - False, - attr.stop_at_local_apex, - ) - - # If found, create ref string - if resolved_var is not None: - # return self.pathname(resolved_var.group(), resolved_var.name), ref_type - return ( - self.pathname( - self.group(resolved_var), self.name(resolved_var) - ), - ref_type, - ) - else: - return None, "" - - def resolve_reference_post_processing( - self, absolute_ref, orig_ref, orig_var, attr, ref_type, method - ): - """Post-processing operations after resolving reference.""" - # If not found and accept standard name, assume standard name - if absolute_ref is None and attr.accept_standard_names: - logging.info( - " coordinate reference to '{}' not resolved. 
Assumed to be a standard name.".format( - orig_ref - ) - ) - ref_type = "standard_name" - absolute_ref = orig_ref - # Else if not found, raise exception - elif absolute_ref is None: - # absolute_ref = self.handle_reference_error(orig_ref, orig_var.group().path) - absolute_ref = self.handle_reference_error( - orig_ref, self.path(self.group(orig_var)) - ) - # If found: - else: - logging.info( - " {} coordinate reference to {} '{}' resolved as '{}'".format( - method, ref_type, orig_ref, absolute_ref - ) - ) - - # If variables refs are limited to coordinate variable, additional check - # and (("coordinates" not in orig_var.ncattrs() or orig_ref not in orig_var.coordinates) - if ( - ref_type == "variable" - and attr.limit_to_scalar_coordinates - and ( - ( - "coordinates" not in self.ncattrs(orig_var) - or orig_ref not in self.getncattr(orig_var, "coordinates") - ) - or self._Flattener__input_file[absolute_ref].ndim > 0 - ) - ): - logging.info( - " coordinate reference to '{}' is not a SCALAR COORDINATE variable. " - "Assumed to be a standard name.".format(orig_ref) - ) - absolute_ref = orig_ref - - # Return result - return absolute_ref - - def search_by_relative_path(self, ref, current_group, search_dim): - """Search by relative path. - - Resolves the absolute path to a reference within the group - structure, using search by relative path. - - :param ref: reference to resolve - :param current_group: current group where searching - :param search_dim: if true, search references to dimensions, if false, search references to variables - :return: absolute path to the coordinate - - """ - # Go up parent groups - while ref.startswith("../"): - if current_group.parent is None: - return None - - ref = ref[3:] - current_group = current_group.parent - - # Go down child groups - ref_split = ref.split(self.__default_separator) - for g in ref_split[:-1]: - try: - current_group = current_group.groups[g] - except KeyError: - return None - - # Get variable or dimension - # elt = ( - # current_group.dimensions[ref_split[-1]] - # if search_dim - # else current_group.variables[ref_split[-1]] - # ) - if search_dim: - elt = current_group.dimensions[ref_split[-1]] - else: - elt = current_group.variables[ref_split[-1]] - - # Get absolute reference - # return self.pathname(elt.group(), elt.name) - return self.pathname(self.group(elt), self.name(elt)) - - def search_by_proximity( - self, - ref, - current_group, - search_dim, - local_apex_reached, - is_coordinate_variable, - ): - """Search by proximity. - - Resolves the absolute path to a reference within the group - structure, using search by proximity. - - First search up in the hierarchy for the reference, until root - group is reached. If coordinate variable, search until local - apex is reached, Then search down in siblings. - - :param ref: reference to resolve - :param current_group: current group where searching - :param search_dim: if true, search references to dimensions, if false, search references to variables - :param local_apex_reached: False initially, until apex is reached. 
- :param is_coordinate_variable: true, if looking for a coordinate variable - :return: absolute path to the coordinate - - """ - # dims_or_vars = ( - # current_group.dimensions if search_dim else current_group.variables - # ) - if search_dim: - dims_or_vars = current_group.dimensions - else: - dims_or_vars = current_group.variables # DCH - - # Found in current group - if ref in dims_or_vars.keys(): - return dims_or_vars[ref] - - local_apex_reached = ( - local_apex_reached or ref in current_group.dimensions.keys() - ) - - # Check if has to continue looking in parent group - # - normal search: continue until root is reached - # - coordinate variable: continue until local apex is reached - if is_coordinate_variable: - top_reached = local_apex_reached or current_group.parent is None - else: - top_reached = current_group.parent is None - - # Search up - if not top_reached: - return self.search_by_proximity( - ref, - current_group.parent, - search_dim, - local_apex_reached, - is_coordinate_variable, - ) - - elif is_coordinate_variable and local_apex_reached: - # Coordinate variable and local apex reached, so search - # down in siblings - found_elt = None - for child_group in current_group.groups.values(): - found_elt = self.search_by_proximity( - ref, - child_group, - search_dim, - local_apex_reached, - is_coordinate_variable, - ) - if found_elt is not None: - break - - return found_elt - - # If here, did not find - else: - return None - - def __escape_index_error(self, match, group_name): - """TODOHDF. - - :param match: regex match - :param group_name: group name - - :Returns: - - `str` - The group in a match if it exists, an empty string - otherwise. - - """ - try: - return match.group(group_name) - except IndexError: - return "" - - def resolve_references(self, var, old_var): - """Resolve references. - - In a given variable, replace all references to other variables - in its attributes by absolute references. - - :param var: flattened variable in which references should be renamed with absolute references - :param old_var: original variable (in group structure) - - """ - var_attrs = self.attrs(var) - var_attrs_names = tuple(var_attrs) - for attr in _AttributeProperties: - # if attr.name in var.__dict__: - # if attr.name in self.ncattrs(var): - if attr.name in var_attrs_names: # self.ncattrs(var): - # attr_value = var.getncattr(attr.name) - attr_value = var_attrs[ - attr.name - ] # self.getncattr(var, attr.name) - # Parse attribute value - parsed_attr = parse_var_attr(attr_value) - - # Resolved references in parsed as required by attribute properties - resolved_parsed_attr = {} # collections.OrderedDict() - - for k, v in parsed_attr.items(): - # new_k = ( - # self.resolve_reference(k, old_var, attr) - # if attr.resolve_key - # else k - # ) - if attr.resolve_key: - k = self.resolve_reference(k, old_var, attr) - - # new_v = ( - # [ - # self.resolve_reference(x, old_var, attr) - # for x in parsed_attr[k] - # ] - # if attr.resolve_value and parsed_attr[k] is not None - # else parsed_attr[k] - # ) - if attr.resolve_value and v is not None: - v = [ - self.resolve_reference(x, old_var, attr) for x in v - ] - - resolved_parsed_attr[k] = v - - # Re-generate attribute value string with resolved references - var.setncattr( - attr.name, generate_var_attr_str(resolved_parsed_attr) - ) - - def adapt_references(self, var): - """Adapt references. - - In a given variable, replace all references to variables in - attributes by references to the new names in the flattened - netCDF. 
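# A compact sketch of the upward proximity search implemented above, with
# nested dicts standing in for group objects; 'search_up' is an illustrative
# name, not cfdm API.
root = {"parent": None, "variables": {"time": "time-var"}}
child = {"parent": root, "variables": {"lat": "lat-var"}}

def search_up(ref, group):
    # Look in the current group first, then recurse towards the root
    if ref in group["variables"]:
        return group["variables"][ref]
    if group["parent"] is None:
        return None
    return search_up(ref, group["parent"])

assert search_up("lat", child) == "lat-var"    # found locally
assert search_up("time", child) == "time-var"  # found in the parent
assert search_up("lon", child) is None         # not found anywhere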
All references have to be already resolved as absolute - references. - - :param var: flattened variable in which references should be renamed with new names - - """ - var_attrs = self.attrs(var) - var_attrs_names = tuple(var_attrs) - for attr in _AttributeProperties: - # if attr.name in var.__dict__: - if attr.name in var_attrs_names: # self.ncattrs(var): - # attr_value = var.getncattr(attr.name) - attr_value = var_attrs[ - attr.name - ] # self.getncattr(var, attr.name) - # Parse attribute value - parsed_attr = parse_var_attr(attr_value) - - adapted_parsed_attr = {} # collections.OrderedDict() - - for k, v in parsed_attr.items(): - # new_k = self.adapt_name(k, attr) if attr.resolve_key else k - if attr.resolve_key: - k = self.adapt_name(k, attr) - - # new_v = ( - # [self.adapt_name(x, attr) for x in parsed_attr[k]] - # if attr.resolve_value and parsed_attr[k] is not None - # else parsed_attr[k] - # ) - if attr.resolve_value and v is not None: - v = [self.adapt_name(x, attr) for x in v] - - adapted_parsed_attr[k] = v - - new_attr_value = generate_var_attr_str(adapted_parsed_attr) - var.setncattr(attr.name, new_attr_value) - - logging.info( - " attribute '{}' in '{}': references '{}' renamed as '{}'".format( - attr.name, self.name(var), attr_value, new_attr_value - ) - ) - - # .format(attr.name, var.name, attr_value, new_attr_value)) - - def adapt_name(self, resolved_ref, attr): - """Apapt the name. - - Return name of flattened reference. If not found, raise - exception or continue warning. - - :param resolved_ref: resolved reference to adapt - :param attr: _AttributeProperties object enum item to know in which dict to look for name mapping - :return: adapted reference - - """ - # If ref contains Error message, leave as such - if self.__ref_not_found_error in resolved_ref: - return resolved_ref - - # Select highest priority map - if attr.ref_to_dim > attr.ref_to_var: - name_mapping = self.__dim_map - if attr.ref_to_dim < attr.ref_to_var: - name_mapping = self.__var_map - - # Try to find mapping - try: - return name_mapping[resolved_ref] - - # If not found, look in other map if allowed - except KeyError: - if attr.ref_to_dim and attr.ref_to_var: - name_mapping = ( - self.__dim_map - if attr.ref_to_dim < attr.ref_to_var - else self.__var_map - ) - try: - return name_mapping[resolved_ref] - except KeyError: - pass - - # If still not found, check if any standard name is allowed - if attr.accept_standard_names: - return resolved_ref - # If not, raise exception - else: - return self.handle_reference_error(resolved_ref) - - def pathname(self, group, name): - """Compose full path name to an element in a group structure: - - /path/to/group/elt. - - :param group: group containing element - :param name: name of the element - :return: pathname - - """ - # if group.parent is None: - if self.parent(group) is None: - return self.__default_separator + name - else: - # return self.__pathname_format.format(group.path, name) - return self.__pathname_format.format(self.path(group), name) - - def generate_mapping_str(self, input_group, name, new_name): - """Generate string mapping. - - Generates a string representing the name mapping of an element - before and after flattening. 
- - :param input_group: group containing the non-flattened element - :param name: name of the non-flattened element - :param new_name: name of the flattened element - :return: string representing the name mapping for the element - - """ - original_pathname = self.pathname(input_group, name) - mapping_str = self.__mapping_str_format.format( - new_name, original_pathname - ) - return mapping_str - - def convert_path_to_valid_name(self, pathname): - """Generate valid name from path. - - :param pathname: pathname - :return: valid NetCDF name - - """ - return pathname.replace(self.__default_separator, "", 1).replace( - self.__default_separator, self.__new_separator - ) - - def generate_flattened_name(self, input_group, orig_name): - """Convert full path of an element to a valid NetCDF name: - - - the name of an element is the concatenation of its containing group and its name, - - replaces / from paths (forbidden as NetCDF name), - - if name is longer than 255 characters, replace path to group by hash, - - if name is still too long, replace complete name by hash. - - :param input_group: group containing element - :param orig_name: original name of the element - :return: new valid name of the element - - """ - # If element is at root: no change - # if input_group.parent is None: - if self.parent(input_group) is None: - new_name = orig_name - - # If element in child group, concatenate group path and element name - else: - # full_name = self.convert_path_to_valid_name(input_group.path) + self.__new_separator + orig_name - full_name = ( - self.convert_path_to_valid_name(self.path(input_group)) - + self.__new_separator - + orig_name - ) - new_name = full_name - - # If resulting name is too long, hash group path - if len(new_name) >= self.__max_name_len: - # group_hash = hashlib.sha1(input_group.path.encode("UTF-8")).hexdigest() - group_hash = hashlib.sha1( - self.path(input_group).encode("UTF-8") - ).hexdigest() - new_name = group_hash + self.__new_separator + orig_name - - # If resulting name still too long, hash everything - if len(new_name) >= self.__max_name_len: - new_name = hashlib.sha1( - full_name.encode("UTF-8") - ).hexdigest() - return new_name - - def handle_reference_error(self, ref, context=None): - """Handle reference error. - - Depending on lax/strict mode, either raise exception or log - warning. If lax, return reference placeholder. - - :param ref: reference - :param context: additional context info to add to message - :return: if continue with warning, error replacement name for reference - - """ - message = "Reference '{}' could not be resolved".format(ref) - if context is not None: - message = message + " from {}".format(context) - if self.__lax_mode: - warnings.warn(message) - return self.__ref_not_found_error + "_" + ref - else: - raise ReferenceException(message) - - -class ReferenceException(Exception): - """Exception for unresolvable references in attributes.""" - - pass diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index e3c66cdc5..b6a2aaca6 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -21,8 +21,9 @@ from s3fs import S3FileSystem from ...decorators import _manage_log_level_via_verbosity -from ...flatten import _Flattener +#from ...flatten import _Flattener from ...flatten import flatten as netcdf_flatten +from ...flatten.config import _flattener_separator from ...functions import is_log_level_debug, is_log_level_detail from .. 
import IORead @@ -31,7 +32,7 @@ _cached_temporary_files = {} # _flattener_separator = netcdf_flattener._Flattener._Flattener__new_separator -_flattener_separator = _Flattener._Flattener__new_separator +#_flattener_separator = _Flattener._Flattener__new_separator @dataclass() From be747753e7f568a30a40aeb9c80e62fbace74392 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 31 Jan 2024 17:48:08 +0000 Subject: [PATCH 22/88] dev --- cfdm/read_write/netcdf/flatten/__init__.py | 1 + cfdm/read_write/netcdf/flatten/config.py | 178 +++ cfdm/read_write/netcdf/flatten/flatten.py | 1290 ++++++++++++++++++++ cfdm/read_write/netcdf/netcdfread.py | 327 +++-- 4 files changed, 1717 insertions(+), 79 deletions(-) create mode 100644 cfdm/read_write/netcdf/flatten/__init__.py create mode 100644 cfdm/read_write/netcdf/flatten/config.py create mode 100644 cfdm/read_write/netcdf/flatten/flatten.py diff --git a/cfdm/read_write/netcdf/flatten/__init__.py b/cfdm/read_write/netcdf/flatten/__init__.py new file mode 100644 index 000000000..d09591bec --- /dev/null +++ b/cfdm/read_write/netcdf/flatten/__init__.py @@ -0,0 +1 @@ +from .flatten import netcdf_flatten diff --git a/cfdm/read_write/netcdf/flatten/config.py b/cfdm/read_write/netcdf/flatten/config.py new file mode 100644 index 000000000..0a76e6f12 --- /dev/null +++ b/cfdm/read_write/netcdf/flatten/config.py @@ -0,0 +1,178 @@ +"""Configuration for netCDF group flattening. + +.. versionadded:: (cfdm) HDFVER + +""" +from dataclasses import dataclass + +# Maximum length of name after which it is replaced with its hash +max_name_len = 256 + +# Separator for groups in the input dataset +group_separator = "/" + +# Replacment for 'group_separator' in flattened names +flattener_separator = "__" + +# Name prefix when reference can't be resolved. Only used if +# 'lax_mode=True' in `flatten`. +ref_not_found_error = "REF_NOT_FOUND" + +# Default size, in bytes, of slice to use when copying data arrays +default_copy_slice_size = 134217728 + +# NetCDF global attribute containing the mapping of flattened +# attribute names to grouped attribute names +flattener_attribute_map = "__flattener_attribute_map" + +# NetCDF global attribute containing the mapping of flattened +# dimension names to grouped attribute names +flattener_dimension_map = "__flattener_dimension_map" + +# NetCDF global attribute containing the mapping of flattened +# variable names to grouped attribute names +flattener_variable_map = "__flattener_variable_map" + + +@dataclass() +class AttributeFeatures: + """Data class that defines attribute flattening features. + + For a named netCDF attribute, the features a define how the + contents of the attribute are flattened. + + .. 
versionadded:: (cfdm) HDFVER + + """ + + # name: The attribute name + name: str + # ref_to_dim: Positive integer if contains references to + # dimensions (highest int have priority) + ref_to_dim: int = 0 + # ref_to_var: Positive integer if contains references to variables + # (highest int have priority) + ref_to_var: int = 0 + # resolve_key: True if 'keys' have to be resolved in 'key1: value1 + # key2: value2 value3' or 'key1 key2' + resolve_key: bool = False + # resolve_value: True if 'values' have to be resolved in 'key1: + # value1 key2: value2 value3' + resolve_value: bool = False + # stop_at_local_apex: True if upward research in the hierarchy has + # to stop at local apex + stop_at_local_apex: bool = False + # accept_standard_names: True if any standard name is valid in + # place of references (in which case no + # exception is raised if a reference cannot + # be resolved, and the standard name is + # used in place) + accept_standard_names: bool = False + # limit_to_scalar_coordinates: True if references to variables are + # only resolved if present as well in + # the 'coordinates' attributes of the + # variable, and they are scalar. + limit_to_scalar_coordinates: bool = False + + +# -------------------------------------------------------------------- +# Set flattening features for named CF attributes +# -------------------------------------------------------------------- +attribute_features = { + attr.name: attr + for attr in ( + # Coordinates + AttributeFeatures( + name="coordinates", + ref_to_var=1, + resolve_key=True, + stop_at_local_apex=True, + ), + AttributeFeatures( + name="ancillary_variables", ref_to_var=1, resolve_key=True + ), + AttributeFeatures(name="climatology", ref_to_var=1, resolve_key=True), + # Cell measures + AttributeFeatures( + name="cell_measures", ref_to_var=1, resolve_value=True + ), + # Coordinate references + AttributeFeatures( + name="formula_terms", ref_to_var=1, resolve_value=True + ), + AttributeFeatures( + name="grid_mapping", + ref_to_var=1, + resolve_key=True, + resolve_value=True, + ), + AttributeFeatures(name="geometry", ref_to_var=1, resolve_key=True), + AttributeFeatures( + name="interior_ring", ref_to_var=1, resolve_key=True + ), + AttributeFeatures( + name="node_coordinates", ref_to_var=1, resolve_key=True + ), + AttributeFeatures(name="node_count", ref_to_var=1, resolve_key=True), + AttributeFeatures(name="nodes", ref_to_var=1, resolve_key=True), + AttributeFeatures( + name="part_node_count", ref_to_var=1, resolve_key=True + ), + # Compression by gathering + AttributeFeatures(name="compress", ref_to_dim=1, resolve_key=True), + # Discrete sampling geometries + AttributeFeatures( + name="instance_dimension", ref_to_dim=1, resolve_key=True + ), + AttributeFeatures( + name="sample_dimension", ref_to_dim=1, resolve_key=True + ), + # Cell methods + AttributeFeatures( + name="cell_methods", + ref_to_dim=2, + ref_to_var=1, + resolve_key=True, + accept_standard_names=True, + limit_to_scalar_coordinates=True, + ), + # Domain variables + AttributeFeatures(name="dimensions", ref_to_dim=1, resolve_key=True), + # Aggregation variables + AttributeFeatures( + name="aggregated_dimensions", ref_to_dim=1, resolve_key=True + ), + AttributeFeatures( + name="aggregated_data", ref_to_var=1, resolve_value=True + ), + # UGRID variables + AttributeFeatures(name="mesh", ref_to_var=1, resolve_key=True), + AttributeFeatures( + name="edge_coordinates", ref_to_var=1, resolve_key=True + ), + AttributeFeatures( + name="face_coordinates", ref_to_var=1, resolve_key=True + ), + 
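# A sketch of how a feature table like the one above is meant to be
# consumed: the entry for an attribute name decides whether the keys and/or
# values of its parsed contents are treated as references. 'MiniFeatures'
# mirrors a subset of AttributeFeatures for illustration only.
from dataclasses import dataclass

@dataclass
class MiniFeatures:
    name: str
    ref_to_dim: int = 0
    ref_to_var: int = 0
    resolve_key: bool = False
    resolve_value: bool = False

features = {
    "coordinates": MiniFeatures("coordinates", ref_to_var=1, resolve_key=True),
    "cell_measures": MiniFeatures("cell_measures", ref_to_var=1, resolve_value=True),
}

# In 'cell_measures: area: areacella' only the value 'areacella' names a
# variable, so the keys are left alone and the values are resolved:
attr = features["cell_measures"]
assert (attr.resolve_key, attr.resolve_value) == (False, True)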
AttributeFeatures( + name="edge_node_connectivity", ref_to_var=1, resolve_key=True + ), + AttributeFeatures( + name="face_node_connectivity", ref_to_var=1, resolve_key=True + ), + AttributeFeatures( + name="face_face_connectivity", ref_to_var=1, resolve_key=True + ), + AttributeFeatures( + name="edge_face_connectivity", ref_to_var=1, resolve_key=True + ), + AttributeFeatures( + name="face_edge_connectivity", ref_to_var=1, resolve_key=True + ), + AttributeFeatures( + name="edge_dimension", ref_to_dim=1, resolve_key=True + ), + AttributeFeatures( + name="face_dimension", ref_to_dim=1, resolve_key=True + ), + ) +} diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py new file mode 100644 index 000000000..d747d233a --- /dev/null +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -0,0 +1,1290 @@ +"""Netcdf flattener. + +This code has been adapted from the code found in the `netcdf_flattener` +package, which is licensed with Apache License 2.0 +(http://www.apache.org/licenses/LICENSE-2.0) + +""" + +import hashlib +import logging +import re +import warnings + +from .config import ( + attribute_features, + default_copy_slice_size, + flattener_attribute_map, + flattener_dimension_map, + flattener_separator, + flattener_variable_map, + group_separator, + max_name_len, + ref_not_found_error, +) + +# Mapping from numpy dtype endian format to what we expect +_dtype_endian_lookup = { + "=": "native", + ">": "big", + "<": "little", + "|": "native", + None: "native", +} + + +def netcdf_flatten( + input_ds, output_ds, lax_mode=False, _copy_data=True, copy_slices=None +): + """Create a flattened version of a netCDF dataset. + + For variable that are too big to fit in memory, the optional + "copy_slices" input allows to copy some or all of the variables in + slices. + + :param input_ds: input netcdf4 dataset + :param output_ds: output netcdf4 dataset + :param lax_mode: if false (default), not resolving a reference halts the execution. If true, continue with warning. + :param _copy_data: if true (default), then all data arrays are copied from the input to the output dataset. + If false, then this does not happen. + Use this option *only* if the data arrays of the flattened dataset are never to be accessed. + If false then consider setting the fill mode for the output netcd4 dataset to "off" for improved performance. + :param copy_slices: dictionary containing variable_name: shape pairs, where variable_name is the path to the + variable name in the original Dataset (for instance /group1/group2/my_variable), and shape is either None for + using default slice value, or a custom slicing shap in the form of a tuple of the same dimension as the variable + (for instance (1000,2000,1500,) for a 3-dimensional variable). If a variable from the Dataset is not contained + in the dict, it will not be sliced and copied normally. 
+ + """ + _Flattener( + input_ds, lax_mode, _copy_data=_copy_data, copy_slices=copy_slices + ).flatten(output_ds) + + +def parse_var_attr(input_str): + """Parse variable attribute of any form into a dict: + + * 'time' -> OrderedDict([('time', [])]) + * 'lat lon' -> OrderedDict([('lat', []), ('lon', [])]) + * 'area: time volume: lat lon' -> OrderedDict([('area', ['time']), ('volume', ['lat', 'lon'])]) + + :param input_str: string to parse + :return: parsed string in an OrderedDict + + """ + + def subst(s): + """Substitute tokens for WORD and SEP.""" + return s.replace("WORD", r"[A-Za-z0-9_#/.\(\)]+").replace( + "SEP", r"(\s+|$)" + ) + + # Regex for 'dict form': "k1: v1 v2 k2: v3" + pat_value = subst("(?PWORD)SEP") + pat_values = "({})*".format(pat_value) + pat_mapping = subst( + "(?PWORD):SEP(?P{})".format(pat_values) + ) + pat_mapping_list = "({})+".format(pat_mapping) + + # Regex for 'list form': "v1 v2 v3" (including single-item form) + pat_list_item = subst("(?PWORD)SEP") + pat_list = "({})+".format(pat_list_item) + + # Regex for any form: + pat_all = subst( + "((?P{})|(?P{}))$".format( + pat_list, pat_mapping_list + ) + ) + + m = re.match(pat_all, input_str) + + # Output is always a dict. If input form is a list, dict values + # are set as empty lists + out = {} + + if m is not None: + list_match = m.group("list") + # Parse as a list + if list_match: + for mapping in re.finditer(pat_list_item, list_match): + item = mapping.group("list_item") + out[item] = None + + # Parse as a dict: + else: + mapping_list = m.group("mapping_list") + for mapping in re.finditer(pat_mapping, mapping_list): + term = mapping.group("mapping_name") + values = [ + value.group("value") + for value in re.finditer( + pat_value, mapping.group("values") + ) + ] + out[term] = values + else: + raise ReferenceException( + f"Error while parsing attribute value: {input_str!r}" + ) + + return out + + +def generate_var_attr_str(d): + """Re-generate the attribute string from a dictionary. + + :param d: dictionary + :return: valid attribute string + + """ + parsed_list = [] + for k, v in d.items(): + if v is None: + parsed_list.append(k) + elif not v: + parsed_list.append(f"{k}:") + else: + # parsed_list.append(k + ": " + (" ".join(v))) + parsed_list.append(f"{k}: {' '.join(v)}") + + return " ".join(parsed_list) + + +class _Flattener: + """Information and methods needed to flatten a netCDF dataset. + + Contains the input file, the output file being flattened, and all + the logic of the flattening process. + + """ + + def __init__(self, input_ds, lax_mode, _copy_data=True, copy_slices=None): + """**Initialisation** + + :param input_ds: input netcdf dataset + :param lax_mode: if false (default), not resolving a reference halts the execution. If true, continue with warning. + :param _copy_data: if true (default), then all data arrays are copied from the input to the output dataset + If false, then this does not happen. + Use this option *only* if the data arrays of the flattened dataset are never to be accessed. + :param copy_slices: dictionary containing variable_name: shape pairs, where variable_name is the path to the + variable name in the original Dataset (for instance /group1/group2/my_variable), and shape is either None + for using default slice value, or a custom slicing shape in the form of a tuple of the same dimension as the + variable (for instance (1000,2000,1500,) for a 3-dimensional variable). If a variable from the Dataset is + not contained in the dict, it will not be sliced and copied normally. 
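# An illustrative use of netcdf_flatten() as defined above: flatten a
# grouped dataset into a second, diskless netCDF4 dataset, as the reader
# does elsewhere in this patch. 'grouped.nc' is a placeholder file name.
import netCDF4

nc = netCDF4.Dataset("grouped.nc", "r")
flat_nc = netCDF4.Dataset(
    "flat.nc", "w", format="NETCDF4", diskless=True, persist=False
)
flat_nc.set_fill_off()
netcdf_flatten(nc, flat_nc, lax_mode=True, _copy_data=False)
# flat_nc now has no groups; its __flattener_* global attributes record the
# mapping back to the original names.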
+ + """ + self.__attr_map_value = [] + self.__dim_map_value = [] + self.__var_map_value = [] + + self.__dim_map = {} + self.__var_map = {} + + self.__lax_mode = lax_mode + + self.__copy_data = _copy_data + self.__copy_slices = copy_slices + + self.__input_file = input_ds + self.__output_file = None + + def attrs(self, variable): + try: + # h5netcdf + return variable.attrs + except AttributeError: + # netCDF4 + return { + attr: variable.getncattr(attr) for attr in variable.ncattrs() + } + + def chunksizes(self, variable): + """TODO.""" + try: + # netCDF4 + chunking = variable.chunking() + if chunking == "contiguous": + return None + + return chunking + except AttributeError: + # h5netcdf + return variable.chunks + + def contiguous(self, variable): + """Whether or not the variable data is contiguous on disk. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + variable: `netCDF4.Variable` or `h5netcdf.Variable` + The variable. + + :Returns: + + `bool` + `True` if the variable data is contiguous on disk, + otherwise `False`. + + **Examples** + + >>> f.contiguous(variable) + False + + """ + try: + # netCDF4 + return variable.chunking() == "contiguous" + except AttributeError: + # h5netcdf + return variable.chunks is None + + def dtype(self, variable): + """Return the data type of a variable. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + variable: + The dataset variable. + + :Returns: + + `numpy.dtype` or `str` + The data type. + + **Examples** + + >>> f.dtype(variable) + dtype('>> f.dtype(variable) + str + + """ + out = variable.dtype + if out == "O": + out = str + + return out + + def endian(self, variable): + """Return the endian-ness of a variable. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + variable: `netCDF4.Variable` or `h5netcdf.Variable` + The variable. + + :Returns: + + `str` + The endian-ness (``'little'``, ``'big'``, or + ``'native'``) of the variable. + + **Examples** + + >>> f.endian(variable) + 'native' + + """ + try: + # netCDF4 + return variable.endian() + except AttributeError: + # h5netcdf + dtype = variable.dtype + return _dtype_endian_lookup[getattr(dtype, "byteorder", None)] + + def filepath(self, dataset): + """Return the file path for the dataset. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + dataset: `netCDF4.Dataset` or `h5netcdf.File` + The dataset. + + :Returns: + + `str` + The file system path, or the opendap URL, for the + dataset. + + **Examples** + + >>> f.filepath(dataset) + '/home/data/file.nc' + + """ + try: + # netCDF4 + return dataset.filepath() + except AttributeError: + # h5netcdf + return dataset.filename + + def get_dims(self, variable): + """Return. + + .. versionadded:: (cfdm) HDFVER + + :Returns: + + `list` + + """ + try: + # netCDF4 + return variable.get_dims() + except AttributeError: + # h5netcdf + dims = {} + dimension_names = list(variable.dimensions) + group = variable._parent + for name, dim in group.dims.items(): + if name in dimension_names: + dims[name] = dim + dimension_names.remove(name) + + group = group.parent + while group is not None and dimension_names: + for name, dim in group.dims.items(): + if name in dimension_names: + dims[name] = dim + dimension_names.remove(name) + + group = group.parent + + return [dims[name] for name in variable.dimensions] + + def getncattr(self, x, attr): + """Retrieve a netCDF attribute. + + .. 
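# The accessor methods above share one duck-typing pattern: try one API and
# fall back on AttributeError. A standalone sketch of the same idea for
# variable attributes ('variable_attributes' is an illustrative name; the
# calls are real netCDF4/h5netcdf API):
def variable_attributes(variable):
    """Return {name: value} for a netCDF4 or h5netcdf variable."""
    try:
        # h5netcdf exposes attributes as a mapping
        return dict(variable.attrs)
    except AttributeError:
        # netCDF4 reaches them via ncattrs()/getncattr()
        return {a: variable.getncattr(a) for a in variable.ncattrs()}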
versionadded:: (cfdm) HDFVER + + :Parameters: + + x: variable, group, or dataset + + attr: `str` + + :Returns: + + """ + try: + # netCDF4 + return getattr(x, attr) + except AttributeError: + # h5netcdf + return x.attrs[attr] + + def group(self, x): + """Return a. + + .. versionadded:: (cfdm) HDFVER + + :Returns: + + `Group` + + """ + try: + # netCDF4 + return x.group() + except AttributeError: + # h5netcdf + return x._parent + + def name(self, x): + """Return the netCDF name, without its groups. + + .. versionadded:: (cfdm) HDFVER + + :Returns: + + `str` + + """ + out = x.name + if group_separator in out: + # h5netcdf + out = x.name.split(group_separator)[-1] + + return out + + def ncattrs(self, x): + """Return netCDF attribute names. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + x: variable, group, or dataset + + :Returns: + + `list` + + """ + try: + # netCDF4 + return x.ncattrs() + except AttributeError: + # h5netcdf + return list(x.attrs) + + def parent(self, group): + """Return a simulated unix directory path to a group. + + .. versionadded:: (cfdm) HDFVER + + :Returns: + + `str` + + """ + try: + return group.parent + except AttributeError: + return + + def path(self, group): + """Return a simulated unix directory path to a group. + + .. versionadded:: (cfdm) HDFVER + + :Returns: + + `str` + + """ + try: + # netCDF4 + return group.path + except AttributeError: + # h5netcdf + try: + return group.name + except AttributeError: + return group_separator + + def flatten(self, output_ds): + """Flattens and write to output file. + + :param output_ds: The dataset in which to store the flattened result. + + """ + logging.info( + f"Flattening the groups of {self.filepath(self.__input_file)}" + ) + + if ( + output_ds == self.__input_file + or output_ds.filepath() == self.filepath(self.__input_file) + or output_ds.data_model != "NETCDF4" + ): + raise ValueError( + "Invalid inputs. Input and output datasets should " + "be different, and output should be of the 'NETCDF4' format." + ) + + self.__output_file = output_ds + + # Flatten product + self.process_group(self.__input_file) + + # Add name mapping attributes + self.__output_file.setncattr( + flattener_attribute_map, self.__attr_map_value + ) + self.__output_file.setncattr( + flattener_dimension_map, self.__dim_map_value + ) + self.__output_file.setncattr( + flattener_variable_map, self.__var_map_value + ) + + # Browse flattened variables to rename references: + logging.info( + " Browsing flattened variables to rename references " + "in attributes" + ) + for var in self.__output_file.variables.values(): + self.adapt_references(var) + + def process_group(self, input_group): + """Flattens a given group to the output file. + + :param input_group: group to flatten + + """ + logging.info(f" Browsing group {self.path(input_group)}") + + for attr_name in self.ncattrs(input_group): + self.flatten_attribute(input_group, attr_name) + + for dim in input_group.dimensions.values(): + self.flatten_dimension(dim) + + for var in input_group.variables.values(): + self.flatten_variable(var) + + for child_group in input_group.groups.values(): + self.process_group(child_group) + + def flatten_attribute(self, input_group, attr_name): + """Flattens a given attribute from a group to the output file. 
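# process_group() above is a depth-first traversal of the group tree. A
# standalone sketch that prints the full path of every variable in a
# netCDF4.Dataset ('walk' is an illustrative name):
def walk(group, path="/"):
    for name in group.variables:
        print(path + name)
    for name, child in group.groups.items():
        walk(child, path + name + "/")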
+ + :param input_group: group containing the attribute to flatten + :param attr_name: name of the attribute to flatten + + """ + logging.info( + f" Copying attribute {attr_name} from " + f"group {self.path(input_group)} to root" + ) + + # Create new name + new_attr_name = self.generate_flattened_name(input_group, attr_name) + + # Write attribute + self.__output_file.setncattr( + new_attr_name, self.getncattr(input_group, attr_name) + ) + + # Store new naming for later and in mapping attribute + self.__attr_map_value.append( + self.generate_mapping_str(input_group, attr_name, new_attr_name) + ) + + def flatten_dimension(self, dim): + """Flattens a given dimension to the output file. + + :param dim: dimension to flatten + + """ + logging.info( + f" Copying dimension {self.name(dim)} from " + f"group {self.path(self.group(dim))} to root" + ) + + # Create new name + new_name = self.generate_flattened_name( + self.group(dim), self.name(dim) + ) + + # Write dimension + self.__output_file.createDimension( + new_name, (len(dim), None)[dim.isunlimited()] + ) + + # Store new name in dict for resolving references later + self.__dim_map[ + self.pathname(self.group(dim), self.name(dim)) + ] = new_name + + # Add to name mapping attribute + self.__dim_map_value.append( + self.generate_mapping_str( + self.group(dim), self.name(dim), new_name + ) + ) + + def flatten_variable(self, var): + """Flattens a given variable to the output file. + + :param var: variable to flatten + + """ + logging.info( + f" Copying variable {self.name(var)} from " + f"group {self.path(self.group(var))} to root" + ) + + # Create new name + new_name = self.generate_flattened_name( + self.group(var), self.name(var) + ) + + # Replace old by new dimension names + new_dims = list( + map( + lambda x: self.__dim_map[ + self.pathname(self.group(x), self.name(x)) + ], + self.get_dims(var), + ) + ) + + # Write variable + fullname = self.pathname(self.group(var), self.name(var)) + logging.info(f" Creating variable {new_name} from {fullname}") + + new_var = self.__output_file.createVariable( + new_name, + self.dtype(var), + new_dims, + zlib=False, + complevel=4, + shuffle=True, + fletcher32=False, + contiguous=self.contiguous(var), + chunksizes=self.chunksizes(var), + endian=self.endian(var), + least_significant_digit=None, + fill_value=None, + ) + + if self.__copy_data: + # Find out slice method for variable and copy data + if ( + self.__copy_slices is None + or fullname not in self.__copy_slices + ): + # Copy data as a whole + new_var[:] = var[:] + elif self.__copy_slices[fullname] is None: + # Copy with default slice size + copy_slice = tuple( + default_copy_slice_size // len(var.shape) + for _ in range(len(var.shape)) + ) + self.copy_var_by_slices(new_var, var, copy_slice) + else: + # Copy in slices + copy_slice = self.__copy_slices[fullname] + self.copy_var_by_slices(new_var, var, copy_slice) + + # Copy attributes + new_var.setncatts(self.attrs(var)) + + # Store new name in dict for resolving references later + self.__var_map[ + self.pathname(self.group(var), self.name(var)) + ] = new_name + + # Add to name mapping attribute + self.__var_map_value.append( + self.generate_mapping_str( + self.group(var), self.name(var), new_name + ) + ) + + # Resolve references in variable attributes and replace by + # absolute path + self.resolve_references(new_var, var) + + def increment_pos(self, pos, dim, copy_slice_shape, var_shape): + """TODOHDF. 
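# flatten_dimension() above sizes the new dimension with the terse idiom
# '(len(dim), None)[dim.isunlimited()]'. Spelled out, assuming 'dim' is a
# netCDF4.Dimension:
def flattened_size(dim):
    if dim.isunlimited():
        # netCDF4 creates an unlimited dimension when the size is None
        return None
    return len(dim)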
+ + Increment position vector in a variable along a dimension by + the matching slice length along than dimension. If end of the + dimension is reached, recursively increment the next + dimensions until a valid position is found. + + :param pos: current position + :param dim: dimension to be incremented + :param copy_slice_shape: shape of the slice + :param var_shape: shape of the variable + :return True if a valid position is found within the variable, False otherwise + + """ + # Try to increment dimension + pos[dim] += copy_slice_shape[dim] + + # Test new position + dim_end_reached = pos[dim] > var_shape[dim] + var_end_reached = (dim + 1) >= len(copy_slice_shape) + + # End of this dimension not reached yet + if not dim_end_reached: + return True + + # End of this dimension reached. Reset to 0 and try increment + # next one recursively + elif dim_end_reached and not var_end_reached: + pos[: dim + 1] = [0 for j in range(dim + 1)] + return self.increment_pos( + pos, dim + 1, copy_slice_shape, var_shape + ) + + else: + # End of this dimension reached, and no dimension to + # increment. Finish. + return False + + def copy_var_by_slices(self, new_var, old_var, copy_slice_shape): + """Copy the data of a variable to a new one by slice. + + :param new_var: new variable where to copy data + :param old_var: variable where data should be copied from + :param copy_slice_shape: shape of the slice + + """ + logging.info( + f" Copying data of {self.name(old_var)} in " + f"{copy_slice_shape} slices" + ) + + # Initial position vector + pos = [0 for _ in range(len(copy_slice_shape))] + + # Copy in slices until end reached + var_end_reached = False + while not var_end_reached: + # Create current slice + current_slice = tuple( + slice( + pos[dim_i], min(old_var.shape[dim_i], pos[dim_i] + dim_l) + ) + for dim_i, dim_l in enumerate(copy_slice_shape) + ) + + # Copy data in slice + new_var[current_slice] = old_var[current_slice] + + # Get next position + var_end_reached = not self.increment_pos( + pos, 0, copy_slice_shape, old_var.shape + ) + + def resolve_reference(self, orig_ref, orig_var, attr): + """Resolve a refrence. + + Resolves the absolute path to a coordinate variable within the + group structure. + + :param orig_ref: reference to resolve + :param orig_var: variable originally containing the reference + :param attr: AttributeFeatures object item to know if ref to dim or var + :return: absolute path to the reference + + """ + ref = orig_ref + absolute_ref = None + ref_type = "" + + # Resolve first as dim (True), or var (False) + resolve_dim_or_var = attr.ref_to_dim > attr.ref_to_var + + # Resolve var (resp. dim) if resolving as dim (resp. 
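# increment_pos() above is an 'odometer' over an N-dimensional index: step
# the first dimension by the slice length and carry into the next dimension
# on overflow. A simplified standalone sketch ('step' is an illustrative
# name and omits the partial-slice bookkeeping of the real method):
def step(pos, shape, strides):
    for dim in range(len(pos)):
        pos[dim] += strides[dim]
        if pos[dim] < shape[dim]:
            return True   # still inside the array
        pos[dim] = 0      # overflowed: reset and carry into the next dim
    return False          # every dimension overflowed: traversal finished

pos = [0, 0]
visited = [tuple(pos)]
while step(pos, shape=(4, 2), strides=(2, 1)):
    visited.append(tuple(pos))
assert visited == [(0, 0), (2, 0), (0, 1), (2, 1)]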
var) failed + resolve_alt = attr.ref_to_dim and attr.ref_to_var + + # Reference is already given by absolute path + if ref.startswith(group_separator): + method = "Absolute" + absolute_ref = ref + + # Reference is given by relative path + elif group_separator in ref: + method = "Relative" + + # First tentative as dim OR var + if resolve_dim_or_var: + ref_type = "dimension" + else: + ref_type = "variable" + + absolute_ref = self.search_by_relative_path( + orig_ref, self.group(orig_var), resolve_dim_or_var + ) + + # If failed and alternative possible, second tentative + if absolute_ref is None and resolve_alt: + if resolve_dim_or_var: + ref_type = "variable" + else: + ref_type = "dimension" + + absolute_ref = self.search_by_relative_path( + orig_ref, self.groupp(orig_var), not resolve_dim_or_var + ) + + # Reference is to be searched by proximity + else: + method = "Proximity" + absolute_ref, ref_type = self.resolve_reference_proximity( + ref, resolve_dim_or_var, resolve_alt, orig_var, attr + ) + + # Post-search checks and return result + return self.resolve_reference_post_processing( + absolute_ref, orig_ref, orig_var, attr, ref_type, method + ) + + def resolve_reference_proximity( + self, ref, resolve_dim_or_var, resolve_alt, orig_var, attr + ): + """Resolve reference: search by proximity.""" + # First tentative as dim OR var + if resolve_dim_or_var: + ref_type = "dimension" + else: + ref_type = "variable" + + resolved_var = self.search_by_proximity( + ref, + self.group(orig_var), + resolve_dim_or_var, + False, + attr.stop_at_local_apex, + ) + + # If failed and alternative possible, second tentative + if resolved_var is None and resolve_alt: + if resolve_dim_or_var: + ref_type = "variable" + else: + ref_type = "dimension" + + resolved_var = self.search_by_proximity( + ref, + self.group(orig_var), + not resolve_dim_or_var, + False, + attr.stop_at_local_apex, + ) + + # If found, create ref string + if resolved_var is not None: + return ( + self.pathname( + self.group(resolved_var), self.name(resolved_var) + ), + ref_type, + ) + else: + return None, "" + + def resolve_reference_post_processing( + self, absolute_ref, orig_ref, orig_var, attr, ref_type, method + ): + """Post-processing operations after resolving reference.""" + # If not found and accept standard name, assume standard name + if absolute_ref is None and attr.accept_standard_names: + logging.info( + f" Reference to {orig_ref!r} not " + "resolved. Assumed to be a standard name." + ) + ref_type = "standard_name" + absolute_ref = orig_ref + elif absolute_ref is None: + # Not found, so raise exception. + absolute_ref = self.handle_reference_error( + orig_ref, self.path(self.group(orig_var)) + ) + else: + # Found + logging.info( + f" {method} reference to {ref_type} " + f"{orig_ref!r} resolved as {absolute_ref!r}" + ) + + # If variables refs are limited to coordinate variable, + # additional check + if ( + ref_type == "variable" + and attr.limit_to_scalar_coordinates + and ( + ( + "coordinates" not in self.ncattrs(orig_var) + or orig_ref not in self.getncattr(orig_var, "coordinates") + ) + or self._Flattener__input_file[absolute_ref].ndim > 0 + ) + ): + logging.info( + f" Reference to {orig_ref!r} is not a " + "scalar coordinate variable. Assumed to be a standard name." + ) + absolute_ref = orig_ref + + # Return result + return absolute_ref + + def search_by_relative_path(self, ref, current_group, search_dim): + """Search by relative path. 
+ + Resolves the absolute path to a reference within the group + structure, using search by relative path. + + :param ref: reference to resolve + :param current_group: current group where searching + :param search_dim: if true, search references to dimensions, if false, search references to variables + :return: absolute path to the coordinate + + """ + # Go up parent groups + while ref.startswith("../"): + if current_group.parent is None: + return None + + ref = ref[3:] + current_group = current_group.parent + + # Go down child groups + ref_split = ref.split(group_separator) + for g in ref_split[:-1]: + try: + current_group = current_group.groups[g] + except KeyError: + return None + + # Get variable or dimension + if search_dim: + elt = current_group.dimensions[ref_split[-1]] + else: + elt = current_group.variables[ref_split[-1]] + + # Get absolute reference + return self.pathname(self.group(elt), self.name(elt)) + + def search_by_proximity( + self, + ref, + current_group, + search_dim, + local_apex_reached, + is_coordinate_variable, + ): + """Search by proximity. + + Resolves the absolute path to a reference within the group + structure, using search by proximity. + + First search up in the hierarchy for the reference, until root + group is reached. If coordinate variable, search until local + apex is reached, Then search down in siblings. + + :param ref: reference to resolve + :param current_group: current group where searching + :param search_dim: if true, search references to dimensions, if false, search references to variables + :param local_apex_reached: False initially, until apex is reached. + :param is_coordinate_variable: true, if looking for a coordinate variable + :return: absolute path to the coordinate + + """ + if search_dim: + dims_or_vars = current_group.dimensions + else: + dims_or_vars = current_group.variables # DCH + + # Found in current group + if ref in dims_or_vars.keys(): + return dims_or_vars[ref] + + local_apex_reached = ( + local_apex_reached or ref in current_group.dimensions.keys() + ) + + # Check if has to continue looking in parent group + # - normal search: continue until root is reached + # - coordinate variable: continue until local apex is reached + if is_coordinate_variable: + top_reached = local_apex_reached or current_group.parent is None + else: + top_reached = current_group.parent is None + + # Search up + if not top_reached: + return self.search_by_proximity( + ref, + current_group.parent, + search_dim, + local_apex_reached, + is_coordinate_variable, + ) + + elif is_coordinate_variable and local_apex_reached: + # Coordinate variable and local apex reached, so search + # down in siblings + found_elt = None + for child_group in current_group.groups.values(): + found_elt = self.search_by_proximity( + ref, + child_group, + search_dim, + local_apex_reached, + is_coordinate_variable, + ) + if found_elt is not None: + break + + return found_elt + + else: + # Did not find + return None + + def __escape_index_error(self, match, group_name): + """TODOHDF. + + :param match: regex match + :param group_name: group name + + :Returns: + + `str` + The group in a match if it exists, an empty string + otherwise. + + """ + try: + return match.group(group_name) + except IndexError: + return "" + + def resolve_references(self, var, old_var): + """Resolve references. + + In a given variable, replace all references to other variables + in its attributes by absolute references. 
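# The resolve_references() method described here rewrites each parsed
# reference and then re-serialises the attribute. A much-simplified round
# trip for the 'key: value' form handled by parse_var_attr() and
# generate_var_attr_str() above ('parse_simple' is illustrative and handles
# only that form):
def parse_simple(value):
    out, key = {}, None
    for token in value.split():
        if token.endswith(":"):
            key = token[:-1]
            out[key] = []
        else:
            out[key].append(token)
    return out

parsed = parse_simple("area: areacella")
# Resolve each value to an absolute path (placeholder group '/group1'):
resolved = {k: ["/group1/" + v for v in vals] for k, vals in parsed.items()}
assert resolved == {"area": ["/group1/areacella"]}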
+ + :param var: flattened variable in which references should be renamed with absolute references + :param old_var: original variable (in group structure) + + """ + for name, attr_value in self.attrs(var).items(): + attr = attribute_features.get(name) + if attr is None: + continue + + # Still here? Then resolve the references. + + # Parse attribute value + parsed_attr = parse_var_attr(attr_value) + + # Resolved references in parsed as required by attribute + # properties + resolved_parsed_attr = {} + + for k, v in parsed_attr.items(): + if attr.resolve_key: + k = self.resolve_reference(k, old_var, attr) + + if attr.resolve_value and v is not None: + v = [self.resolve_reference(x, old_var, attr) for x in v] + + resolved_parsed_attr[k] = v + + # Re-generate attribute value string with resolved + # references + var.setncattr( + attr.name, generate_var_attr_str(resolved_parsed_attr) + ) + + def adapt_references(self, var): + """Adapt references. + + In a given variable, replace all references to variables in + attributes by references to the new names in the flattened + netCDF. All references have to be already resolved as absolute + references. + + :param var: flattened variable in which references should be renamed with new names + + """ + for name, attr_value in self.attrs(var).items(): + attr = attribute_features.get(name) + if attr is None: + continue + + # Still here? Then adapt the references. + + # Parse attribute value + parsed_attr = parse_var_attr(attr_value) + + adapted_parsed_attr = {} # collections.OrderedDict() + + for k, v in parsed_attr.items(): + if attr.resolve_key: + k = self.adapt_name(k, attr) + + if attr.resolve_value and v is not None: + v = [self.adapt_name(x, attr) for x in v] + + adapted_parsed_attr[k] = v + + new_attr_value = generate_var_attr_str(adapted_parsed_attr) + var.setncattr(attr.name, new_attr_value) + + logging.info( + f" Value of {self.name(var)}.{attr.name} changed from " + f"{attr_value!r} to {new_attr_value!r}" + # f" attribute {attr.name!r} in {self.name(var)!r}: " + # f"references {attr_value!r} renamed as {new_attr_value!r}" + ) + + def adapt_name(self, resolved_ref, attr): + """Apapt the name. + + Return name of flattened reference. If not found, raise + exception or continue warning. + + :param resolved_ref: resolved reference to adapt + :param attr: AttributeFeatures object item to know in which dict to look for name mapping + :return: adapted reference + + """ + # If ref contains Error message, leave as such + if ref_not_found_error in resolved_ref: + return resolved_ref + + # Select highest priority map + if attr.ref_to_dim > attr.ref_to_var: + name_mapping = self.__dim_map + + if attr.ref_to_dim < attr.ref_to_var: + name_mapping = self.__var_map + + # Try to find mapping + try: + return name_mapping[resolved_ref] + + # If not found, look in other map if allowed + except KeyError: + if attr.ref_to_dim and attr.ref_to_var: + name_mapping = ( + self.__dim_map + if attr.ref_to_dim < attr.ref_to_var + else self.__var_map + ) + try: + return name_mapping[resolved_ref] + except KeyError: + pass + + # If still not found, check if any standard name is allowed + if attr.accept_standard_names: + return resolved_ref + + else: + # If not found, raise exception + return self.handle_reference_error(resolved_ref) + + def pathname(self, group, name): + """Compose full path name to an element in a group structure: + + /path/to/group/elt. 
+ + :param group: group containing element + :param name: name of the element + :return: pathname + + """ + if self.parent(group) is None: + return group_separator + name + + # return pathname_format.format(self.path(group), name) + # + return group_separator.join((self.path(group), name)) + + # return f"{self.path(group)}{group_separator}{name}") + # pathname_format.format(self.path(group), name) + + def generate_mapping_str(self, input_group, name, new_name): + """Generate string mapping. + + Generates a string representing the name mapping of an element + before and after flattening. + + :param input_group: group containing the non-flattened element + :param name: name of the non-flattened element + :param new_name: name of the flattened element + :return: string representing the name mapping for the element + + """ + original_pathname = self.pathname(input_group, name) + mapping_str = f"{new_name}: {original_pathname}" + # mapping_str_format.format( + # new_name, original_pathname + # ) + return mapping_str + + def convert_path_to_valid_name(self, pathname): + """Generate valid name from path. + + :param pathname: pathname + :return: valid NetCDF name + + """ + return pathname.replace(group_separator, "", 1).replace( + group_separator, flattener_separator + ) + + def generate_flattened_name(self, input_group, orig_name): + """Convert full path of an element to a valid NetCDF name: + + - the name of an element is the concatenation of its containing group and its name, + - replaces / from paths (forbidden as NetCDF name), + - if name is longer than 255 characters, replace path to group by hash, + - if name is still too long, replace complete name by hash. + + :param input_group: group containing element + :param orig_name: original name of the element + :return: new valid name of the element + + """ + # If element is at root: no change + if self.parent(input_group) is None: + new_name = orig_name + + # If element in child group, concatenate group path and + # element name + else: + full_name = ( + self.convert_path_to_valid_name(self.path(input_group)) + + flattener_separator + + orig_name + ) + new_name = full_name + + # If resulting name is too long, hash group path + if len(new_name) >= max_name_len: + group_hash = hashlib.sha1( + self.path(input_group).encode("UTF-8") + ).hexdigest() + new_name = group_hash + flattener_separator + orig_name + + # If resulting name still too long, hash everything + if len(new_name) >= max_name_len: + new_name = hashlib.sha1( + full_name.encode("UTF-8") + ).hexdigest() + + return new_name + + def handle_reference_error(self, ref, context=None): + """Handle reference error. + + Depending on lax/strict mode, either raise exception or log + warning. If lax, return reference placeholder. 
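# generate_flattened_name() above guards against the 256-character name
# limit by hashing with SHA-1. A standalone sketch of the same scheme
# ('flat_name' is an illustrative name):
import hashlib

def flat_name(group_path, name, max_len=256, sep="__"):
    if group_path == "/":
        return name
    full = group_path.replace("/", "", 1).replace("/", sep) + sep + name
    new = full
    if len(new) >= max_len:
        # Too long: replace the group path by its hash
        new = hashlib.sha1(group_path.encode("UTF-8")).hexdigest() + sep + name
    if len(new) >= max_len:
        # Still too long: hash everything
        new = hashlib.sha1(full.encode("UTF-8")).hexdigest()
    return new

assert flat_name("/forecast/model", "tas") == "forecast__model__tas"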
+ + :param ref: reference + :param context: additional context info to add to message + :return: if continue with warning, error replacement name for reference + + """ + message = f"Reference {ref!r} could not be resolved" + if context is not None: + message = f"{message} from {context}" + + if self.__lax_mode: + warnings.warn(message) + return f"{ref_not_found_error}_{ref}" + else: + raise ReferenceException(message) + + +class ReferenceException(Exception): + """Exception for unresolvable references in attributes.""" + + pass diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index b6a2aaca6..4ad4d2725 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -21,19 +21,20 @@ from s3fs import S3FileSystem from ...decorators import _manage_log_level_via_verbosity -#from ...flatten import _Flattener -from ...flatten import flatten as netcdf_flatten -from ...flatten.config import _flattener_separator from ...functions import is_log_level_debug, is_log_level_detail from .. import IORead +from .flatten import netcdf_flatten +from .flatten.config import ( + flattener_attribute_map, + flattener_dimension_map, + flattener_separator, + flattener_variable_map, +) logger = logging.getLogger(__name__) _cached_temporary_files = {} -# _flattener_separator = netcdf_flattener._Flattener._Flattener__new_separator -#_flattener_separator = _Flattener._Flattener__new_separator - @dataclass() class Mesh: @@ -510,19 +511,24 @@ def file_open(self, filename, flatten=True, verbose=None): # Deal with an file in an S3 object store u = urlparse(filename) if u.scheme == "s3": - # Create an openable s3 file object + # Create an openable S3 file object s3 = g["s3"] - g["s3_file_system_options"][filename] = s3 + if s3 is None: + # Default s3 file system options + s3 = {"anon": True} + if "endpoint_url" not in s3: # Derive endpoint_url from filename - s3 = g["s3"].copy() + s3 = s3.copy() s3["endpoint_url"] = f"https://{u.netloc}" + g["s3_file_system_options"][filename] = s3 + key = tuple(sorted(s3.items())) s3_file_systems = g["s3_file_systems"] fs = s3_file_systems.get(key) if fs is None: - # An s3 file system with these options does not exist, + # An S3 file system with these options does not exist, # so create one. fs = S3FileSystem(**s3) s3_file_systems[key] = fs @@ -563,10 +569,10 @@ def file_open(self, filename, flatten=True, verbose=None): raise error else: - raise ValueError("TODO") + raise ValueError("Unknown library name: library={library!r}") - g["original_HDF"] = HDF - g["original_netCDF"] = netCDF + g["original_h5netcdf"] = HDF + g["original_netCDF4"] = netCDF # ------------------------------------------------------------ # If the file has a group structure then flatten it (CF>=1.8) @@ -588,7 +594,6 @@ def file_open(self, filename, flatten=True, verbose=None): flat_nc.set_fill_off() # Flatten the file - # netcdf_flattener.flatten( netcdf_flatten(nc, flat_nc, lax_mode=True, _copy_data=False) # Store the original grouped file. This is primarily @@ -605,17 +610,43 @@ def file_open(self, filename, flatten=True, verbose=None): g["has_groups"] = True g["flat_files"].append(flat_file) - g["netCDF"] = netCDF - g["HDF"] = HDF + g["netCDF4"] = netCDF + g["h5netcdf"] = HDF g["nc"] = nc return nc def _open_netCDF4(self, filename): - """TODO.""" + """Return an open `netCDF4.Dataset`. + + .. 
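# A sketch of the S3 branch above, with placeholder endpoint, bucket and
# object names: build an s3fs file system (anonymous access, endpoint
# derived from the URL's netloc) and hand an open file object to h5netcdf.
import h5netcdf
from s3fs import S3FileSystem
from urllib.parse import urlparse

u = urlparse("s3://object.store/bucket/file.nc")
fs = S3FileSystem(anon=True, endpoint_url=f"https://{u.netloc}")
nc = h5netcdf.File(fs.open(u.path[1:], "rb"), "r")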
versionadded:: (cfdm) HDFVER + + :Parameters: + + filename: `str` + The file to open + + :Returns: + + `netCDF4.Dataset` + + """ return netCDF4.Dataset(filename, "r") def _open_h5netcdf(self, filename): - """TODO.""" + """Return an open `h5netcdf.File`. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + filename: `str` + The file to open + + :Returns: + + `h5netcdf.File` + + """ return h5netcdf.File(filename, "r", decode_vlen_strings=True) @classmethod @@ -923,7 +954,6 @@ def read( .. versionadded:: (cfdm) HDFVER - _s3_file_systems: `dict`, optional TODOHDF @@ -1131,7 +1161,7 @@ def read( # ---------------------------------------------------------------- global_attributes = {} # for attr in map(str,nc.ncattrs()): - for attr, value in self._file_global_attributes().items(): + for attr, value in self._file_global_attributes(nc).items(): attr = str(attr) if isinstance(value, bytes): value = value.decode(errors="ignore") @@ -1238,7 +1268,7 @@ def read( if has_groups: flattener_name_mapping_variables = getattr( - nc, "__flattener_name_mapping_variables", None + nc, flattener_variable_map, None ) if flattener_name_mapping_variables is not None: if isinstance(flattener_name_mapping_variables, str): @@ -1251,7 +1281,7 @@ def read( ) flattener_name_mapping_dimensions = getattr( - nc, "__flattener_name_mapping_dimensions", None + nc, flattener_dimension_map, None ) if flattener_name_mapping_dimensions is not None: if isinstance(flattener_name_mapping_dimensions, str): @@ -1270,7 +1300,7 @@ def read( flattener_dimensions[key] = value[1:] flattener_name_mapping_attributes = getattr( - nc, "__flattener_name_mapping_attributes", None + nc, flattener_attribute_map, None ) if flattener_name_mapping_attributes is not None: if isinstance(flattener_name_mapping_attributes, str): @@ -1301,19 +1331,19 @@ def read( # Remove flattener attributes from the global attributes for attr in ( - "__flattener_name_mapping_variables", - "__flattener_name_mapping_dimensions", - "__flattener_name_mapping_attributes", + flattener_variable_map, + flattener_dimension_map, + flattener_attribute_map, ): g["global_attributes"].pop(attr, None) - for ncvar in self._file_variables(): + for ncvar in self._file_variables(nc): ncvar_basename = ncvar groups = () group_attributes = {} # variable = nc.variables[ncvar] - variable = self._file_variable(ncvar) + variable = self._file_variable(nc, ncvar) # -------------------------------------------------------- # Specify the group structure for each variable (CF>=1.8) @@ -1335,7 +1365,7 @@ def read( # structure that was prepended to the netCDF # variable name by the netCDF flattener. ncvar_basename = re.sub( - f"^{_flattener_separator.join(groups)}{_flattener_separator}", + f"^{flattener_separator.join(groups)}{flattener_separator}", "", ncvar_flat, ) @@ -1389,7 +1419,7 @@ def read( # Populate dimensions_groups abd dimension_basename # dictionaries # for ncdim in nc.dimensions: - for ncdim in self._file_dimensions(): + for ncdim in self._file_dimensions(nc): ncdim_org = ncdim ncdim_basename = ncdim groups = () @@ -1406,7 +1436,7 @@ def read( if groups: # This dimension is in a group. 
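# Each entry of the __flattener_*_map attributes parsed above has the form
# 'flat_name: /original/path'. A sketch of turning such a list into a
# dictionary, as the reader does for variables, dimensions and attributes:
entries = ["forecast__tas: /forecast/tas", "tas: /tas"]
mapping = dict(tuple(x.split(": ")) for x in entries)
assert mapping["forecast__tas"] == "/forecast/tas"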
ncdim_basename = re.sub( - "^{_flattener_separator.join(groups)}{_flattener_separator}", + "^{flattener_separator.join(groups)}{flattener_separator}", "", ncdim_flat, ) @@ -1418,7 +1448,7 @@ def read( # ncdim_org # ].isunlimited() dimension_isunlimited[ncdim] = self._file_dimension_isunlimited( - ncdim_org + nc, ncdim_org ) if has_groups: @@ -1468,7 +1498,7 @@ def read( # The netCDF dimensions of the parent file internal_dimension_sizes = {} # for name, dimension in nc.dimensions.items(): - for name, dimension in self._file_dimensions().items(): + for name, dimension in self._file_dimensions(nc).items(): if ( has_groups and dimension_isunlimited[flattener_dimensions[name]] @@ -2281,6 +2311,8 @@ def _get_variables_from_external_files(self, netcdf_external_variables): # Remove this ncvar from the set of external variables external_variables.remove(ncvar) + # TODO h5netcdf S3: include s3 vars here? + def _parse_compression_gathered(self, ncvar, compress): """Parse a list variable for compressing arrays by gathering.""" g = self.read_vars @@ -2631,8 +2663,7 @@ def _parse_geometry(self, parent_ncvar, attributes): # variable in this case. # -------------------------------------------------------- nodes_per_geometry = self.implementation.initialise_Count() - # size = g["nc"].dimensions[node_dimension].size - size = self._file_dimension_size(node_dimension) + size = self._file_dimension_size(g["nc"], node_dimension) ones = self.implementation.initialise_Data( array=np.ones((size,), dtype="int32"), copy=False ) @@ -6106,7 +6137,7 @@ def _create_netcdfarray( group, name = self._netCDF4_group( g["variable_grouped_dataset"][ncvar], ncvar ) - variable = group.variables.get(name) # h5netcdf + variable = group.variables.get(name) else: variable = g["variables"].get(ncvar) @@ -6193,8 +6224,7 @@ def _create_netcdfarray( if return_kwargs_only: return kwargs - if g["original_netCDF"]: - # netCDF4 + if g["original_netCDF4"]: array = self.implementation.initialise_NetCDFArray(**kwargs) else: # h5netcdf @@ -10033,66 +10063,205 @@ def _ugrid_check_connectivity_variable( return ok - def _file_global_attributes(self): - """TODOHDF.""" - g = self.read_vars - nc = g["nc"] - if g["netCDF"]: + def _file_global_attributes(self, nc): + """Return the global attributes from a dataset. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + nc: `netCDF4.Dataset` or `h5netcdf.File` + The dataset. + + :Returns: + + `dict'-like + A dictionary of the attribute values keyed by their + names. + + """ + try: + # h5netcdf + return nc.attrs + except AttributeError: # netCDF4 return {attr: nc.getncattr(attr) for attr in nc.ncattrs()} - # h5netcdf - return nc.attrs + def _file_dimensions(self, nc): + """Return all dimensions in the root group. - def _file_dimensions(self): - """TODOHDF.""" - g = self.read_vars - return g["nc"].dimensions + .. versionadded:: (cfdm) HDFVER - def _file_dimension(self, dim_name): - """TODOHDF.""" - return self._file_dimensions()[dim_name] + :Returns: - def _file_dimension_isunlimited(self, dim_name): - return self._file_dimension(dim_name).isunlimited() + `dict'-like + A dictionary of the dimensions keyed by their names. - def _file_dimension_size(self, dim_name): - """TODOHDF.""" - return self._file_dimension(dim_name).size + """ + return nc.dimensions - def _file_variables(self): - """TOODHDF.""" - g = self.read_vars - return g["nc"].variables + def _file_dimension(self, nc, dim_name): + """Return a dimension from the root group of a dataset. 
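# The re.sub() above strips the flattener's group prefix from a flattened
# dimension name. Note that, unlike the variable branch earlier, the
# pattern here is not an f-string, so its braces are matched literally and
# nothing is stripped; with the f prefix it behaves as intended:
import re

flattener_separator = "__"
groups = ("forecast", "model")
ncdim_flat = "forecast__model__height"
assert re.sub(
    f"^{flattener_separator.join(groups)}{flattener_separator}",
    "",
    ncdim_flat,
) == "height"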
- def _file_variable(self, var_name): - """TODOHDF.""" - return self._file_variables()[var_name] + .. versionadded:: (cfdm) HDFVER - def _file_variable_attributes(self, var, names_only=False): - """TODOHDF.""" - g = self.read_vars - if not names_only: - if g["netCDF"]: - # netCDF4 - return {attr: var.getncattr(attr) for attr in var.ncattrs()} + :Parameters: + + nc: `netCDF4.Dataset` or `h5netcdf.File` + The dataset. + + dim_name: `str` + The dimension name. + :Returns: + + `netCDF.Dimension` or `h5netcdf.Dimension` + The dimension. + + """ + return self._file_dimensions(nc)[dim_name] + + def _file_dimension_isunlimited(self, nc, dim_name): + """Return a whether a dimension is unlimited. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + nc: `netCDF4.Dataset` or `h5netcdf.File` + The dataset. + + dim_name: `str` + The dimension name. + + :Returns: + + `bool` + Whether the dimension is unlimited. + + """ + return self._file_dimension(nc, dim_name).isunlimited() + + def _file_dimension_size(self, nc, dim_name): + """Return a dimension is size. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + nc: `netCDF4.Dataset` or `h5netcdf.File` + The dataset. + + dim_name: `str` + The dimension name. + + :Returns: + + `int` + The dimssion size + + """ + return self._file_dimension(nc, dim_name).size + + def _file_variables(self, nc): + """Return all variables in the root group. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + nc: `netCDF4.Dataset` or `h5netcdf.File` + The dataset. + + :Returns: + + `dict'-like + A dictionary of the variables keyed by their names. + + """ + return nc.variables + + def _file_variable(self, nc, var_name): + """Return a variable. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + nc: `netCDF4.Dataset` or `h5netcdf.File` + The dataset. + + var_name: `str` + The variable name. + + :Returns: + + `netCDF4.Variable` or `h5netcdf.Variable` + The variable. + + """ + return self._file_variables(nc)[var_name] + + def _file_variable_attributes(self, var): + """Return the variable attribute names. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + var: `netCDF4.Variable` or `h5netcdf.Variable` + The variable. + + :Returns: + + `dict`-like + A dictionary of the attribute values keyed by their + names. + + """ + try: # h5netcdf return var.attrs - - if g["netCDF"]: + except AttributeError: # netCDF4 - return var.ncattrs() - - # h5netcdf - return list(var.attrs) + return {attr: var.getncattr(attr) for attr in var.ncattrs()} def _file_variable_dimensions(self, var): - """TODOHDF.""" + """Return the variable dimension names. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + var: `netCDF4.Variable` or `h5netcdf.Variable` + The variable. + + :Returns: + + `tuple` or `str` + The dimension names. + + """ return var.dimensions def _file_variable_size(self, var): - """TODOHDF.""" + """Return the size of a variable's array. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + var: `netCDF4.Variable` or `h5netcdf.Variable` + The variable. + + :Returns: + + `int` + The array size. 
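+
+        **Examples**
+
+        A sketch for a hypothetical variable with shape
+        ``(12, 64, 128)``:
+
+        >>> r._file_variable_size(var)
+        98304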
+ + """ + # Use try/except here because the variable type could differ + # from that implied by the value of self.read_vars["netCDF4"] try: # netCDF4 return var.size From 29faa9d802e7ab678240a2d4f0d47d74c9685f76 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 1 Feb 2024 09:56:42 +0000 Subject: [PATCH 23/88] dev --- cfdm/read_write/netcdf/flatten/config.py | 59 ++++++++++---------- cfdm/read_write/netcdf/flatten/flatten.py | 65 ++++++++++------------- cfdm/read_write/read.py | 27 ++++++---- 3 files changed, 78 insertions(+), 73 deletions(-) diff --git a/cfdm/read_write/netcdf/flatten/config.py b/cfdm/read_write/netcdf/flatten/config.py index 0a76e6f12..c697befc8 100644 --- a/cfdm/read_write/netcdf/flatten/config.py +++ b/cfdm/read_write/netcdf/flatten/config.py @@ -21,16 +21,16 @@ # Default size, in bytes, of slice to use when copying data arrays default_copy_slice_size = 134217728 -# NetCDF global attribute containing the mapping of flattened -# attribute names to grouped attribute names +# NetCDF global attribute in the flattened dataset containing the +# mapping of flattened attribute names to grouped attribute names flattener_attribute_map = "__flattener_attribute_map" -# NetCDF global attribute containing the mapping of flattened -# dimension names to grouped attribute names +# NetCDF global attribute in the flattened dataset containing the +# mapping of flattened dimension names to grouped attribute names flattener_dimension_map = "__flattener_dimension_map" -# NetCDF global attribute containing the mapping of flattened -# variable names to grouped attribute names +# NetCDF global attribute in the flattened dataset containing the +# mapping of flattened variable names to grouped attribute names flattener_variable_map = "__flattener_variable_map" @@ -48,7 +48,7 @@ class AttributeFeatures: # name: The attribute name name: str # ref_to_dim: Positive integer if contains references to - # dimensions (highest int have priority) + # dimensions (higher values have priority) ref_to_dim: int = 0 # ref_to_var: Positive integer if contains references to variables # (highest int have priority) @@ -88,10 +88,17 @@ class AttributeFeatures: resolve_key=True, stop_at_local_apex=True, ), + AttributeFeatures(name="bounds", ref_to_var=1, resolve_key=True), + AttributeFeatures(name="climatology", ref_to_var=1, resolve_key=True), + # Cell methods AttributeFeatures( - name="ancillary_variables", ref_to_var=1, resolve_key=True + name="cell_methods", + ref_to_dim=2, + ref_to_var=1, + resolve_key=True, + accept_standard_names=True, + limit_to_scalar_coordinates=True, ), - AttributeFeatures(name="climatology", ref_to_var=1, resolve_key=True), # Cell measures AttributeFeatures( name="cell_measures", ref_to_var=1, resolve_value=True @@ -106,17 +113,9 @@ class AttributeFeatures: resolve_key=True, resolve_value=True, ), - AttributeFeatures(name="geometry", ref_to_var=1, resolve_key=True), + # Ancillary variables AttributeFeatures( - name="interior_ring", ref_to_var=1, resolve_key=True - ), - AttributeFeatures( - name="node_coordinates", ref_to_var=1, resolve_key=True - ), - AttributeFeatures(name="node_count", ref_to_var=1, resolve_key=True), - AttributeFeatures(name="nodes", ref_to_var=1, resolve_key=True), - AttributeFeatures( - name="part_node_count", ref_to_var=1, resolve_key=True + name="ancillary_variables", ref_to_var=1, resolve_key=True ), # Compression by gathering AttributeFeatures(name="compress", ref_to_dim=1, resolve_key=True), @@ -127,15 +126,6 @@ class AttributeFeatures: AttributeFeatures( 
name="sample_dimension", ref_to_dim=1, resolve_key=True ), - # Cell methods - AttributeFeatures( - name="cell_methods", - ref_to_dim=2, - ref_to_var=1, - resolve_key=True, - accept_standard_names=True, - limit_to_scalar_coordinates=True, - ), # Domain variables AttributeFeatures(name="dimensions", ref_to_dim=1, resolve_key=True), # Aggregation variables @@ -145,6 +135,19 @@ class AttributeFeatures: AttributeFeatures( name="aggregated_data", ref_to_var=1, resolve_value=True ), + # Cell geometries + AttributeFeatures(name="geometry", ref_to_var=1, resolve_key=True), + AttributeFeatures( + name="interior_ring", ref_to_var=1, resolve_key=True + ), + AttributeFeatures( + name="node_coordinates", ref_to_var=1, resolve_key=True + ), + AttributeFeatures(name="node_count", ref_to_var=1, resolve_key=True), + AttributeFeatures(name="nodes", ref_to_var=1, resolve_key=True), + AttributeFeatures( + name="part_node_count", ref_to_var=1, resolve_key=True + ), # UGRID variables AttributeFeatures(name="mesh", ref_to_var=1, resolve_key=True), AttributeFeatures( diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index d747d233a..031e7a3a4 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -32,6 +32,8 @@ None: "native", } +special_attributes = set(attribute_features) + def netcdf_flatten( input_ds, output_ds, lax_mode=False, _copy_data=True, copy_slices=None @@ -975,7 +977,7 @@ def search_by_proximity( if search_dim: dims_or_vars = current_group.dimensions else: - dims_or_vars = current_group.variables # DCH + dims_or_vars = current_group.variables # Found in current group if ref in dims_or_vars.keys(): @@ -1024,23 +1026,23 @@ def search_by_proximity( # Did not find return None - def __escape_index_error(self, match, group_name): - """TODOHDF. - - :param match: regex match - :param group_name: group name - - :Returns: - - `str` - The group in a match if it exists, an empty string - otherwise. - - """ - try: - return match.group(group_name) - except IndexError: - return "" + # def __escape_index_error(self, match, group_name): + # """TODOHDF. + # + # :param match: regex match + # :param group_name: group name + # + # :Returns: + # + # `str` + # The group in a match if it exists, an empty string + # otherwise. + # + # """ + # try: + # return match.group(group_name) + # except IndexError: + # return "" def resolve_references(self, var, old_var): """Resolve references. @@ -1052,20 +1054,16 @@ def resolve_references(self, var, old_var): :param old_var: original variable (in group structure) """ - for name, attr_value in self.attrs(var).items(): - attr = attribute_features.get(name) - if attr is None: - continue - - # Still here? Then resolve the references. - + var_attrs = self.attrs(var) + for name in special_attributes.intersection(var_attrs): # Parse attribute value - parsed_attr = parse_var_attr(attr_value) + parsed_attr = parse_var_attr(var_attrs[name]) # Resolved references in parsed as required by attribute # properties resolved_parsed_attr = {} + attr = attribute_features.get(name) for k, v in parsed_attr.items(): if attr.resolve_key: k = self.resolve_reference(k, old_var, attr) @@ -1092,18 +1090,15 @@ def adapt_references(self, var): :param var: flattened variable in which references should be renamed with new names """ - for name, attr_value in self.attrs(var).items(): - attr = attribute_features.get(name) - if attr is None: - continue - - # Still here? Then adapt the references. 
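+        # Only attributes that are known to contain references (the
+        # module-level special_attributes set) can need adapting, so
+        # intersect with this variable's attribute names instead of
+        # parsing every attribute.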
- + var_attrs = self.attrs(var) + for name in special_attributes.intersection(var_attrs): # Parse attribute value + attr_value = var_attrs[name] parsed_attr = parse_var_attr(attr_value) - adapted_parsed_attr = {} # collections.OrderedDict() + adapted_parsed_attr = {} + attr = attribute_features.get(name) for k, v in parsed_attr.items(): if attr.resolve_key: k = self.adapt_name(k, attr) @@ -1119,8 +1114,6 @@ def adapt_references(self, var): logging.info( f" Value of {self.name(var)}.{attr.name} changed from " f"{attr_value!r} to {new_attr_value!r}" - # f" attribute {attr.name!r} in {self.name(var)!r}: " - # f"references {attr_value!r} renamed as {new_attr_value!r}" ) def adapt_name(self, resolved_ref, attr): diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index e6cd8b5f4..8ca3284fe 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -267,20 +267,29 @@ def read( s3: `dict` or `None`, optional Keyword parameters to be passed to `s3fs.S3FileSystem` to control the opening of files in an S3 object store. By - default, or if `None`, then ``s3={'anon': True}``. Ignored - for file names that don't start with ``s3:``. - - If and only if *s3* has no ``'endpoint_url'`` key, then - one will be automatically derived from the *filename*. For - example, if *filename* was - ``'s3://object-store/data/file.nc'``, then an + default, or if `None`, then a value of ``{'anon': True}`` + is used. Ignored for file names that don't start with + ``s3:``. + + If and only if *s3* has no ``'endpoint_url'`` key (which + will always be the case when *s3* is `None`), then one + will be automatically derived from the file name and + included in the keyword parameters. For example, for a + file name of ``'s3://object-store/data/file.nc'``, then an ``'endpoint_url'`` key with value - ``'https://object-store'`` would be created. + ``'https://object-store'`` would be created. To disable + this behaviour, assign `None` to the ``'endpoint_url'`` + key. .. versionadded:: (cfdm) HDFVER library: `None` or `str`, optional - TODOHDF + Specify which library to use for opening input files. By + default, or if `None`, then `netCDF4` will used unless it + fails to open a given file, in which case `h5netcdf` will + be used. Setting *library* to ``'netCDF4'`` or + ``'h5netcdf'`` will force the use of the `netCDF4` or + `h5netcdf` libraries respectively. .. 
versionadded:: (cfdm) HDFVER From f811f93926ad138e988836699208e8a57af80e31 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 2 Feb 2024 13:59:58 +0000 Subject: [PATCH 24/88] dev --- cfdm/__init__.py | 2 +- cfdm/data/__init__.py | 2 +- cfdm/data/h5netcdfarray.py | 10 +- .../data/{maskscale.py => variableindexer.py} | 389 +++++++++++++----- cfdm/read_write/netcdf/flatten/config.py | 17 + ..._mask_scale.py => test_VariableIndexer.py} | 4 +- 6 files changed, 312 insertions(+), 112 deletions(-) rename cfdm/data/{maskscale.py => variableindexer.py} (63%) rename cfdm/test/{test_mask_scale.py => test_VariableIndexer.py} (96%) diff --git a/cfdm/__init__.py b/cfdm/__init__.py index 46636a582..5773d63eb 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -140,7 +140,6 @@ Data, GatheredArray, H5netcdfArray, - MaskScale, NetCDFArray, NumpyArray, PointTopologyArray, @@ -150,6 +149,7 @@ RaggedIndexedContiguousArray, SparseArray, SubsampledArray, + VariableIndexer, ) from .data import ( diff --git a/cfdm/data/__init__.py b/cfdm/data/__init__.py index 19ffdd01d..ce1c8ff9f 100644 --- a/cfdm/data/__init__.py +++ b/cfdm/data/__init__.py @@ -19,7 +19,6 @@ from .cellconnectivityarray import CellConnectivityArray from .gatheredarray import GatheredArray from .h5netcdfarray import H5netcdfArray -from .maskscale import MaskScale from .netcdfarray import NetCDFArray from .numpyarray import NumpyArray from .pointtopologyarray import PointTopologyArray @@ -28,5 +27,6 @@ from .raggedindexedcontiguousarray import RaggedIndexedContiguousArray from .sparsearray import SparseArray from .subsampledarray import SubsampledArray +from .variableindexer import VariableIndexer from .data import Data diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index 7604167f9..cf153fde5 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -4,8 +4,8 @@ import netCDF4 from . import abstract -from .maskscale import MaskScale from .mixin import FileArrayMixin, NetCDFFileMixin +from .variableindexer import VariableIndexer _safecast = netCDF4.utils._safecast default_fillvals = netCDF4.default_fillvals.copy() @@ -211,12 +211,12 @@ def __getitem__(self, indices): # Get the variable by netCDF name variable = dataset.variables[address] - array = variable[indices] - # Apply masking and scaling - array = MaskScale.apply( - variable, array, mask=mask, scale=mask, always_mask=False + # Get the data, applying masking and scaling as required. + array = VariableIndexer( + variable, mask=mask, scale=mask, always_mask=False ) + array = array[indices] # Set the units, if they haven't been set already. self._set_units(variable) diff --git a/cfdm/data/maskscale.py b/cfdm/data/variableindexer.py similarity index 63% rename from cfdm/data/maskscale.py rename to cfdm/data/variableindexer.py index 5fb08ff6c..1a3189cf6 100644 --- a/cfdm/data/maskscale.py +++ b/cfdm/data/variableindexer.py @@ -9,18 +9,212 @@ logger = logging.getLogger(__name__) -class MaskScale: - """TODO.""" +class VariableIndexer: + """An indexer of netCDF variables that applies masking and scaling. - @classmethod - def _check_safecast(cls, attname, dtype, attrs): - """TODOHDF. + During indexing, masking and scaling is applied according to the + CF conventions, either of which may be disabled via initialisation + options. - Check to see that variable attribute exists can can be safely + String and character variables are converted to unicode arrays, + the latter with the last dimension concatenated. + + .. 
versionadded:: (cfdm) HDFVER + + **Examples** + + >>> nc = netCDF4.Dataset('file.nc', 'r') + >>> x = cfdm.VariableIndexer(nc.variables['x']) + >>> x.shape + (12, 64, 128) + >>> print(x[0, 0:4, 0:3]) + [[236.5, 236.2, 236.0], + [240.9, -- , 239.6], + [243.4, 242.4, 241.3], + [243.1, 241.7, 240.4]] + + >>> h5 = h5netcdf.File('file.nc', 'r') + >>> x = cfdm.VariableIndexer(h5.variables['x']) + >>> x.shape + (12, 64, 128) + >>> print(x[0, 0:4, 0:3]) + [[236.5, 236.2, 236.0], + [240.9, -- , 239.6], + [243.4, 242.4, 241.3], + [243.1, 241.7, 240.4]] + + """ + + def __init__(self, variable, mask=True, scale=True, always_masked=False): + """**Initialisation** + + :Parameters: + + variable: `netCDF4.Variable` or `h5netcdf.Variable` + The variable to be indexed. Any masking and scaling + that may be applied by the *variable* itself is + disabled, i.e. Any masking and scaling is always + applied by the `VariableIndexer` instance. + + mask: `bool` + If True, the default, then an array returned by + indexing is automatically converted to a masked array + when missing values or fill values are present. + + scale: `bool` + If True, the default, then the ``scale_factor`` and + ``add_offset`` are applied to an array returned by + indexing, and signed integer data is automatically + converted to unsigned integer data if the + ``_Unsigned`` attribute is set to "true" or "True". + + always_masked: `bool` + If False, the default, then an array returned by + indexing which has no missing values is created as a + regular numpy array. If True then an array returned by + indexing is always a masked array, even if there are + no missing values. + + """ + self.variable = variable + self.mask = mask + self.scale = scale + self.always_masked = always_masked + + self.shape = variable.shape + + def __getitem__(self, index): + """Return a subspace of the variable as a `numpy` array. + + v.__getitem__(index) <==> v[index] + + Indexing follows rules defined by the variable. + + .. 
versionadded:: (cfdm) HDFVER + + """ + variable = self.variable + scale = self.scale + + attrs = self._attrs(variable) + dtype = variable.dtype + + netCDF4_scale = False + netCDF4_mask = False + try: + netCDF4_scale = variable.scale + netCDF4_mask = variable.mask + except AttributeError: + pass + else: + # Prevent netCDF4 from doing any masking and scaling + variable.set_auto_maskandscale(False) + + # Index the variable + data = variable[index] + + if isinstance(data, str): + data = np.array(data, dtype="S") + elif data.dtype.kind in "OSU": + kind = data.dtype.kind + if kind == "S": + data = netCDF4.chartostring(data) + + # Assume that object arrays are arrays of strings + data = data.astype("S", copy=False) + if kind == "O": + dtype = data.dtype + + if dtype is str: + dtype = data.dtype + + if scale: + dtype_unsigned_int = None + is_unsigned_int = attrs.get("_Unsigned", False) in ("true", "True") + if is_unsigned_int: + data_dtype = data.dtype + dtype_unsigned_int = ( + f"{data_dtype.byteorder}u{data_dtype.itemsize}" + ) + data = data.view(dtype_unsigned_int) + + if self.mask: + attrs = self._FillValue(variable, attrs) + data = self._mask( + data, + dtype, + attrs, + scale=scale, + always_masked=self.always_masked, + dtype_unsigned_int=dtype_unsigned_int, + ) + + if scale: + data = self._scale(data, attrs) + + if data.dtype.kind == "S": + # Assume that object arrays contain strings + data = data.astype("U", copy=False) + + if netCDF4_scale: + variable.set_auto_scale(True) + + if netCDF4_mask: + variable.set_auto_mask(True) + + return data + + def _attrs(self, variable): + """Return the variable attributes. + + .. versionadded:: (cfdm) HDFVER + + :Parameter: + + variable: `netCDF4.Variable` or `h5netcdf.Variable` + The variable to be indexed. + + :Returns: + + `dict` + The attributes. + + """ + try: + # h5netcdf + return dict(variable.attrs) + except AttributeError: + # netCDF4 + return { + attr: variable.getncattr(attr) for attr in variable.ncattrs() + } + + def _check_safecast(self, attname, dtype, attrs): + """Check an attribute's data type. + + Checks to see that variable attribute exists and can be safely cast to variable data type. + .. versionadded:: (cfdm) HDFVER + + :Parameter: + + attname: `str` + The attribute name. + + dtype: `numpy.dtype` + The variable data type. + + attrs: `dict` + The variable attributes. + + :Returns: + + `bool`, value + Whether or not the attribute data type is consistent + with the variable data type, and the attribute value. + """ - # attrs = self.variable.attrs if attname in attrs: attvalue = attrs[attname] att = np.array(attvalue) @@ -38,20 +232,39 @@ def _check_safecast(cls, attname, dtype, attrs): if not is_safe: logger.warn( f"WARNING: {attname} not used since it cannot " - "be safely cast to variable data type" + "be safely cast to variable data type {dtype!r}" ) # pragma: no cover return is_safe, attvalue - @classmethod - def _FillValue(cls, attrs, variable): - """TODO.""" + def _FillValue(self, variable, attrs): + """Set the variable _FillValue. + + .. versionadded:: (cfdm) HDFVER + + :Parameter: + + variable: `netCDF4.Variable` or `h5netcdf.Variable` + The variable to be indexed. + + attrs: `dict` + The variable attributes. May get updated in-place. + + :Returns: + + `dict` + The variable attributes, updated in-place with + ``_FillValue`` if present and not previously set.. 
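+
+        **Examples**
+
+        A sketch for a hypothetical `h5netcdf` variable ``v`` whose
+        underlying h5py dataset was created with a fill value of
+        ``-999.9``:
+
+        >>> indexer._FillValue(v, {})
+        {'_FillValue': -999.9}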
+ + """ if "_FillValue" not in attrs: try: fillvalue = getattr(variable._h5ds, "fillvalue", None) except AttributeError: + # netCDf4 pass else: + # h5netcdf if fillvalue is not None: attrs["_FillValue"] = fillvalue elif variable.dtype.kind == "O": @@ -59,31 +272,53 @@ def _FillValue(cls, attrs, variable): return attrs - @classmethod - def _attrs(cls, variable): - """TODO.""" - try: - return dict(variable.attrs) - except AttributeError: - return { - attr: variable.getncattr(attr) for attr in variable.ncattrs() - } - - @classmethod def _mask( - cls, + self, data, dtype, attrs, scale=True, - always_mask=False, + always_masked=False, dtype_unsigned_int=None, ): - """TODOHDF.""" + """Mask the data. + + .. versionadded:: (cfdm) HDFVER + + :Parameter: + + data: `numpy.ndarray` + The unmasked and unscaled data indexed from the + variable. + + dtype: `numpy.dtype` + The data type of the variable (which may be different + to that of *data*). + + attrs: `dict` + The variable attributes. + + scale: `bool` + Whether the data is to be scaled. + + always_masked: `bool` + Whether or not return a regular numpy array when there + are no missing values. + + dtype_unsigned_int: `dtype` or `None` + The data type to which unsigned integer data has been + cast. + + :Returns: + + `nump.ndarray` + The masked (but not scaled) data. + + """ totalmask = np.zeros(data.shape, np.bool_) fill_value = None - safe_missval, missing_value = cls._check_safecast( + safe_missval, missing_value = self._check_safecast( "missing_value", dtype, attrs ) if safe_missval: @@ -116,7 +351,7 @@ def _mask( totalmask += mvalmask # set mask=True for data == fill value - safe_fillval, _FillValue = cls._check_safecast( + safe_fillval, _FillValue = self._check_safecast( "_FillValue", dtype, attrs ) if safe_fillval: @@ -193,13 +428,13 @@ def _mask( # valid_min, valid_max. No special treatment of byte data as # described at # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html). - safe_validrange, valid_range = cls._check_safecast( + safe_validrange, valid_range = self._check_safecast( "valid_range", dtype, attrs ) - safe_validmin, valid_min = cls._check_safecast( + safe_validmin, valid_min = self._check_safecast( "valid_min", dtype, attrs ) - safe_validmax, valid_max = cls._check_safecast( + safe_validmax, valid_max = self._check_safecast( "valid_max", dtype, attrs ) if safe_validrange and valid_range.size == 2: @@ -282,16 +517,33 @@ def _mask( # array, so that data == np.ma.masked. data = data[()] - elif not always_mask and not masked_values: + elif not always_masked and not masked_values: # Return a regular numpy array if requested and there are # no missing values data = np.array(data, copy=False) return data - @classmethod - def _scale(cls, data, attrs): - """TODOHDF.""" + def _scale(self, data, attrs): + """Scale the data.. + + .. versionadded:: (cfdm) HDFVER + + :Parameter: + + data: `numpy.ndarray` + The unmasked and unscaled data indexed from the + variable. + + attrs: `dict` + The variable attributes. + + :Returns: + + `nump.ndarray` + The scaled data. + + """ # If variable has scale_factor and add_offset attributes, # apply them. scale_factor = attrs.get("scale_factor") @@ -322,72 +574,3 @@ def _scale(cls, data, attrs): data = data + add_offset return data - - @classmethod - def apply(cls, variable, data, mask=True, scale=True, always_mask=False): - """TODO. 
- - :Parameters: - - variable: `h5netcdf.Variable` or `netCDF4.Variable` - - data: `numpy.ndarray` - - mask: `bool` - - scale: `bool` - - always_mask: `bool` - - :Returns: - - `numpy.ndarray` - - """ - attrs = cls._attrs(variable) - dtype = variable.dtype - - if isinstance(data, str): - data = np.array(data, dtype="S") - elif data.dtype.kind in "OSU": - kind = data.dtype.kind - if kind == "S": - data = netCDF4.chartostring(data) - - # Assume that object arrays are arrays of strings - data = data.astype("S", copy=False) - if kind == "O": - dtype = data.dtype - - if dtype is str: # isinstance(dtype, str): - dtype = data.dtype - - if scale: - dtype_unsigned_int = None - is_unsigned_int = attrs.get("_Unsigned", False) in ("true", "True") - if is_unsigned_int: - data_dtype = data.dtype - dtype_unsigned_int = ( - f"{data_dtype.byteorder}u{data_dtype.itemsize}" - ) - data = data.view(dtype_unsigned_int) - - if mask: - attrs = cls._FillValue(attrs, variable) - data = cls._mask( - data, - dtype, - attrs, - scale=scale, - always_mask=always_mask, - dtype_unsigned_int=dtype_unsigned_int, - ) - - if scale: - data = cls._scale(data, attrs) - - if data.dtype.kind == "S": - # Assume that object arrays contain strings - data = data.astype("U", copy=False) - - return data diff --git a/cfdm/read_write/netcdf/flatten/config.py b/cfdm/read_write/netcdf/flatten/config.py index c697befc8..427665fc4 100644 --- a/cfdm/read_write/netcdf/flatten/config.py +++ b/cfdm/read_write/netcdf/flatten/config.py @@ -177,5 +177,22 @@ class AttributeFeatures: AttributeFeatures( name="face_dimension", ref_to_dim=1, resolve_key=True ), + # Compression by coordinate subsampling + AttributeFeatures( + name="coordinate_interpolation", + ref_to_var=1, + resolve_key=True, + resolve_value=True, + ), + AttributeFeatures( + name="tie_point_mapping", + ref_to_dim=2, + ref_to_var=1, + resolve_key=True, + resolve_value=True, + ), + AttributeFeatures( + name="interpolation_parameters", ref_to_var=1, resolve_value=True + ), ) } diff --git a/cfdm/test/test_mask_scale.py b/cfdm/test/test_VariableIndexer.py similarity index 96% rename from cfdm/test/test_mask_scale.py rename to cfdm/test/test_VariableIndexer.py index 2139cc668..ca9da448d 100644 --- a/cfdm/test/test_mask_scale.py +++ b/cfdm/test/test_VariableIndexer.py @@ -14,7 +14,7 @@ n_tmpfiles = 1 tmpfiles = [ - tempfile.mkstemp("_test_geometry.nc", dir=os.getcwd())[1] + tempfile.mkstemp("_test_VariableIndxer.nc", dir=os.getcwd())[1] for i in range(n_tmpfiles) ] (tempfile,) = tmpfiles @@ -32,7 +32,7 @@ def _remove_tmpfiles(): atexit.register(_remove_tmpfiles) -class MaskScaleTest(unittest.TestCase): +class VariableIndexerTest(unittest.TestCase): """Test the masking and scaling of netCDF data.""" def test_mask(self): From 8c3eb5c9cb5a46a4d11e958813c5729da439e9fe Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 2 Feb 2024 15:17:02 +0000 Subject: [PATCH 25/88] dev --- cfdm/core/__init__.py | 4 +-- cfdm/data/h5netcdfarray.py | 31 ++++++++++------------- cfdm/data/mixin/filearraymixin.py | 12 --------- cfdm/data/mixin/netcdffilemixin.py | 24 +++++++++++++++++- cfdm/data/netcdfarray.py | 12 ++++++++- cfdm/read_write/netcdf/flatten/flatten.py | 29 ++++++++++++++++++--- cfdm/read_write/netcdf/netcdfread.py | 4 +-- 7 files changed, 78 insertions(+), 38 deletions(-) diff --git a/cfdm/core/__init__.py b/cfdm/core/__init__.py index 1eee20aa3..2481ab5c7 100644 --- a/cfdm/core/__init__.py +++ b/cfdm/core/__init__.py @@ -11,9 +11,9 @@ """ -__date__ = "2023-12-06" +__date__ = "2024-??-??" 
 __cf_version__ = "1.11"
-__version__ = "1.11.0.0"
+__version__ = "1.11.1.0"
 
 from packaging import __version__ as _packaging_ver
 from packaging import __file__ as _packaging_file
diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py
index cf153fde5..7a1a00145 100644
--- a/cfdm/data/h5netcdfarray.py
+++ b/cfdm/data/h5netcdfarray.py
@@ -15,9 +15,9 @@
 
 
 class H5netcdfArray(NetCDFFileMixin, FileArrayMixin, abstract.Array):
-    """An underlying array stored in an HDF file.
+    """An underlying array stored in a netCDF HDF file.
 
-    .. versionadded:: (cfdm) TODOHDF
+    .. versionadded:: (cfdm) HDFVER
 
     """
@@ -184,19 +184,6 @@ def __getitem__(self, indices):
 
         x.__getitem__(indices) <==> x[indices]
 
-        The indices that define the subspace must be either `Ellipsis` or
-        a sequence that contains an index for each dimension. In the
-        latter case, each dimension's index must either be a `slice`
-        object or a sequence of two or more integers.
-
-        Indexing is similar to numpy indexing. The only difference to
-        numpy indexing (given the restrictions on the type of indices
-        allowed) is: TODOHDF
-
-        * When two or more dimension's indices are sequences of integers
-          then these indices work independently along each dimension
-          (similar to the way vector subscripts work in Fortran).
-
         .. versionadded:: (cfdm) HDFVER
 
         """
@@ -214,12 +214,22 @@ def __getitem__(self, indices):
         return array
 
     def _get_attr(self, var, attr):
-        """TODOHDF.
+        """Get a variable attribute.
 
         .. versionadded:: (cfdm) HDFVER
 
         :Parameters:
 
+            var: `h5netcdf.Variable`
+                The variable.
+
+            attr: `str`
+                The attribute name.
+
+        :Returns:
+
+            The attribute value.
+
         """
         return var.attrs[attr]
 
diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py
index db5cec47d..bd8648c50 100644
--- a/cfdm/data/mixin/filearraymixin.py
+++ b/cfdm/data/mixin/filearraymixin.py
@@ -44,18 +44,6 @@ def shape(self):
         """Shape of the array."""
         return self._get_component("shape")
 
-    def _get_attr(self, var, attr):
-        """TODOHDF.
-
-        .. versionadded:: (cfdm) HDFVER
-
-        :Parameters:
-
-        """
-        raise NotImplementedError(
-            f"Must implement {self.__class__.__name__}._get_attr"
-        )  # pragma: no cover
-
     def close(self, dataset):
         """Close the dataset containing the data."""
         raise NotImplementedError(
diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py
index 73d7c149f..e43b64915 100644
--- a/cfdm/data/mixin/netcdffilemixin.py
+++ b/cfdm/data/mixin/netcdffilemixin.py
@@ -2,12 +2,34 @@
 
 
 class NetCDFFileMixin:
-    """Mixin class TODOHDF.
+    """Mixin class for netCDF file arrays.
 
     .. versionadded:: (cfdm) HDFVER
 
     """
 
+    def _get_attr(self, var, attr):
+        """Get a variable attribute.
+
+        .. versionadded:: (cfdm) HDFVER
+
+        :Parameters:
+
+            var:
+                The variable.
+
+            attr: `str`
+                The attribute name.
+
+        :Returns:
+
+            The attribute value.
+
+        """
+        raise NotImplementedError(
+            f"Must implement {self.__class__.__name__}._get_attr"
+        )  # pragma: no cover
+
     def _group(self, dataset, groups):
         """Return the group object containing a variable.
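The `_get_attr` implementations above abstract over the two backends'
attribute APIs. A minimal sketch of the difference they hide (the file
name ``file.nc`` and variable name ``tas`` are hypothetical):

    import netCDF4
    import h5netcdf

    # netCDF4 exposes attributes via dedicated methods
    nc = netCDF4.Dataset("file.nc", "r")
    units = nc.variables["tas"].getncattr("units")
    nc.close()

    # h5netcdf exposes the same attributes as a dict-like mapping
    h5 = h5netcdf.File("file.nc", "r")
    units = h5.variables["tas"].attrs["units"]
    h5.close()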
diff --git a/cfdm/data/netcdfarray.py b/cfdm/data/netcdfarray.py index c0e4b1aa0..6f1b77fc8 100644 --- a/cfdm/data/netcdfarray.py +++ b/cfdm/data/netcdfarray.py @@ -287,12 +287,22 @@ def __str__(self): return f"{self.get_filename(None)}, {self.get_address()}" def _get_attr(self, var, attr): - """TODOHDF. + """Get a variable attribute. .. versionadded:: (cfdm) HDFVER :Parameters: + var: `netCDF.Variable` + The variable + + attr: `str` + The attribute name. + + :Returns: + + The attirbute value. + """ return var.getncattr(attr) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 031e7a3a4..584568535 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -203,7 +203,30 @@ def attrs(self, variable): } def chunksizes(self, variable): - """TODO.""" + """Return the variable chunk sizes. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + variable: + The dataset variable. + + :Returns: + + `None` or sequence of `int` + The chunksizes, or `None` if the variable is not + chunked. + + **Examples** + + >>> f.chunksizes(variable) + [1, 324, 432] + + >>> f.chunksizes(variable) + None + + """ try: # netCDF4 chunking = variable.chunking() @@ -685,7 +708,7 @@ def flatten_variable(self, var): self.resolve_references(new_var, var) def increment_pos(self, pos, dim, copy_slice_shape, var_shape): - """TODOHDF. + """Increment position. Increment position vector in a variable along a dimension by the matching slice length along than dimension. If end of the @@ -1027,7 +1050,7 @@ def search_by_proximity( return None # def __escape_index_error(self, match, group_name): - # """TODOHDF. + # """ # # :param match: regex match # :param group_name: group name diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 4ad4d2725..e6817cbdc 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -955,11 +955,11 @@ def read( .. versionadded:: (cfdm) HDFVER _s3_file_systems: `dict`, optional - TODOHDF + Provide any already-open S3 file systems. .. versionadded:: (cfdm) HDFVER - library: `None` or `str`, optional + library: `None` or `str`, optional See `cfdm.read` for details .. versionadded:: (cfdm) HDFVER From a3aa8d81ca8dee0c9f947c843c3cdb76480786df Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sat, 3 Feb 2024 16:35:00 +0000 Subject: [PATCH 26/88] dev --- cfdm/data/h5netcdfarray.py | 15 ++--- cfdm/data/mixin/filearraymixin.py | 71 +++++++++++++++----- cfdm/data/mixin/netcdffilemixin.py | 23 +------ cfdm/read_write/netcdf/netcdfread.py | 97 ++++++++++++++-------------- cfdm/read_write/read.py | 56 +++++++++------- 5 files changed, 144 insertions(+), 118 deletions(-) diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index 7a1a00145..9f021a503 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -31,7 +31,7 @@ def __init__( units=False, calendar=False, missing_values=None, - s3=None, + storage_options=None, source=None, copy=True, ): @@ -84,12 +84,7 @@ def __init__( The missing value indicators defined by the variable attributes. See `get_missing_values` for details. - s3: `dict` or `None`, optional - The `s3fs.S3FileSystem` options for accessing S3 - files. If there are no options then ``anon=True`` is - assumed, and if there is no ``'endpoint_url'`` key - then one will automatically be derived one for each S3 - filename. + {{storage_options: `dict` or `None`, optional}} .. 
versionadded:: (cfdm) HDFVER @@ -142,9 +137,9 @@ def __init__( missing_values = None try: - s3 = source._get_component("s3", None) + storage_options = source._get_component("storage_options", None) except AttributeError: - s3 = None + storage_options = None if shape is not None: self._set_component("shape", shape, copy=False) @@ -174,7 +169,7 @@ def __init__( self._set_component("mask", mask, copy=False) self._set_component("units", units, copy=False) self._set_component("calendar", calendar, copy=False) - self._set_component("s3", s3, copy=False) + self._set_component("storage_options", storage_options, copy=False) # By default, close the file after data array access self._set_component("close", True, copy=False) diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index bd8648c50..7b6dd9223 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -189,6 +189,56 @@ def get_formats(self): """ return (self.get_format(),) * len(self.get_filenames()) + def get_storage_options(self, filename=None, parsed_filename=None): + """Return `s3fs.S3FileSystem` options for accessing S3 files. + + .. versionadded:: (cfdm) HDFVER + + :Parameters: + + filename: `str`, optional + Used to the ``'endpoint_url'`` key if it has not been + previously defined. Ignored if *parse_filename* has + been set. By default the ``'endpoint_url'`` key, if + required, is set from the file name returned by + `get_filename`. + + parse_filename: `urllib.parse.ParseResult`, optional + Used to the ``'endpoint_url'`` key if it has not been + previously defined. By default the ``'endpoint_url'`` + key, if required, is set from the file name returned + by `get_filename`. + + :Returns: + + `dict` + The `s3fs.S3FileSystem` options. + + """ + out = self._get_component("storage_options", None) + if not out: + out = {} + else: + out = deepcopy(out) + + if "endpoint_url" not in out: + if parsed_filename is None: + if filename is None: + try: + filename = self.get_filename() + except AttributeError: + pass + else: + parsed_filename = urlparse(filename) + else: + parsed_filename = urlparse(filename) + + if parsed_filename is not None and parsed_filename.scheme == "s3": + # Derive endpoint_url from filename + out["endpoint_url"] = f"https://{parsed_filename.netloc}" + + return out + def open(self, func, *args, **kwargs): """Return a dataset file object and address. @@ -216,35 +266,26 @@ def open(self, func, *args, **kwargs): # Loop round the files, returning as soon as we find one that # works. filenames = self.get_filenames() - for i, (filename, address) in enumerate( - zip(filenames, self.get_addresses()) - ): + for filename, address in zip(filenames, self.get_addresses(): url = urlparse(filename) if url.scheme == "file": # Convert a file URI into an absolute path filename = url.path elif url.scheme == "s3": # Create an openable S3 file object - s3 = self.get_s3() - if not s3: - s3["anon"] = True - - if "endpoint_url" not in s3: - # Derive endpoint_url from filename - s3["endpoint_url"] = f"https://{url.netloc}" - - fs = S3FileSystem(**s3) + storage_options = self.get_storage_options(parsed_filename=url) + fs = S3FileSystem(**storage_options) filename = fs.open(url.path[1:], "rb") try: - nc = func(filename, *args, **kwargs) + dataset = func(filename, *args, **kwargs) except FileNotFoundError: continue except RuntimeError as error: raise RuntimeError(f"{error}: {filename}") - # Successfully opend a dataset, so return. 
- return nc, address + # Successfully opened a dataset, so return. + return dataset, address if len(filenames) == 1: raise FileNotFoundError(f"No such file: {filenames[0]}") diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py index e43b64915..0546ded61 100644 --- a/cfdm/data/mixin/netcdffilemixin.py +++ b/cfdm/data/mixin/netcdffilemixin.py @@ -1,3 +1,5 @@ +from copy import deepcopy + from ..numpyarray import NumpyArray @@ -173,27 +175,6 @@ def get_mask(self): """ return self._get_component("mask") - def get_s3(self): - """Return `s3fs.S3FileSystem` options for accessing S3 files. - - .. versionadded:: (cfdm) HDFVER - - :Returns: - - `dict` - The `s3fs.S3FileSystem` options for accessing S3 - files. If there are no options then ``anon=True`` is - assumed, and if there is no ``'endpoint_url'`` key - then one will automatically be derived one for each S3 - filename. - - """ - out = self._get_component("s3", None) - if not out: - return {} - - return out.copy() - def get_missing_values(self): """The missing value indicators from the netCDF variable. diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index e6817cbdc..a41dcb9bf 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -468,9 +468,9 @@ def file_close(self): if "nc_grouped" in g: g["nc_grouped"].close() - # Close file-like object from S3 file systems - for filename in g["s3_file_objects"]: - filename.close() + # Close s3fs.File objects + for f in g["s3fs.File_objects"]: + f.close() def file_open(self, filename, flatten=True, verbose=None): """Open the netCDf file for reading. @@ -506,62 +506,58 @@ def file_open(self, filename, flatten=True, verbose=None): netCDF = False HDF = False - library = g["library"] + netCDF_backend = g["netCDF_backend"] # Deal with an file in an S3 object store u = urlparse(filename) if u.scheme == "s3": # Create an openable S3 file object - s3 = g["s3"] - if s3 is None: - # Default s3 file system options - s3 = {"anon": True} - - if "endpoint_url" not in s3: + storage_options = g["storage_options"] + g["file_system_storage_options"][filename] = storage_options + if "endpoint_url" not in storage_options: # Derive endpoint_url from filename - s3 = s3.copy() - s3["endpoint_url"] = f"https://{u.netloc}" - - g["s3_file_system_options"][filename] = s3 - + storage_options = storage_options.copy() + storage_options["endpoint_url"] = f"https://{u.netloc}" + key = tuple(sorted(s3.items())) - s3_file_systems = g["s3_file_systems"] - fs = s3_file_systems.get(key) + file_systems = g["file_systems"] + fs = file_systems.get(key) if fs is None: # An S3 file system with these options does not exist, # so create one. 
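+                # The cache key is an order-independent, hashable
+                # view of the options, so reads that pass the same
+                # options in any order share a single
+                # s3fs.S3FileSystem instance.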
- fs = S3FileSystem(**s3) - s3_file_systems[key] = fs + fs = S3FileSystem(**storage_options) + file_systems[key] = fs filename = fs.open(u.path[1:], "rb") - g["s3_file_objects"].append(filename) + g["s3fs.File_objects"].append(filename) if is_log_level_detail(logger): logger.debug( - f" s3: s3fs.S3FileSystem options: {s3}\n" + f" S3: s3fs.S3FileSystem options: {storage_options}\n" ) # pragma: no cover - if library is None: + if netCDF_backend is None: try: + # Try opening the file with netCDF4 nc = self._open_netCDF4(filename) netCDF = True except Exception: - # File could not be read by netCDF4 so try to open it - # with h5netcdf + # The file could not be read by netCDF4 so try opening + # it with h5netcdf try: nc = self._open_h5netcdf(filename) HDF = True except Exception as error: raise error - elif library == "netCDF4": + elif netCDF_backend == "netCDF4": try: nc = self._open_netCDF4(filename) netCDF = True except Exception as error: raise error - elif library == "h5netcdf": + elif netCDF_backend == "h5netcdf": try: nc = self._open_h5netcdf(filename) HDF = True @@ -569,7 +565,7 @@ def file_open(self, filename, flatten=True, verbose=None): raise error else: - raise ValueError("Unknown library name: library={library!r}") + raise ValueError("Unknown netCDF backend: netCDF_backend={netCDF_backend!r}") g["original_h5netcdf"] = HDF g["original_netCDF4"] = netCDF @@ -902,9 +898,9 @@ def read( warnings=True, warn_valid=False, domain=False, - s3=None, - _s3_file_systems=None, - library=None, + storage_options=None, + _file_systems=None, + netCDF_backend=None, ): """Reads a netCDF dataset from file or OPenDAP URL. @@ -949,20 +945,21 @@ def read( .. versionadded:: (cfdm) 1.9.0.0 - s3: `bool`, optional + storage_options: `bool`, optional See `cfdm.read` for details .. versionadded:: (cfdm) HDFVER - _s3_file_systems: `dict`, optional - Provide any already-open S3 file systems. + netCDF_backend: `None` or `str`, optional + See `cfdm.read` for details .. versionadded:: (cfdm) HDFVER - library: `None` or `str`, optional - See `cfdm.read` for details + _file_systems: `dict`, optional + Provide any already-open S3 file systems. .. 
versionadded:: (cfdm) HDFVER + :Returns: `list` @@ -1064,21 +1061,21 @@ def read( # "cfa": False, # -------------------------------------------------------- - # Library + # NetCDF backend # -------------------------------------------------------- # - "library": library, + "netCDF_backend": netCDF_backend, # -------------------------------------------------------- # S3 # -------------------------------------------------------- # - "s3": s3, + "storage_options": storage_options, # - "s3_file_systems": {}, + "file_systems": {}, # - "s3_file_system_options": {}, + "file_system_storage_options": {}, # - "s3_file_objects": [], + "s3fs.File_objects": [], } g = self.read_vars @@ -1087,14 +1084,14 @@ def read( for version in ("1.6", "1.7", "1.8", "1.9", "1.10", "1.11"): g["version"][version] = Version(version) - if s3 is None: - # Default s3 file system options - g["s3"] = {"anon": True} + if storage_options is None: + # Default storage options + g["storage_options"] = {"anon": True} - if _s3_file_systems is not None: + if _file_systems is not None: # Update S3 file systems with those passed in as keyword # parameter - g["s3_file_systems"] = _s3_file_systems + g["file_systems"] = _file_systems # ------------------------------------------------------------ # Add custom read vars @@ -2226,7 +2223,7 @@ def _get_variables_from_external_files(self, netcdf_external_variables): external_read_vars = self.read( external_file, _scan_only=True, - _s3_file_systems=read_vars["s3_file_systems"], + _file_systems=read_vars["file_systems"], verbose=verbose, ) @@ -6217,9 +6214,9 @@ def _create_netcdfarray( "missing_values": missing_values, } - s3 = g["s3_file_system_options"].get(filename) - if s3 is not None: - kwargs["s3"] = s3 + storage_options = g["file_system_storage_options"].get(filename) + if storage_options is not None: + kwargs["storage_options"] = storage_options if return_kwargs_only: return kwargs diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 8ca3284fe..094731896 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -17,8 +17,8 @@ def read( warn_valid=False, mask=True, domain=False, - s3=None, - library=None, + storage_options=None, + netCDF_backend=None, _implementation=_implementation, ): """Read field or domain constructs from a dataset. @@ -264,31 +264,43 @@ def read( .. versionadded:: (cfdm) 1.9.0.0 - s3: `dict` or `None`, optional - Keyword parameters to be passed to `s3fs.S3FileSystem` to - control the opening of files in an S3 object store. By - default, or if `None`, then a value of ``{'anon': True}`` - is used. Ignored for file names that don't start with - ``s3:``. + storage_options: `dict` or `None`, optional + Key/value pairs to be passed on to the `s3fs.S3FileSystem` + file-system backend to control the opening of files in an + S3 object store. By default, or if `None`, then a value of + ``{'anon': True}`` is used. Ignored for file names that + don't start with ``s3:``. - If and only if *s3* has no ``'endpoint_url'`` key (which - will always be the case when *s3* is `None`), then one - will be automatically derived from the file name and + If and only if *s3* has no ``'endpoint_url'`` key, then + one will be automatically derived from the file name and included in the keyword parameters. For example, for a - file name of ``'s3://object-store/data/file.nc'``, then an - ``'endpoint_url'`` key with value - ``'https://object-store'`` would be created. To disable - this behaviour, assign `None` to the ``'endpoint_url'`` - key. 
+ file name of ``'s3://store/data/file.nc'``, an + ``'endpoint_url'`` key with value ``'https://store'`` + would be created. To disable this behaviour, assign `None` + to the ``'endpoint_url'`` key. + + *Parameter example:* + ``{'anon': True}`` + + *Parameter example:* + For a file name of ``'s3://store/data/file.nc'``, the + following are equivalent: ``{'anon': True}`` and + ``{'anon': True, 'endpoint_url': 'https://store'}``. + + *Parameter example:* + ``{'key": 'kjhsadf8756', 'secret': '862t3gyebh', + 'client_kwargs': {'endpoint_url': 'http://some-s3.com', + 'config_kwargs': {'s3': {'addressing_style': + 'virtual'}}`` .. versionadded:: (cfdm) HDFVER - library: `None` or `str`, optional - Specify which library to use for opening input files. By + netCDF_backend: `None` or `str`, optional + Specify which library to use for opening netCDF files. By default, or if `None`, then `netCDF4` will used unless it fails to open a given file, in which case `h5netcdf` will - be used. Setting *library* to ``'netCDF4'`` or - ``'h5netcdf'`` will force the use of the `netCDF4` or + be used instead. Setting *netCDF_backend* to ``'netCDF4'`` + or ``'h5netcdf'`` will force the use of the `netCDF4` or `h5netcdf` libraries respectively. .. versionadded:: (cfdm) HDFVER @@ -365,8 +377,8 @@ def read( warn_valid=warn_valid, mask=mask, domain=domain, - s3=s3, - library=library, + storage_options=storage_options, + netCDF_backend=netCDF_backend, extra_read_vars=None, ) except MaskError: From cf0fd2ca2e2d1d008af6532415f80804e3a05267 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sat, 3 Feb 2024 16:35:38 +0000 Subject: [PATCH 27/88] dev --- cfdm/data/h5netcdfarray.py | 4 +++- cfdm/data/mixin/filearraymixin.py | 10 +++++----- cfdm/read_write/netcdf/netcdfread.py | 6 ++++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index 9f021a503..9a720b0d6 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -137,7 +137,9 @@ def __init__( missing_values = None try: - storage_options = source._get_component("storage_options", None) + storage_options = source._get_component( + "storage_options", None + ) except AttributeError: storage_options = None diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index 7b6dd9223..c58a66ad4 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -202,13 +202,13 @@ def get_storage_options(self, filename=None, parsed_filename=None): been set. By default the ``'endpoint_url'`` key, if required, is set from the file name returned by `get_filename`. - + parse_filename: `urllib.parse.ParseResult`, optional Used to the ``'endpoint_url'`` key if it has not been previously defined. By default the ``'endpoint_url'`` key, if required, is set from the file name returned by `get_filename`. - + :Returns: `dict` @@ -232,13 +232,13 @@ def get_storage_options(self, filename=None, parsed_filename=None): parsed_filename = urlparse(filename) else: parsed_filename = urlparse(filename) - + if parsed_filename is not None and parsed_filename.scheme == "s3": # Derive endpoint_url from filename out["endpoint_url"] = f"https://{parsed_filename.netloc}" return out - + def open(self, func, *args, **kwargs): """Return a dataset file object and address. @@ -266,7 +266,7 @@ def open(self, func, *args, **kwargs): # Loop round the files, returning as soon as we find one that # works. 
filenames = self.get_filenames() - for filename, address in zip(filenames, self.get_addresses(): + for filename, address in zip(filenames, self.get_addresses()): url = urlparse(filename) if url.scheme == "file": # Convert a file URI into an absolute path diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index a41dcb9bf..557342a91 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -518,7 +518,7 @@ def file_open(self, filename, flatten=True, verbose=None): # Derive endpoint_url from filename storage_options = storage_options.copy() storage_options["endpoint_url"] = f"https://{u.netloc}" - + key = tuple(sorted(s3.items())) file_systems = g["file_systems"] fs = file_systems.get(key) @@ -565,7 +565,9 @@ def file_open(self, filename, flatten=True, verbose=None): raise error else: - raise ValueError("Unknown netCDF backend: netCDF_backend={netCDF_backend!r}") + raise ValueError( + "Unknown netCDF backend: netCDF_backend={netCDF_backend!r}" + ) g["original_h5netcdf"] = HDF g["original_netCDF4"] = netCDF From 1cff2d0d49fa7e6a488aad5dc610dea68816eca4 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sun, 4 Feb 2024 18:52:28 +0000 Subject: [PATCH 28/88] dev --- cfdm/data/mixin/filearraymixin.py | 1 + cfdm/data/mixin/netcdffilemixin.py | 2 -- cfdm/docstring/docstring.py | 29 ++++++++++++++++++++++++++++ cfdm/read_write/netcdf/netcdfread.py | 2 +- cfdm/read_write/read.py | 9 +++++---- cfdm/test/test_VariableIndexer.py | 4 ++-- cfdm/test/test_groups.py | 16 ++++++++------- cfdm/test/test_read_write.py | 4 ++-- 8 files changed, 49 insertions(+), 18 deletions(-) diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index c58a66ad4..78cef6c83 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -1,3 +1,4 @@ +from copy import deepcopy from urllib.parse import urlparse from s3fs import S3FileSystem diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py index 0546ded61..458c01b1c 100644 --- a/cfdm/data/mixin/netcdffilemixin.py +++ b/cfdm/data/mixin/netcdffilemixin.py @@ -1,5 +1,3 @@ -from copy import deepcopy - from ..numpyarray import NumpyArray diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index 4eccd204d..ae97a1e81 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -402,6 +402,35 @@ "{{init cell_dimension: `int`}}": """cell_dimension: `int` The position of the *data* dimension that indexes the cells, either ``0`` or ``1``.""", + # storage_options + "{{storage_options: `dict` or `None`, optional}}": """storage_options: `dict` or `None`, optional + Key/value pairs to be passed on to the `s3fs.S3FileSystem` + file-system backend to control the opening of files in an + S3 object store. By default, or if `None`, then a value of + ``{'anon': True}`` is used. Ignored for file names that + don't start with ``s3:``. + + If and only if *s3* has no ``'endpoint_url'`` key, then + one will be automatically derived from the file name and + included in the keyword parameters. For example, for a + file name of ``'s3://store/data/file.nc'``, an + ``'endpoint_url'`` key with value ``'https://store'`` + would be created. To disable this behaviour, assign `None` + to the ``'endpoint_url'`` key. 
+ + *Parameter example:* + ``{'anon': True}`` + + *Parameter example:* + For a file name of ``'s3://store/data/file.nc'``, the + following are equivalent: ``{'anon': True}`` and + ``{'anon': True, 'endpoint_url': 'https://store'}``. + + *Parameter example:* + ``{'key": 'jhsadf8756', 'secret': '862t3gyebh', + 'client_kwargs': {'endpoint_url': 'http://some-s3.com', + 'config_kwargs': {'s3': {'addressing_style': + 'virtual'}}``""", # ---------------------------------------------------------------- # Method description susbstitutions (4 levels of indentataion) # ---------------------------------------------------------------- diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 557342a91..785eda1ea 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -519,7 +519,7 @@ def file_open(self, filename, flatten=True, verbose=None): storage_options = storage_options.copy() storage_options["endpoint_url"] = f"https://{u.netloc}" - key = tuple(sorted(s3.items())) + key = tuple(sorted(storage_options.items())) file_systems = g["file_systems"] fs = file_systems.get(key) if fs is None: diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 094731896..1b772d4d5 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -23,13 +23,14 @@ def read( ): """Read field or domain constructs from a dataset. - The dataset may be a netCDF file on disk or on an OPeNDAP server, - or a CDL file on disk (see below). + The following file formats are supported: netCDF and CDL. + + NetCDF files may be on disk, on an OPeNDAP server, or in an S3 + object store. The returned constructs are sorted by the netCDF variable names of their corresponding data or domain variables. - **CDL files** A file is considered to be a CDL representation of a netCDF @@ -288,7 +289,7 @@ def read( ``{'anon': True, 'endpoint_url': 'https://store'}``. 
*Parameter example:* - ``{'key": 'kjhsadf8756', 'secret': '862t3gyebh', + ``{'key": 'kjhsadf8756', 'secret': '862t3gyebh', 'client_kwargs': {'endpoint_url': 'http://some-s3.com', 'config_kwargs': {'s3': {'addressing_style': 'virtual'}}`` diff --git a/cfdm/test/test_VariableIndexer.py b/cfdm/test/test_VariableIndexer.py index ca9da448d..9b110f27b 100644 --- a/cfdm/test/test_VariableIndexer.py +++ b/cfdm/test/test_VariableIndexer.py @@ -69,8 +69,8 @@ def test_mask(self): cfdm.write(fields, tempfile, warn_valid=False) - fh5 = cfdm.read(tempfile, library="h5netcdf") - fnc = cfdm.read(tempfile, library="netCDF4") + fh5 = cfdm.read(tempfile, netCDF_backend="h5netcdf") + fnc = cfdm.read(tempfile, netCDF_backend="netCDF4") for h, n in zip(fh5, fnc): self.assertTrue(h.data.mask.equals(n.data.mask)) diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py index 313c25411..e8369a590 100644 --- a/cfdm/test/test_groups.py +++ b/cfdm/test/test_groups.py @@ -180,13 +180,15 @@ def test_groups(self): ) nc.close() - h = cfdm.read(grouped_file, library="netCDF4", verbose="WARNING") + h = cfdm.read( + grouped_file, netCDF_backend="netCDF4", verbose="WARNING" + ) self.assertEqual(len(h), 1) h = h[0] self.assertTrue(f.equals(h, verbose=2)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, library="h5netcdf") + h5 = cfdm.read(grouped_file, netCDF_backend="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) @@ -319,7 +321,7 @@ def test_groups_geometry(self): self.assertTrue(f.equals(h, verbose=2)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, library="h5netcdf") + h5 = cfdm.read(grouped_file, netCDF_backend="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) @@ -392,7 +394,7 @@ def test_groups_compression(self): self.assertTrue(f.equals(h, verbose=2)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, library="h5netcdf") + h5 = cfdm.read(grouped_file, netCDF_backend="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) @@ -465,7 +467,7 @@ def test_groups_dimension(self): self.assertTrue(f.equals(h, verbose=3)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, library="h5netcdf") + h5 = cfdm.read(grouped_file, netCDF_backend="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) @@ -502,13 +504,13 @@ def test_groups_unlimited_dimension(self): cfdm.write(f, grouped_file5, verbose=1) - h = cfdm.read(grouped_file, library="netCDF4") + h = cfdm.read(grouped_file, netCDF_backend="netCDF4") self.assertEqual(len(h), 1) h = h[0] self.assertTrue(f.equals(h)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, library="h5netcdf") + h5 = cfdm.read(grouped_file, netCDF_backend="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) diff --git a/cfdm/test/test_read_write.py b/cfdm/test/test_read_write.py index 0c6210212..454a14645 100644 --- a/cfdm/test/test_read_write.py +++ b/cfdm/test/test_read_write.py @@ -671,8 +671,8 @@ def test_read_CDL(self): def test_read_write_string(self): """Test the `string` keyword argument to `read` and `write`.""" - f = cfdm.read(self.string_filename, library="netCDF4") - fh = cfdm.read(self.string_filename, library="h5netcdf") + f = cfdm.read(self.string_filename, netCDF_backend="netCDF4") + fh = cfdm.read(self.string_filename, netCDF_backend="h5netcdf") n = int(len(f) / 2) From 740f4cd790493bc4337911ae8655e2faaf604613 Mon 
From 740f4cd790493bc4337911ae8655e2faaf604613 Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Mon, 5 Feb 2024 16:25:05 +0000
Subject: [PATCH 29/88] dev

---
 Changelog.rst | 12 +
 README.md | 3 +-
 cfdm/__init__.py | 72 ++++--
 cfdm/abstract/implementation.py | 2 +-
 cfdm/cfdmimplementation.py | 12 +-
 cfdm/core/__init__.py | 32 +--
 cfdm/core/data/data.py | 2 +-
 cfdm/core/functions.py | 33 +--
 cfdm/data/__init__.py | 2 +-
 cfdm/data/h5netcdfarray.py | 4 +-
 cfdm/data/mixin/filearraymixin.py | 52 ++--
 cfdm/data/netcdf4array.py | 354 +++++++++++++++++++++++++++
 cfdm/data/netcdfarray.py | 51 +---
 cfdm/data/variableindexer.py | 271 +++++++++----------
 cfdm/docstring/docstring.py | 13 +-
 cfdm/functions.py | 71 +++---
 cfdm/read_write/netcdf/netcdfread.py | 2 +-
 cfdm/read_write/read.py | 9 +-
 cfdm/test/test_NetCDFArray.py | 28 +--
 cfdm/test/test_VariableIndexer.py | 74 ++++--
 cfdm/test/test_read_write.py | 14 +-
 docs/source/installation.rst | 18 +-
 docs/source/tutorial.rst | 4 +-
 requirements.txt | 5 +-
 setup.py | 2 +-
 25 files changed, 763 insertions(+), 379 deletions(-)
 create mode 100644 cfdm/data/netcdf4array.py

diff --git a/Changelog.rst b/Changelog.rst
index d5c425fe2..12502a606 100644
--- a/Changelog.rst
+++ b/Changelog.rst
@@ -1,3 +1,15 @@
+Version 1.11.1.0
+----------------
+
+**2024-??-??**
+
+* New dependency: ``h5netcdf>=1.3.0``
+* New dependency: ``h5py>=3.10.0``
+* New dependency: ``s3fs>=2024.2.0``
+* Removed dependency: ``netcdf_flattener``
+
+----
+
 Version 1.11.0.0
 ----------------
 
diff --git a/README.md b/README.md
index c028db330..9bd31953e 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,8 @@ inspecting it:
 
 The ``cfdm`` package can:
 
-* read field and domain constructs from netCDF and CDL datasets,
+* read field and domain constructs from netCDF and CDL datasets with a
+  choice of netCDF backends,
 * create new field and domain constructs in memory,
 * write and append field and domain constructs to netCDF datasets on disk,
 * read, write, and manipulate UGRID mesh topologies,
diff --git a/cfdm/__init__.py b/cfdm/__init__.py
index 5773d63eb..c6e88e662 100644
--- a/cfdm/__init__.py
+++ b/cfdm/__init__.py
@@ -47,16 +47,23 @@
 __cf_version__ = core.__cf_version__
 __version__ = core.__version__
 
-_requires = ("cftime", "netcdf_flattener", "scipy")
+_requires = core._requires + (
+    "cftime",
+    "netCDF4",
+    "scipy",
+    "h5netcdf",
+    "h5py",
+    "s3fs",
+)
 
 _error0 = f"cfdm requires the modules {', '.join(_requires)}. "
 
+# Check the version of cftime
 try:
     import cftime
 except ImportError as error1:
     raise ImportError(_error0 + str(error1))
 
-# Check the version of cftime
 _minimum_vn = "1.6.0"
 if Version(cftime.__version__) < Version(_minimum_vn):
     raise ValueError(
@@ -64,32 +71,69 @@
         f"Got {cftime.__version__} at {cftime.__file__}"
     )
 
+# Check the version of netCDF4
 try:
-    import netcdf_flattener
+    import netCDF4
 except ImportError as error1:
     raise ImportError(_error0 + str(error1))
 
-# Check the version of netcdf_flattener
-_minimum_vn = "1.2.0"
-if Version(netcdf_flattener.__version__) < Version(_minimum_vn):
+_minimum_vn = "1.5.4"
+if Version(netCDF4.__version__) < Version(_minimum_vn):
     raise ValueError(
-        f"Bad netcdf_flattener version: cfdm requires "
-        f"netcdf_flattener>={_minimum_vn}. Got {netcdf_flattener.__version__} "
-        f"at {netcdf_flattener.__file__}"
+        f"Bad netCDF4 version: cfdm requires netCDF4>={_minimum_vn}. 
" + f"Got {netCDF4.__version__} at {netCDF4.__file__}" ) +# Check the version of h5netcdf try: - import scipy + import h5netcdf +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + +_minimum_vn = "1.3.0" +if Version(h5netcdf.__version__) < Version(_minimum_vn): + raise ValueError( + f"Bad h5netcdf version: cfdm requires h5netcdf>={_minimum_vn}. " + f"Got {h5netcdf.__version__} at {h5netcdf.__file__}" + ) + +# Check the version of h5py +try: + import h5py +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + +_minimum_vn = "3.10.0" +if Version(h5py.__version__) < Version(_minimum_vn): + raise ValueError( + f"Bad h5py version: cfdm requires h5py>={_minimum_vn}. " + f"Got {h5py.__version__} at {h5py.__file__}" + ) + +# Check the version of s3fs +try: + import s3fs except ImportError as error1: raise ImportError(_error0 + str(error1)) +_minimum_vn = "2024.2.0" +if Version(s3fs.__version__) < Version(_minimum_vn): + raise ValueError( + f"Bad s3fs version: cfdm requires s3fs>={_minimum_vn}. " + f"Got {s3fs.__version__} at {s3fs.__file__}" + ) + # Check the version of scipy +try: + import scipy +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + _minimum_vn = "1.10.0" if Version(scipy.__version__) < Version(_minimum_vn): raise ValueError( - f"Bad scipy version: cfdm requires " - f"scipy>={_minimum_vn}. Got {scipy.__version__} " - f"at {scipy.__file__}" + f"Bad scipy version: cfdm requires scipy>={_minimum_vn}. " + f"Got {scipy.__version__} at {scipy.__file__}" ) from .constants import masked @@ -140,7 +184,7 @@ Data, GatheredArray, H5netcdfArray, - NetCDFArray, + NetCDF4Array, NumpyArray, PointTopologyArray, RaggedArray, diff --git a/cfdm/abstract/implementation.py b/cfdm/abstract/implementation.py index 99547195b..c16864710 100644 --- a/cfdm/abstract/implementation.py +++ b/cfdm/abstract/implementation.py @@ -54,7 +54,7 @@ def classes(self): 'Index', 'InteriorRing', 'List', - 'NetCDFArray', + 'NetCDF4Array', 'NodeCountProperties', 'PartNodeCountProperties', 'RaggedContiguousArray', diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index ee098744f..b9a6ce5f5 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -31,7 +31,7 @@ Data, GatheredArray, H5netcdfArray, - NetCDFArray, + NetCDF4Array, PointTopologyArray, RaggedContiguousArray, RaggedIndexedArray, @@ -2292,7 +2292,7 @@ def initialise_TiePointIndex(self): cls = self.get_class("TiePointIndex") return cls() - def initialise_NetCDFArray( + def initialise_NetCDF4Array( self, filename=None, address=None, @@ -2339,10 +2339,10 @@ def initialise_NetCDFArray( :Returns: - `NetCDFArray` + `NetCDF4Array` """ - cls = self.get_class("NetCDFArray") + cls = self.get_class("NetCDF4Array") return cls( filename=filename, address=address, @@ -3748,7 +3748,7 @@ def squeeze(self, construct, axes=None): BoundsFromNodesArray=BoundsFromNodesArray, GatheredArray=GatheredArray, H5netcdfArray=H5netcdfArray, - NetCDFArray=NetCDFArray, + NetCDF4Array=NetCDF4Array, PointTopologyArray=PointTopologyArray, RaggedContiguousArray=RaggedContiguousArray, RaggedIndexedArray=RaggedIndexedArray, @@ -3792,7 +3792,7 @@ def implementation(): 'Data': , 'GatheredArray': , 'H5netcdfArray': , - 'NetCDFArray': , + 'NetCDF4Array': , 'PointTopologyArray': , 'RaggedContiguousArray': , 'RaggedIndexedArray': , diff --git a/cfdm/core/__init__.py b/cfdm/core/__init__.py index 2481ab5c7..6ed22a3a0 100644 --- a/cfdm/core/__init__.py +++ b/cfdm/core/__init__.py @@ -21,20 +21,10 @@ import 
platform -_requires = ("numpy", "netCDF4", "packaging") +_requires = ("numpy", "packaging") _error0 = f"cfdm.core requires the modules {', '.join(_requires)}. " -try: - import netCDF4 -except ImportError as error1: - raise ImportError(_error0 + str(error1)) - -try: - import numpy as np -except ImportError as error1: - raise ImportError(_error0 + str(error1)) - # Check the version of python _minimum_vn = "3.8.0" if Version(platform.python_version()) < Version(_minimum_vn): @@ -44,22 +34,24 @@ ) # Check the version of packaging +try: + import packaging +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + _minimum_vn = "20.0" if Version(_packaging_ver) < Version(_minimum_vn): raise ValueError( - f"Bad packaging version: cfdm requires packaging>={_minimum_vn}. " + f"Bad packaging version: cfdm.core requires packaging>={_minimum_vn}. " f"Got {_packaging_ver} at {_packaging_file}" ) -# Check the version of netCDF4 -_minimum_vn = "1.5.4" -if Version(netCDF4.__version__) < Version(_minimum_vn): - raise ValueError( - f"Bad netCDF4 version: cfdm.core requires netCDF4>={_minimum_vn}. " - f"Got {netCDF4.__version__} at {netCDF4.__file__}" - ) - # Check the version of numpy +try: + import numpy as np +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + _minimum_vn = "1.15" if Version(np.__version__) < Version(_minimum_vn): raise ValueError( diff --git a/cfdm/core/data/data.py b/cfdm/core/data/data.py index a08b5d868..3056e3487 100644 --- a/cfdm/core/data/data.py +++ b/cfdm/core/data/data.py @@ -894,7 +894,7 @@ def source(self, default=ValueError()): >>> f = {{package}}.read('file.nc')[0] >>> d = f.data >>> d.source() - <{{repr}}NetCDFArray(149, 182): file=file.nc variable=latitude> + <{{repr}}NetCDF4Array(149, 182): file=file.nc variable=latitude> """ return self._get_component("array", default=default) diff --git a/cfdm/core/functions.py b/cfdm/core/functions.py index 4038aa2ec..4f5c29c46 100644 --- a/cfdm/core/functions.py +++ b/cfdm/core/functions.py @@ -3,8 +3,8 @@ import sys from pickle import dumps, loads -import netCDF4 import numpy as np +import packaging from . 
import __cf_version__, __file__, __version__ @@ -34,32 +34,21 @@ def environment(display=True, paths=True): **Examples** - >>> environment() - - Platform: Linux-5.14.0-1048-oem-x86_64-with-glibc2.31 - HDF5 library: 1.12.1 - netcdf library: 4.8.1 - Python: 3.9.12 /home/user/miniconda3/bin/python - netCDF4: 1.6.0 /home/user/miniconda3/lib/python3.9/site-packages/netCDF4/__init__.py - numpy: 1.22.3 /home/user/miniconda3/lib/python3.9/site-packages/numpy/__init__.py - cfdm.core: 1.10.0.0 /home/user/miniconda3/lib/python3.9/site-packages/cfdm/core/__init__.py - - >>> environment(paths=False) - Platform: Linux-5.14.0-1048-oem-x86_64-with-glibc2.31 - HDF5 library: 1.12.1 - netcdf library: 4.8.1 - Python: 3.9.12 - netCDF4: 1.6.0 - numpy: 1.22.3 - cfdm.core: 1.10.0.0 + >>> cfdm.core.environment(paths=False) + Platform: Linux-5.15.0-92-generic-x86_64-with-glibc2.35 + Python: 3.11.4 + packaging: 23.0 + numpy: 1.25.2 + cfdm.core: 1.11.1.0 """ dependency_version_paths_mapping = { "Platform": (platform.platform(), ""), - "HDF5 library": (netCDF4.__hdf5libversion__, ""), - "netcdf library": (netCDF4.__netcdf4libversion__, ""), "Python": (platform.python_version(), sys.executable), - "netCDF4": (netCDF4.__version__, os.path.abspath(netCDF4.__file__)), + "packaging": ( + packaging.__version__, + os.path.abspath(packaging.__file__), + ), "numpy": (np.__version__, os.path.abspath(np.__file__)), "cfdm.core": (__version__, os.path.abspath(__file__)), } diff --git a/cfdm/data/__init__.py b/cfdm/data/__init__.py index ce1c8ff9f..6cc6b5bb9 100644 --- a/cfdm/data/__init__.py +++ b/cfdm/data/__init__.py @@ -19,7 +19,7 @@ from .cellconnectivityarray import CellConnectivityArray from .gatheredarray import GatheredArray from .h5netcdfarray import H5netcdfArray -from .netcdfarray import NetCDFArray +from .netcdf4array import NetCDF4Array from .numpyarray import NumpyArray from .pointtopologyarray import PointTopologyArray from .raggedcontiguousarray import RaggedContiguousArray diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index 9a720b0d6..8aee0823b 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -84,7 +84,7 @@ def __init__( The missing value indicators defined by the variable attributes. See `get_missing_values` for details. - {{storage_options: `dict` or `None`, optional}} + {{init storage_options: `dict` or `None`, optional}} .. versionadded:: (cfdm) HDFVER @@ -198,7 +198,7 @@ def __getitem__(self, indices): # Get the data, applying masking and scaling as required. array = VariableIndexer( - variable, mask=mask, scale=mask, always_masked=False + variable, mask=mask, scale=True, always_mask=False ) array = array[indices] diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index 78cef6c83..2024b6fd7 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -190,31 +190,53 @@ def get_formats(self): """ return (self.get_format(),) * len(self.get_filenames()) - def get_storage_options(self, filename=None, parsed_filename=None): + def get_storage_options( + self, endpoint_url=True, filename=None, parsed_filename=None + ): """Return `s3fs.S3FileSystem` options for accessing S3 files. .. versionadded:: (cfdm) HDFVER :Parameters: + endpoint_url: `bool`, optional + TODOHDF + filename: `str`, optional - Used to the ``'endpoint_url'`` key if it has not been - previously defined. Ignored if *parse_filename* has - been set. 
By default the ``'endpoint_url'`` key, if - required, is set from the file name returned by - `get_filename`. - - parse_filename: `urllib.parse.ParseResult`, optional - Used to the ``'endpoint_url'`` key if it has not been - previously defined. By default the ``'endpoint_url'`` - key, if required, is set from the file name returned - by `get_filename`. + Used to set the ``'endpoint_url'`` key if it has not + been previously defined. Ignored if *parse_filename* + has been set. + + parsed_filename: `urllib.parse.ParseResult`, optional + Used to set the ``'endpoint_url'`` key if it has not + been previously defined. By default the + ``'endpoint_url'`` key, if required, is set from the + file name returned by `get_filename`. :Returns: `dict` The `s3fs.S3FileSystem` options. + + **Examples** + + >>> f.get_filename() + 's3://store/data/file.nc' + >>> f.get_storage_options(endpoint_url=False) + {'anon': True} + >>> f.get_storage_options() + {'anon': True, 'endpoint_url': 'https://store'} + >>> f.get_storage_options(filename='s3://other-store/data/file.nc') + {'anon': True, 'endpoint_url': 'https://other-store'} + + >>> f.get_storage_options() + {'key": 'kjhsadf8756', + 'secret': '862t3gyebh', + 'endpoint_url': None, + 'client_kwargs': {'endpoint_url': 'http://some-s3.com', + 'config_kwargs': {'s3': {'addressing_style': 'virtual'}}}} + """ out = self._get_component("storage_options", None) if not out: @@ -222,7 +244,7 @@ def get_storage_options(self, filename=None, parsed_filename=None): else: out = deepcopy(out) - if "endpoint_url" not in out: + if endpoint_url and "endpoint_url" not in out: if parsed_filename is None: if filename is None: try: @@ -274,7 +296,9 @@ def open(self, func, *args, **kwargs): filename = url.path elif url.scheme == "s3": # Create an openable S3 file object - storage_options = self.get_storage_options(parsed_filename=url) + storage_options = self.get_storage_options( + endpoint_url=True, parsed_filename=url + ) fs = S3FileSystem(**storage_options) filename = fs.open(url.path[1:], "rb") diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py new file mode 100644 index 000000000..3da924756 --- /dev/null +++ b/cfdm/data/netcdf4array.py @@ -0,0 +1,354 @@ +import netCDF4 + +from . import abstract +from .mixin import FileArrayMixin, NetCDFFileMixin +from .variableindexer import VariableIndexer + +# import numpy as np + + +class NetCDF4Array(NetCDFFileMixin, FileArrayMixin, abstract.Array): + """An underlying array stored in a netCDF file. + + .. versionadded:: (cfdm) 1.7.0 + + """ + + def __init__( + self, + filename=None, + address=None, + dtype=None, + shape=None, + mask=True, + units=False, + calendar=False, + missing_values=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of) `str`, optional + The name of the netCDF file(s) containing the array. + + address: (sequence of) `str` or `int`, optional + The identity of the netCDF variable in each file + defined by *filename*. Either a netCDF variable name + or an integer netCDF variable ID. + + .. versionadded:: (cfdm) 1.10.1.0 + + dtype: `numpy.dtype` + The data type of the array in the netCDF file. May be + `None` if the numpy data-type is not known (which can be + the case for netCDF string types, for example). + + shape: `tuple` + The array dimension sizes in the netCDF file. + + size: `int` + Number of elements in the array in the netCDF file. + + ndim: `int` + The number of array dimensions in the netCDF file. 
+ + mask: `bool` + If True (the default) then mask by convention when + reading data from disk. + + A netCDF array is masked depending on the values of any of + the netCDF variable attributes ``valid_min``, + ``valid_max``, ``valid_range``, ``_FillValue`` and + ``missing_value``. + + .. versionadded:: (cfdm) 1.8.2 + + units: `str` or `None`, optional + The units of the netCDF variable. Set to `None` to + indicate that there are no units. If unset then the + units will be set during the first `__getitem__` call. + + .. versionadded:: (cfdm) 1.10.0.1 + + calendar: `str` or `None`, optional + The calendar of the netCDF variable. By default, or if + set to `None`, then the CF default calendar is + assumed, if applicable. If unset then the calendar + will be set during the first `__getitem__` call. + + .. versionadded:: (cfdm) 1.10.0.1 + + missing_values: `dict`, optional + The missing value indicators defined by the netCDF + variable attributes. See `get_missing_values` for + details. + + .. versionadded:: (cfdm) 1.10.0.3 + + {{init source: optional}} + + .. versionadded:: (cfdm) 1.10.0.0 + + {{init copy: `bool`, optional}} + + .. versionadded:: (cfdm) 1.10.0.0 + + ncvar: Deprecated at version 1.10.1.0 + Use the *address* parameter instead. + + varid: Deprecated at version 1.10.1.0 + Use the *address* parameter instead. + + group: Deprecated at version 1.10.1.0 + Use the *address* parameter instead. + + """ + super().__init__(source=source, copy=copy) + + if source is not None: + try: + shape = source._get_component("shape", None) + except AttributeError: + shape = None + + try: + filename = source._get_component("filename", None) + except AttributeError: + filename = None + + try: + address = source._get_component("address", None) + except AttributeError: + address = None + + try: + dtype = source._get_component("dtype", None) + except AttributeError: + dtype = None + + try: + mask = source._get_component("mask", True) + except AttributeError: + mask = True + + try: + units = source._get_component("units", False) + except AttributeError: + units = False + + try: + calendar = source._get_component("calendar", False) + except AttributeError: + calendar = False + + try: + missing_values = source._get_component("missing_values", None) + except AttributeError: + missing_values = None + + if shape is not None: + self._set_component("shape", shape, copy=False) + + if filename is not None: + if isinstance(filename, str): + filename = (filename,) + else: + filename = tuple(filename) + + self._set_component("filename", filename, copy=False) + + if address is not None: + if isinstance(address, (str, int)): + address = (address,) + else: + address = tuple(address) + + self._set_component("address", address, copy=False) + + if missing_values is not None: + self._set_component( + "missing_values", missing_values.copy(), copy=False + ) + + self._set_component("dtype", dtype, copy=False) + self._set_component("mask", mask, copy=False) + self._set_component("units", units, copy=False) + self._set_component("calendar", calendar, copy=False) + + # By default, close the netCDF file after data array access + self._set_component("close", True, copy=False) + + def __getitem__(self, indices): + """Returns a subspace of the array as a numpy array. + + x.__getitem__(indices) <==> x[indices] + + The indices that define the subspace must be either `Ellipsis` or + a sequence that contains an index for each dimension. 
In the
+        latter case, each dimension's index must either be a `slice`
+        object or a sequence of two or more integers.
+
+        Indexing is similar to numpy indexing. The only difference to
+        numpy indexing (given the restrictions on the type of indices
+        allowed) is:
+
+          * When two or more dimensions' indices are sequences of integers
+            then these indices work independently along each dimension
+            (similar to the way vector subscripts work in Fortran).
+
+        .. versionadded:: (cfdm) 1.7.0
+
+        """
+        netcdf, address = self.open()
+        dataset = netcdf
+
+        mask = self.get_mask()
+        groups, address = self.get_groups(address)
+
+        if groups:
+            # Traverse the group structure, if there is one (CF>=1.8).
+            netcdf = self._group(netcdf, groups)
+
+        if isinstance(address, str):
+            # Get the variable by netCDF name
+            variable = netcdf.variables[address]
+        else:
+            # Get the variable by netCDF integer ID
+            for variable in netcdf.variables.values():
+                if variable._varid == address:
+                    break
+
+        # Get the data, applying masking and scaling as required.
+        array = VariableIndexer(
+            variable, mask=mask, scale=True, always_mask=False
+        )
+        array = array[indices]
+
+        # Set the units, if they haven't been set already.
+        self._set_units(variable)
+
+        self.close(dataset)
+        del netcdf, dataset
+
+        if not self.ndim:
+            # Hmm netCDF4 has a thing for making scalar size 1, 1d
+            array = array.squeeze()
+
+        return array
+
+    def __repr__(self):
+        """Called by the `repr` built-in function.
+
+        x.__repr__() <==> repr(x)
+
+        """
+        return f"<{self.__class__.__name__}{self.shape}: {self}>"
+
+    def __str__(self):
+        """Called by the `str` built-in function.
+
+        x.__str__() <==> str(x)
+
+        """
+        return f"{self.get_filename(None)}, {self.get_address()}"
+
+    def _get_attr(self, var, attr):
+        """Get a variable attribute.
+
+        .. versionadded:: (cfdm) HDFVER
+
+        :Parameters:
+
+            var: `netCDF4.Variable`
+                The variable.
+
+            attr: `str`
+                The attribute name.
+
+        :Returns:
+
+            The attribute value.
+
+        """
+        return var.getncattr(attr)
+
+    def get_groups(self, address):
+        """The netCDF4 group structure of a netCDF variable.
+
+        .. versionadded:: (cfdm) 1.8.6.0
+
+        :Parameters:
+
+            address: `str` or `int`
+                The netCDF variable name, or integer varid, from which
+                to get the groups.
+
+                .. versionadded:: (cfdm) 1.10.1.0
+
+        :Returns:
+
+            (`list`, `str`) or (`list`, `int`)
+                The group structure and the name within the group. If
+                *address* is a varid then an empty list and the varid
+                are returned.
+
+        **Examples**
+
+        >>> n.get_groups('tas')
+        ([], 'tas')
+
+        >>> n.get_groups('/tas')
+        ([], 'tas')
+
+        >>> n.get_groups('/data/model/tas')
+        (['data', 'model'], 'tas')
+
+        >>> n.get_groups(9)
+        ([], 9)
+
+        """
+        try:
+            if "/" not in address:
+                return [], address
+        except TypeError:
+            return [], address
+
+        out = address.split("/")[1:]
+        return out[:-1], out[-1]
+
+    def close(self, dataset):
+        """Close the dataset containing the data.
+
+        .. versionadded:: (cfdm) 1.7.0
+
+        :Parameters:
+
+            dataset: `netCDF4.Dataset`
+                The netCDF dataset to be closed.
+
+        :Returns:
+
+            `None`
+
+        """
+        if self._get_component("close"):
+            dataset.close()
+
+    def open(self):
+        """Return a dataset file object and address.
+
+        When multiple files have been provided an attempt is made to
+        open each one, in the order stored, and a file object is
+        returned from the first file that exists.
+
+        :Returns:
+
+            (`netCDF4.Dataset`, `str`)
+                The open file object, and the address of the data
+                within the file. 
+ + """ + return super().open(netCDF4.Dataset, mode="r") diff --git a/cfdm/data/netcdfarray.py b/cfdm/data/netcdfarray.py index 6f1b77fc8..3da924756 100644 --- a/cfdm/data/netcdfarray.py +++ b/cfdm/data/netcdfarray.py @@ -1,11 +1,13 @@ import netCDF4 -import numpy as np from . import abstract from .mixin import FileArrayMixin, NetCDFFileMixin +from .variableindexer import VariableIndexer +# import numpy as np -class NetCDFArray(NetCDFFileMixin, FileArrayMixin, abstract.Array): + +class NetCDF4Array(NetCDFFileMixin, FileArrayMixin, abstract.Array): """An underlying array stored in a netCDF file. .. versionadded:: (cfdm) 1.7.0 @@ -213,61 +215,28 @@ def __getitem__(self, indices): if isinstance(address, str): # Get the variable by netCDF name variable = netcdf.variables[address] - variable.set_auto_mask(mask) - array = variable[indices] else: # Get the variable by netCDF integer ID for variable in netcdf.variables.values(): if variable._varid == address: - variable.set_auto_mask(mask) - array = variable[indices] break + # Get the data, applying masking and scaling as required. + array = VariableIndexer( + variable, mask=mask, scale=True, always_mask=False + ) + array = array[indices] + # Set the units, if they haven't been set already. self._set_units(variable) self.close(dataset) del netcdf, dataset - string_type = isinstance(array, str) - if string_type: - # -------------------------------------------------------- - # A netCDF string type scalar variable comes out as Python - # str object, so convert it to a numpy array. - # -------------------------------------------------------- - array = np.array(array, dtype=f"U{len(array)}") - if not self.ndim: # Hmm netCDF4 has a thing for making scalar size 1, 1d array = array.squeeze() - kind = array.dtype.kind - if not string_type and kind in "SU": - # -------------------------------------------------------- - # Collapse (by concatenation) the outermost (fastest - # varying) dimension of char array into - # memory. E.g. [['a','b','c']] becomes ['abc'] - # -------------------------------------------------------- - if kind == "U": - array = array.astype("S", copy=False) - - array = netCDF4.chartostring(array) - shape = array.shape - array = np.array([x.rstrip() for x in array.flat], dtype="U") - array = np.reshape(array, shape) - array = np.ma.masked_where(array == "", array) - elif not string_type and kind == "O": - # -------------------------------------------------------- - # A netCDF string type N-d (N>=1) variable comes out as a - # numpy object array, so convert it to numpy string array. - # -------------------------------------------------------- - array = array.astype("U", copy=False) - - # -------------------------------------------------------- - # netCDF4 does not auto-mask VLEN variable, so do it here. - # -------------------------------------------------------- - array = np.ma.where(array == "", np.ma.masked, array) - return array def __repr__(self): diff --git a/cfdm/data/variableindexer.py b/cfdm/data/variableindexer.py index 1a3189cf6..38b5c307b 100644 --- a/cfdm/data/variableindexer.py +++ b/cfdm/data/variableindexer.py @@ -4,25 +4,28 @@ import numpy as np _safecast = netCDF4.utils._safecast -default_fillvals = netCDF4.default_fillvals +_default_fillvals = netCDF4.default_fillvals logger = logging.getLogger(__name__) class VariableIndexer: - """An indexer of netCDF variables that applies masking and scaling. + """A data indexer that applies CF masking and scaling. 
During indexing, masking and scaling is applied according to the
-    CF conventions, either of which may be disabled via initialisation
-    options.
+    CF conventions, either or both of which may be disabled via
+    initialisation options.
 
     String and character variables are converted to unicode arrays,
     the latter with the last dimension concatenated.
 
+    Adapted from `netCDF4`.
+
    .. versionadded:: (cfdm) HDFVER
 
    **Examples**
 
+    >>> import netCDF4
    >>> nc = netCDF4.Dataset('file.nc', 'r')
    >>> x = cfdm.VariableIndexer(nc.variables['x'])
    >>> x.shape
    (12, 64, 128)
    >>> print(x[0, 0:4, 0:3])
    [[236.5, 236.2, 236.0],
    [243.4, 242.4, 241.3],
    [243.1, 241.7, 240.4]]
 
+    >>> import h5netcdf
    >>> h5 = h5netcdf.File('file.nc', 'r')
    >>> x = cfdm.VariableIndexer(h5.variables['x'])
    >>> x.shape
    (12, 64, 128)
    >>> print(x[0, 0:4, 0:3])
    [[236.5, 236.2, 236.0],
    [243.4, 242.4, 241.3],
    [243.1, 241.7, 240.4]]
 
+    >>> import numpy as np
+    >>> n = np.arange(9)
+    >>> x = cfdm.VariableIndexer(n)
+    >>> x.shape
+    (9,)
+    >>> print(x[...])
+    [0 1 2 3 4 5 6 7 8]
+    >>> x = cfdm.VariableIndexer(n, attrs={'_FillValue': 4})
+    >>> print(x[...])
+    [0 1 2 3 -- 5 6 7 8]
 
    """
 
-    def __init__(self, variable, mask=True, scale=True, always_masked=False):
+    def __init__(
+        self, variable, mask=True, scale=True, always_mask=False, attrs=None
+    ):
        """**Initialisation**
 
        :Parameters:
 
-            variable: `netCDF4.Variable` or `h5netcdf.Variable`
-                The variable to be indexed. Any masking and scaling
-                that may be applied by the *variable* itself is
-                disabled, i.e. Any masking and scaling is always
-                applied by the `VariableIndexer` instance.
+            variable:
+                The variable to be indexed, one of `netCDF4.Variable`,
+                `h5netcdf.Variable`, or `numpy.ndarray`. Any masking
+                and scaling that may be applied by the *variable*
+                itself is disabled, i.e. any masking and scaling is
+                always applied by the `VariableIndexer` instance.
 
            mask: `bool`
                If True, the default, then an array returned by
                indexing is automatically converted to a masked
                array when missing values or fill values are
                present.
 
            scale: `bool`
                If True, the default, then the ``scale_factor`` and
                ``add_offset`` are applied to an array returned by
                indexing, and signed integer data is automatically
                converted to unsigned integer data if the
                ``_Unsigned`` attribute is set to "true" or "True".
 
-            always_masked: `bool`
+            always_mask: `bool`
                If False, the default, then an array returned by
                indexing which has no missing values is created as a
                regular numpy array. If True then an array returned
                by indexing is always a masked array, even if there
                are no missing values.
 
+            attrs: `dict`, optional
+                The netCDF attributes of *variable*, overriding any
+                attributes stored by the variable itself. Required
+                when *variable* is a `numpy.ndarray`, which carries
+                no attributes of its own.
+
        """
        self.variable = variable
        self.mask = mask
        self.scale = scale
-        self.always_masked = always_masked
+        self.always_mask = always_mask
+        self.attrs = attrs
 
        self.shape = variable.shape
 
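
The `__getitem__` and `_mask` changes in the following hunks apply the CF
masking conventions by hand, rather than deferring to the backend. A much
simplified, self-contained sketch of the core rule (ignoring safe casting,
unsigned reinterpretation, vector missing values and fill-value
bookkeeping; `cf_mask` is a name invented here):

    import numpy as np

    def cf_mask(data, attrs):
        # Mask values equal to _FillValue or missing_value, or lying
        # outside valid_min/valid_max
        mask = np.zeros(data.shape, dtype=bool)
        for attr in ("_FillValue", "missing_value"):
            if attr in attrs:
                mask |= data == attrs[attr]
        if "valid_min" in attrs:
            mask |= data < attrs["valid_min"]
        if "valid_max" in attrs:
            mask |= data > attrs["valid_max"]
        return np.ma.masked_array(data, mask=mask)

    print(cf_mask(np.arange(9), {"_FillValue": 4, "valid_max": 7}))
    # [0 1 2 3 -- 5 6 7 --]
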
@@ -103,10 +124,10 @@ def __getitem__(self, index):
        netCDF4_mask = False
        try:
            netCDF4_scale = variable.scale
-            netCDF4_mask = variable.mask
        except AttributeError:
            pass
        else:
+            netCDF4_mask = variable.mask
            # Prevent netCDF4 from doing any masking and scaling
            variable.set_auto_maskandscale(False)
 
@@ -139,13 +160,12 @@ def __getitem__(self, index):
            data = data.view(dtype_unsigned_int)
 
        if self.mask:
-            attrs = self._FillValue(variable, attrs)
+            attrs = self._set_FillValue(variable, attrs)
            data = self._mask(
                data,
                dtype,
                attrs,
                scale=scale,
-                always_masked=self.always_masked,
                dtype_unsigned_int=dtype_unsigned_int,
            )
 
@@ -156,6 +176,7 @@ def __getitem__(self, index):
            # Assume that object arrays contain strings
            data = data.astype("U", copy=False)
 
+        # Reset a netCDF4 variable's scale and mask behaviour
        if netCDF4_scale:
            variable.set_auto_scale(True)
 
@@ -171,8 +192,9 @@ def _attrs(self, variable):
 
        :Parameter:
 
-            variable: `netCDF4.Variable` or `h5netcdf.Variable`
-                The variable to be indexed.
+            variable:
+                The variable to be indexed, one of `netCDF4.Variable`,
+                `h5netcdf.Variable`, or `numpy.ndarray`.
 
        :Returns:
 
@@ -180,14 +202,22 @@ def _attrs(self, variable):
            `dict`
                The attributes.
 
        """
+        if self.attrs is not None:
+            return self.attrs.copy()
+
        try:
            # h5netcdf
            return dict(variable.attrs)
        except AttributeError:
-            # netCDF4
-            return {
-                attr: variable.getncattr(attr) for attr in variable.ncattrs()
-            }
+            try:
+                # netCDF4
+                return {
+                    attr: variable.getncattr(attr)
+                    for attr in variable.ncattrs()
+                }
+            except AttributeError:
+                # numpy
+                return {}
 
    def _check_safecast(self, attname, dtype, attrs):
        """Check an attribute's data type.
@@ -231,54 +261,79 @@ def _check_safecast(self, attname, dtype, attrs):
 
        if not is_safe:
            logger.warn(
-                f"WARNING: {attname} not used since it cannot "
-                "be safely cast to variable data type {dtype!r}"
+                f"WARNING: Attribute {attname} not used since it can't "
+                f"be safely cast to variable data type {dtype!r}"
            )  # pragma: no cover
 
        return is_safe, attvalue
 
-    def _FillValue(self, variable, attrs):
-        """Set the variable _FillValue.
+    def _set_FillValue(self, variable, attrs):
+        """Set the ``_FillValue`` from a `h5netcdf.Variable`.
+
+        If the attributes already contain a ``_FillValue`` then
+        nothing is done.
+
+        .. seealso:: `_default_FillValue`
 
        .. versionadded:: (cfdm) HDFVER
 
        :Parameter:
 
-            variable: `netCDF4.Variable` or `h5netcdf.Variable`
-                The variable to be indexed.
+            variable: `h5netcdf.Variable`
+                The variable.
 
            attrs: `dict`
-                The variable attributes. May get updated in-place.
+                The variable attributes. Will get updated in-place if
+                a ``_FillValue`` is found.
 
        :Returns:
 
            `dict`
-                The variable attributes, updated in-place with
-                ``_FillValue`` if present and not previously set..
+                The variable attributes, updated with ``_FillValue``
+                if present and not previously set. 
""" if "_FillValue" not in attrs: try: - fillvalue = getattr(variable._h5ds, "fillvalue", None) + # h5netcdf + _FillValue = getattr(variable._h5ds, "fillvalue", None) except AttributeError: # netCDf4 pass else: - # h5netcdf - if fillvalue is not None: - attrs["_FillValue"] = fillvalue - elif variable.dtype.kind == "O": - attrs["_FillValue"] = default_fillvals["S1"] + if _FillValue is not None: + attrs["_FillValue"] = _FillValue return attrs + def _default_FillValue(self, dtype): + """Return the default ``_FillValue`` for the given data type. + + .. seealso:: `_set_FillValue`, `netCDF4.default_fillvals` + + .. versionadded:: (cfdm) HDFVER + + :Parameter: + + dtype: `numpy.dtype` + The variable's data type + + :Returns: + + The default ``_FillValue``. + + """ + if dtype.kind in "OS": + return _default_fillvals["S1"] + else: + return _default_fillvals[dtype.str[1:]] + def _mask( self, data, dtype, attrs, scale=True, - always_masked=False, dtype_unsigned_int=None, ): """Mask the data. @@ -301,13 +356,10 @@ def _mask( scale: `bool` Whether the data is to be scaled. - always_masked: `bool` - Whether or not return a regular numpy array when there - are no missing values. - dtype_unsigned_int: `dtype` or `None` The data type to which unsigned integer data has been - cast. + cast. Should be `None` for data that are not unsigned + integers. :Returns: @@ -326,13 +378,13 @@ def _mask( if scale and dtype_unsigned_int is not None: mval = mval.view(dtype_unsigned_int) - # create mask from missing values. + # Create mask from missing values. mvalmask = np.zeros(data.shape, np.bool_) if not mval.ndim: # mval a scalar. - mval = (mval,) # make into iterable. + mval = (mval,) # Make into iterable. for m in mval: - # is scalar missing value a NaN? + # Is scalar missing value a NaN? try: mvalisnan = np.isnan(m) except TypeError: @@ -346,20 +398,24 @@ def _mask( if mvalmask.any(): # Set fill_value for masked array to missing_value (or - # 1st element if missing_value is a vector). + # first element if missing_value is a vector). fill_value = mval[0] totalmask += mvalmask - # set mask=True for data == fill value + # Set mask=True for data == fill value safe_fillval, _FillValue = self._check_safecast( "_FillValue", dtype, attrs ) + if not safe_fillval: + _FillValue = self._default_FillValue(dtype) + safe_fillval = True + if safe_fillval: fval = np.array(_FillValue, dtype) if scale and dtype_unsigned_int is not None: fval = fval.view(dtype_unsigned_int) - # is _FillValue a NaN? + # Is _FillValue a NaN? try: fvalisnan = np.isnan(fval) except Exception: @@ -378,56 +434,14 @@ def _mask( fill_value = fval totalmask += mask - else: - # Don't return masked array if variable filling is disabled. - no_fill = 0 - # with nogil: - # ierr = nc_inq_var_fill(self._grpid,self._varid,&no_fill,NULL) - # _ensure_nc_success(ierr) - - # if no_fill is not 1, and not a byte variable, then use - # default fill value. from - # http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c/Fill-Values.html#Fill-Values - # "If you need a fill value for a byte variable, it is - # recommended that you explicitly define an appropriate - # _FillValue attribute, as generic utilities such as - # ncdump will not assume a default fill value for byte - # variables." 
Explained here too:
-            # http://www.unidata.ucar.edu/software/netcdf/docs/known_problems.html#ncdump_ubyte_fill
-            # "There should be no default fill values when reading any
-            # byte type, signed or unsigned, because the byte ranges
-            # are too small to assume one of the values should appear
-            # as a missing value unless a _FillValue attribute is set
-            # explicitly." (do this only for non-vlens, since vlens
-            # don't have a default _FillValue)
-            if no_fill != 1 or dtype.str[1:] not in ("u1", "i1"):
-                if dtype.kind == "S":
-                    default_fillval = default_fillvals["S1"]
-                else:
-                    default_fillval = default_fillvals[dtype.str[1:]]
-
-                fillval = np.array(default_fillval, dtype)
-                has_fillval = data == fillval
-                # if data is an array scalar, has_fillval will be a
-                # boolean. in that case convert to an array.
-                # if type(has_fillval) == bool:
-                if isinstance(has_fillval, bool):
-                    has_fillval = np.asarray(has_fillval)
-
-                if has_fillval.any():
-                    if fill_value is None:
-                        fill_value = fillval
-
-                    mask = data == fillval
-                    totalmask += mask
 
        # Set mask=True for data outside [valid_min, valid_max]
-        validmin = None
-        validmax = None
+        #
        # If valid_range exists use that, otherwise look for
        # valid_min, valid_max. No special treatment of byte data as
-        # described at
-        # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html).
+        # described in the netCDF documentation.
+        validmin = None
+        validmax = None
        safe_validrange, valid_range = self._check_safecast(
            "valid_range", dtype, attrs
        )
@@ -454,30 +468,9 @@
        if validmin is not None and dtype_unsigned_int is not None:
            validmin = validmin.view(dtype_unsigned_int)
 
        if validmax is not None and dtype_unsigned_int is not None:
            validmax = validmax.view(dtype_unsigned_int)
 
-        # http://www.unidata.ucar.edu/software/netcdf/docs/attribute_conventions.html).
-        # "If the data type is byte and _FillValue is not explicitly
-        # defined, then the valid range should include all possible
-        # values. Otherwise, the valid range should exclude the
-        # _FillValue (whether defined explicitly or by default) as
-        # follows. If the _FillValue is positive then it defines a
-        # valid maximum, otherwise it defines a valid minimum."
-        if safe_fillval:
-            fval = np.array(_FillValue, dtype)
-        else:
-            k = dtype.str[1:]
-            if k in ("u1", "i1"):
-                fval = None
-            else:
-                if dtype.kind == "S":
-                    default_fillval = default_fillvals["S1"]
-                else:
-                    default_fillval = default_fillvals[k]
-
-                fval = np.array(default_fillval, dtype)
-
        if dtype.kind != "S":
            # Don't set validmin/validmax mask for character data
-
+            #
            # Setting valid_min/valid_max to the _FillValue is too
            # surprising for many users (despite the netcdf docs
            # attribute best practices suggesting clients should do
            # this).
            if validmin is not None:
                totalmask += data < validmin
            if validmax is not None:
                totalmask += data > validmax
 
-        if fill_value is None and fval is not None:
-            fill_value = fval
-
-        # If all else fails, use default _FillValue as fill_value for
-        # masked array.
-        if fill_value is None:
-            if dtype.kind == "S":
-                fill_value = default_fillvals["S1"]
-            else:
-                fill_value = default_fillvals[dtype.str[1:]]
-
-        # Create masked array with computed mask
-        masked_values = totalmask.any()
-        if masked_values:
-            data = np.ma.masked_array(
-                data, mask=totalmask, fill_value=fill_value
-            )
-        else:
-            # Always return masked array, if no values masked.
+        # Mask the data
+        if totalmask.any():
+            data = np.ma.masked_array(data, mask=totalmask, fill_value=fval)
+            if not data.ndim:
+                # Return a scalar numpy masked constant not a 0-d
+                # masked array, so that data == np.ma.masked.
+                data = data[()]
+        elif self.always_mask:
            data = np.ma.masked_array(data)
 
-        # Scalar array with mask=True should be converted to
-        # np.ma.MaskedConstant to be consistent with slicing
-        # behavior of masked arrays.
-        if data.shape == () and data.mask.all():
-            # Return a scalar numpy masked constant not a 0-d masked
-            # array, so that data == np.ma.masked.
-            data = data[()]
-
-        elif not always_masked and not masked_values:
-            # Return a regular numpy array if requested and there are
-            # no missing values
-            data = np.array(data, copy=False)
-
        return data
 
    def _scale(self, data, attrs):
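
The `_scale` method patched in the next hunk reverses the CF packing
convention, in which a file stores
``packed = (unpacked - add_offset) / scale_factor``. A simplified,
self-contained sketch of the unpacking (`cf_unpack` is a name invented
here):

    import numpy as np

    def cf_unpack(data, attrs):
        # Reverse the packing: unpacked = packed*scale_factor + add_offset
        scale_factor = attrs.get("scale_factor")
        add_offset = attrs.get("add_offset")
        if scale_factor is not None:
            data = data * scale_factor
        if add_offset is not None:
            data = data + add_offset
        return data

    packed = np.array([0, 1, 2], dtype="int16")
    print(cf_unpack(packed, {"scale_factor": 0.01, "add_offset": 273.15}))
    # [273.15 273.16 273.17]
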
@@ -556,9 +525,9 @@ def _scale(self, data, attrs):
            float(add_offset)
        except ValueError:
            logging.warn(
-                "invalid scale_factor or add_offset attribute, "
-                "no unpacking done..."
-            )
+                "Invalid scale_factor or add_offset attribute, "
+                "no unpacking done."
+            )  # pragma: no cover
            return data
 
        if scale_factor is not None and add_offset is not None:
diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py
index ae97a1e81..140487a1e 100644
--- a/cfdm/docstring/docstring.py
+++ b/cfdm/docstring/docstring.py
@@ -403,7 +403,7 @@
            The position of the *data* dimension that indexes the cells,
            either ``0`` or ``1``.""",
    # storage_options
-    "{{storage_options: `dict` or `None`, optional}}": """storage_options: `dict` or `None`, optional
+    "{{init storage_options: `dict` or `None`, optional}}": """storage_options: `dict` or `None`, optional
        Key/value pairs to be passed on to the `s3fs.S3FileSystem`
        file-system backend to control the opening of files in an S3
        object store. By default, or if `None`, then a value of
@@ -418,19 +418,16 @@
        would be created. To disable this behaviour, assign `None` to
        the ``'endpoint_url'`` key.
 
-        *Parameter example:*
-          ``{'anon': True}``
-
        *Parameter example:*
          For a file name of ``'s3://store/data/file.nc'``, the
          following are equivalent: ``{'anon': True}`` and
          ``{'anon': True, 'endpoint_url': 'https://store'}``.
 
        *Parameter example:*
-          ``{'key': 'jhsadf8756', 'secret': '862t3gyebh',
-          'client_kwargs': {'endpoint_url': 'http://some-s3.com',
-          'config_kwargs': {'s3': {'addressing_style':
-          'virtual'}}``""",
+          ``{'key': 'kjhsadf8756', 'secret': '862t3gyebh',
+          'endpoint_url': None, 'client_kwargs': {'endpoint_url':
+          'http://some-s3.com', 'config_kwargs': {'s3':
+          {'addressing_style': 'virtual'}}}}``""",
    # ----------------------------------------------------------------
    # Method description substitutions (4 levels of indentation)
    # ----------------------------------------------------------------
diff --git a/cfdm/functions.py b/cfdm/functions.py
index 874c12867..380671232 100644
--- a/cfdm/functions.py
+++ b/cfdm/functions.py
@@ -5,8 +5,11 @@
 from urllib.parse import urlparse
 
 import cftime
-import netcdf_flattener
+import h5netcdf
+import h5py
+import netCDF4
 import numpy as np
+import s3fs
 import scipy
 
 from . 
import __cf_version__, __file__, __version__, core @@ -317,46 +320,50 @@ def environment(display=True, paths=True): **Examples** - >>> cfdm.environment() - Platform: Linux-5.14.0-1048-oem-x86_64-with-glibc2.31 - HDF5 library: 1.12.1 - netcdf library: 4.8.1 - Python: 3.9.12 /home/user/miniconda3/bin/python - netCDF4: 1.6.0 /home/user/miniconda3/lib/python3.9/site-packages/netCDF4/__init__.py - numpy: 1.22.3 /home/user/miniconda3/lib/python3.9/site-packages/numpy/__init__.py - cfdm.core: 1.11.0.0 /home/user/miniconda3/lib/python3.9/site-packages/cfdm/core/__init__.py - scipy: 1.11.3 /home/user/miniconda3/lib/python3.11/site-packages/scipy/__init__.py - cftime: 1.6.1 /home/user/miniconda3/lib/python3.9/site-packages/cftime/__init__.py - netcdf_flattener: 1.2.0 /home/user/miniconda3/lib/python3.9/site-packages/netcdf_flattener/__init__.py - cfdm: 1.11.0.0 /home/user/miniconda3/lib/python3.9/site-packages/cfdm/__init__.py - >>> cfdm.environment(paths=False) - HDF5 library: 1.12.1 - netcdf library: 4.8.1 - Python: 3.9.12 - netCDF4: 1.6.0 - numpy: 1.22.3 - cfdm.core: 1.11.0.0 + Platform: Linux-5.15.0-92-generic-x86_64-with-glibc2.35 + Python: 3.11.4 + packaging: 23.0 + numpy: 1.25.2 + cfdm.core: 1.11.1.0 + HDF5 library: 1.14.2 + netcdf library: 4.9.2 + netCDF4: 1.6.4 + h5netcdf: 1.3.0 + h5py: 3.10.0 + s3fs: 2023.12.2 scipy: 1.11.3 - cftime: 1.6.1 - netcdf_flattener: 1.2.0 - cfdm: 1.11.0.0 + cftime: 1.6.2 + cfdm: 1.11.1.0 + + >>> cfdm.environment() + Platform: Linux-5.15.0-92-generic-x86_64-with-glibc2.35 + Python: 3.11.4 /home/miniconda3/bin/python + packaging: 23.0 /home/miniconda3/lib/python3.11/site-packages/packaging/__init__.py + numpy: 1.25.2 /home/miniconda3/lib/python3.11/site-packages/numpy/__init__.py + cfdm.core: 1.11.1.0 /home/cfdm/cfdm/core/__init__.py + HDF5 library: 1.14.2 + netcdf library: 4.9.2 + netCDF4: 1.6.4 /home/miniconda3/lib/python3.11/site-packages/netCDF4/__init__.py + h5netcdf: 1.3.0 /home/miniconda3/lib/python3.11/site-packages/h5netcdf/__init__.py + h5py: 3.10.0 /home/miniconda3/lib/python3.11/site-packages/h5py/__init__.py + s3fs: 2023.12.2 /home/miniconda3/lib/python3.11/site-packages/s3fs/__init__.py + scipy: 1.11.3 /home/miniconda3/lib/python3.11/site-packages/scipy/__init__.py + cftime: 1.6.2 /home/miniconda3/lib/python3.11/site-packages/cftime/__init__.py + cfdm: 1.11.1.0 /home/miniconda3/lib/python3.11/site-packages/cfdm/__init__.py """ out = core.environment(display=False, paths=paths) # get all core env - try: - netcdf_flattener_version = netcdf_flattener.__version__ - except AttributeError: - netcdf_flattener_version = "unknown version" - dependency_version_paths_mapping = { + "HDF5 library": (netCDF4.__hdf5libversion__, ""), + "netcdf library": (netCDF4.__netcdf4libversion__, ""), + "netCDF4": (netCDF4.__version__, os.path.abspath(netCDF4.__file__)), + "h5netcdf": (h5netcdf.__version__, os.path.abspath(h5netcdf.__file__)), + "h5py": (h5py.__version__, os.path.abspath(h5py.__file__)), + "s3fs": (s3fs.__version__, os.path.abspath(s3fs.__file__)), "scipy": (scipy.__version__, os.path.abspath(scipy.__file__)), "cftime": (cftime.__version__, os.path.abspath(cftime.__file__)), - "netcdf_flattener": ( - netcdf_flattener_version, - os.path.abspath(netcdf_flattener.__file__), - ), "cfdm": (__version__, os.path.abspath(__file__)), } string = "{0}: {1!s}" diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 785eda1ea..e5299e90d 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ 
-6224,7 +6224,7 @@ def _create_netcdfarray( return kwargs if g["original_netCDF4"]: - array = self.implementation.initialise_NetCDFArray(**kwargs) + array = self.implementation.initialise_NetCDF4Array(**kwargs) else: # h5netcdf array = self.implementation.initialise_H5netcdfArray(**kwargs) diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 1b772d4d5..5d0ebe21c 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -280,9 +280,6 @@ def read( would be created. To disable this behaviour, assign `None` to the ``'endpoint_url'`` key. - *Parameter example:* - ``{'anon': True}`` - *Parameter example:* For a file name of ``'s3://store/data/file.nc'``, the following are equivalent: ``{'anon': True}`` and @@ -290,9 +287,9 @@ def read( *Parameter example:* ``{'key": 'kjhsadf8756', 'secret': '862t3gyebh', - 'client_kwargs': {'endpoint_url': 'http://some-s3.com', - 'config_kwargs': {'s3': {'addressing_style': - 'virtual'}}`` + 'endpoint_url': None, 'client_kwargs': {'endpoint_url': + 'http://some-s3.com', 'config_kwargs': {'s3': + {'addressing_style': 'virtual'}}}}`` .. versionadded:: (cfdm) HDFVER diff --git a/cfdm/test/test_NetCDFArray.py b/cfdm/test/test_NetCDFArray.py index 5ff1b69b3..80e46aa13 100644 --- a/cfdm/test/test_NetCDFArray.py +++ b/cfdm/test/test_NetCDFArray.py @@ -45,33 +45,33 @@ def setUp(self): # < ... test code ... > # cfdm.log_level('DISABLE') - def test_NetCDFArray_get_addresses(self): - """Test `NetCDFArray.get_addresses`""" - a = cfdm.NetCDFArray(address="tas") + def test_NetCDF4Array_get_addresses(self): + """Test `NetCDF4Array.get_addresses`""" + a = cfdm.NetCDF4Array(address="tas") self.assertEqual(a.get_addresses(), ("tas",)) - a = cfdm.NetCDFArray(address=("tas1", "tas1")) + a = cfdm.NetCDF4Array(address=("tas1", "tas1")) self.assertEqual(a.get_addresses(), ("tas1", "tas1")) - a = cfdm.NetCDFArray() + a = cfdm.NetCDF4Array() self.assertEqual(a.get_addresses(), ()) - def test_NetCDFArray_get_filenames(self): - """Test `NetCDFArray.get_filenames`""" - a = cfdm.NetCDFArray("/data1/file1") + def test_NetCDF4Array_get_filenames(self): + """Test `NetCDF4Array.get_filenames`""" + a = cfdm.NetCDF4Array("/data1/file1") self.assertEqual(a.get_filenames(), ("/data1/file1",)) - a = cfdm.NetCDFArray(("/data1/file1",)) + a = cfdm.NetCDF4Array(("/data1/file1",)) self.assertEqual(a.get_filenames(), ("/data1/file1",)) - a = cfdm.NetCDFArray(("/data1/file1", "/data2/file2")) + a = cfdm.NetCDF4Array(("/data1/file1", "/data2/file2")) self.assertEqual(a.get_filenames(), ("/data1/file1", "/data2/file2")) - a = cfdm.NetCDFArray() + a = cfdm.NetCDF4Array() self.assertEqual(a.get_filenames(), ()) - def test_NetCDFArray_get_missing_values(self): - """Test NetCDFArray.get_missing_values.""" + def test_NetCDF4Array_get_missing_values(self): + """Test NetCDF4Array.get_missing_values.""" f = cfdm.example_field(0) f.set_property("missing_value", -999) @@ -92,7 +92,7 @@ def test_NetCDFArray_get_missing_values(self): c = g.coordinate("latitude") self.assertEqual(c.data.source().get_missing_values(), {}) - a = cfdm.NetCDFArray("file.nc", "ncvar") + a = cfdm.NetCDF4Array("file.nc", "ncvar") self.assertIsNone(a.get_missing_values()) diff --git a/cfdm/test/test_VariableIndexer.py b/cfdm/test/test_VariableIndexer.py index 9b110f27b..53b5a9121 100644 --- a/cfdm/test/test_VariableIndexer.py +++ b/cfdm/test/test_VariableIndexer.py @@ -17,7 +17,7 @@ tempfile.mkstemp("_test_VariableIndxer.nc", dir=os.getcwd())[1] for i in range(n_tmpfiles) ] -(tempfile,) = tmpfiles +(tmpfile,) = 
tmpfiles def _remove_tmpfiles(): @@ -31,12 +31,20 @@ def _remove_tmpfiles(): atexit.register(_remove_tmpfiles) +netCDF_backends = ("netCDF4", "h5netcdf") + class VariableIndexerTest(unittest.TestCase): """Test the masking and scaling of netCDF data.""" + def test_shape(self): + """Test VariableIndexer shape.""" + n = np.ma.arange(9) + x = cfdm.VariableIndexer(n) + self.assertEqual(x.shape, n.shape) + def test_mask(self): - """Test CF masking.""" + """Test VariableIndexer for CF masking.""" f0 = cfdm.example_field(0) f0.del_property("missing_value", None) f0.del_property("_FillValue", None) @@ -67,15 +75,23 @@ def test_mask(self): f.set_property("valid_range", [valid_min, valid_max]) fields.append(f) - cfdm.write(fields, tempfile, warn_valid=False) - - fh5 = cfdm.read(tempfile, netCDF_backend="h5netcdf") - fnc = cfdm.read(tempfile, netCDF_backend="netCDF4") - for h, n in zip(fh5, fnc): - self.assertTrue(h.data.mask.equals(n.data.mask)) + cfdm.write(fields, tmpfile, warn_valid=False) + + # Check against netCDF4 with set_auto_maskandscale(True) + nc = netCDF4.Dataset(tmpfile, "r") + nc.set_auto_maskandscale(True) + nc.set_always_mask(True) + for backend in netCDF_backends: + f = cfdm.read(tmpfile, netCDF_backend=backend) + for g in f: + ncvar = g.nc_get_variable() + n = nc.variables[ncvar] + na = n[...] + self.assertTrue((g.array == na).all()) + self.assertTrue((g.data.mask.array == na.mask).all()) def test_scale(self): - """Test CF scaling.""" + """Test VariableIndexer for CF scaling.""" f = cfdm.example_field(0) array = np.ma.arange(40, dtype="int32").reshape(f.shape) @@ -89,21 +105,33 @@ def test_scale(self): f.set_property("add_offset", add_offset) f.set_property("missing_value", 999) - cfdm.write(f, tempfile) - x = cfdm.read(tempfile)[0] - - nc = netCDF4.Dataset(tempfile, "r") - q = nc.variables["q"] - q.set_auto_maskandscale(False) - - raw = (array - add_offset) / scale_factor - raw[1, :] = 999 - raw = raw.astype(array.dtype) - self.assertEqual(q.dtype, raw.dtype) - self.assertTrue((q[...] == raw).all()) - nc.close() + cfdm.write(f, tmpfile) + + # Check against netCDF4 with set_auto_maskandscale(True) + nc = netCDF4.Dataset(tmpfile, "r") + nc.set_auto_maskandscale(True) + nc.set_always_mask(True) + for backend in netCDF_backends: + f = cfdm.read(tmpfile, netCDF_backend=backend) + for g in f: + ncvar = g.nc_get_variable() + n = nc.variables[ncvar] + na = n[...] + self.assertTrue((g.array == na).all()) + self.assertTrue((g.data.mask.array == na.mask).all()) + + def test_numpy(self): + """Test VariableIndexer for numpy.""" + array = np.ma.arange(9) + x = cfdm.VariableIndexer(array) + x = x[...] + self.assertTrue((x == array).all()) - x = x.array + x = cfdm.VariableIndexer( + array.copy(), attrs={"_FillValue": 4, "missing_value": (0, 8)} + ) + x = x[...] 
+
+        array[[0, 4, 8]] = np.ma.masked
        self.assertTrue((x.mask == array.mask).all())
        self.assertTrue((x == array).all())
 
diff --git a/cfdm/test/test_read_write.py b/cfdm/test/test_read_write.py
index 454a14645..076506c15 100644
--- a/cfdm/test/test_read_write.py
+++ b/cfdm/test/test_read_write.py
@@ -671,22 +671,18 @@ def test_read_CDL(self):
 
    def test_read_write_string(self):
        """Test the `string` keyword argument to `read` and `write`."""
-        f = cfdm.read(self.string_filename, netCDF_backend="netCDF4")
+        fn = cfdm.read(self.string_filename, netCDF_backend="netCDF4")
        fh = cfdm.read(self.string_filename, netCDF_backend="h5netcdf")
 
-        n = int(len(f) / 2)
+        n = int(len(fn) / 2)
 
        for i in range(0, n):
            j = i + n
-            self.assertTrue(
-                f[i].data.equals(f[j].data, verbose=3), f"{f[i]!r} {f[j]!r}"
-            )
-            self.assertTrue(
-                f[j].data.equals(f[i].data, verbose=3), f"{f[j]!r} {f[i]!r}"
-            )
+            self.assertTrue(fn[i].data.equals(fn[j].data, verbose=3))
+            self.assertTrue(fn[j].data.equals(fn[i].data, verbose=3))
 
        # Check that netCDF4 and h5netcdf give the same results
-        for i, j in zip(f, fh):
+        for i, j in zip(fn, fh):
            self.assertTrue(i.data.equals(j.data))
 
        # Note: Don't loop round all netCDF formats for better
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index ad36e5bee..0ea81b745 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -152,20 +152,24 @@ Tests are run from within the ``cfdm/test`` directory:
 
 The cfdm package requires:
 
-* `Python `_, version 3.8 or newer.
+* `Python `_, version 3.8 or newer.
 
-* `numpy `_, version 1.15 or newer.
+* `numpy `_, version 1.15 or newer.
 
-* `netCDF4 `_, version 1.5.4 or
+* `netCDF4 `_, version 1.5.4 or
  newer.
 
-* `cftime `_, version 1.6.0 or
+* `cftime `_, version 1.6.0 or
  newer.
 
-* `netcdf_flattener `_,
-  version 1.2.0 or newer.
+* `h5netcdf `_, version 1.3.0 or
+  newer.
+
+* `h5py `_, version 3.10.0 or newer.
+
+* `s3fs `_, version 2024.2.0 or newer.
 
-* `packaging `_, version 20.0 or
+* `packaging `_, version 20.0 or
  newer.
 
 * `scipy `_, version 1.10.0 or newer.
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
index 0642d7fec..f8487d613 100644
--- a/docs/source/tutorial.rst
+++ b/docs/source/tutorial.rst
@@ -184,7 +184,9 @@ The `cfdm.read` function has optional parameters to
  attributes are present (see :ref:`data masking `); and
 
 * display information and issue warnings about the mapping of the
-  netCDF file contents to CF data model constructs.
+  netCDF file contents to CF data model constructs;
+
+* choose either `netCDF4` or `h5netcdf` backends for accessing netCDF files.
 
.. 
_CF-compliance: diff --git a/requirements.txt b/requirements.txt index c50f5f8bc..7cfb76173 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,9 @@ netCDF4>=1.5.4 cftime>=1.6.0 numpy>=1.15 -netcdf-flattener>=1.2.0 packaging>=20.0 scipy>=1.10.0 -h5py>=3.0.0 h5netcdf>=1.3.0 -s3fs>=2023.12.2 +h5py>=3.10.0 +s3fs>=2024.2.0 diff --git a/setup.py b/setup.py index 9a2172da9..937fdd1e7 100755 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ def _get_version(): The **cfdm** package can -* read field and domain constructs from netCDF and CDL datasets, +* read field and domain constructs from netCDF and CDL datasets with a choice of netCDF backends, * create new field and domain constructs in memory, * write and append field and domain constructs to netCDF datasets on disk, * read, write, and manipulate UGRID mesh topologies, From c2321bef47e6a6fde61a8b296aa1798e255c8070 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 6 Feb 2024 18:21:41 +0000 Subject: [PATCH 30/88] dev --- cfdm/__init__.py | 2 +- cfdm/cfdmimplementation.py | 85 ++------- cfdm/data/__init__.py | 2 +- cfdm/data/h5netcdfarray.py | 36 ++-- cfdm/data/mixin/filearraymixin.py | 36 ++-- cfdm/data/mixin/netcdffilemixin.py | 13 ++ cfdm/data/netcdf4array.py | 38 ++-- .../{variableindexer.py => netcdfindexer.py} | 169 +++++++++--------- cfdm/docstring/docstring.py | 49 +++-- cfdm/read_write/netcdf/netcdfread.py | 13 +- cfdm/read_write/read.py | 58 +++--- ...st_NetCDFArray.py => test_NetCDF4Array.py} | 53 +++++- ...riableIndexer.py => test_NetCDFIndexer.py} | 30 ++-- 13 files changed, 324 insertions(+), 260 deletions(-) rename cfdm/data/{variableindexer.py => netcdfindexer.py} (84%) rename cfdm/test/{test_NetCDFArray.py => test_NetCDF4Array.py} (63%) rename cfdm/test/{test_VariableIndexer.py => test_NetCDFIndexer.py} (85%) diff --git a/cfdm/__init__.py b/cfdm/__init__.py index c6e88e662..23eb0e53e 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -185,6 +185,7 @@ GatheredArray, H5netcdfArray, NetCDF4Array, + NetCDFIndexer, NumpyArray, PointTopologyArray, RaggedArray, @@ -193,7 +194,6 @@ RaggedIndexedContiguousArray, SparseArray, SubsampledArray, - VariableIndexer, ) from .data import ( diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index b9a6ce5f5..7eda9fcd6 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -2292,50 +2292,15 @@ def initialise_TiePointIndex(self): cls = self.get_class("TiePointIndex") return cls() - def initialise_NetCDF4Array( - self, - filename=None, - address=None, - dtype=None, - shape=None, - mask=True, - units=False, - calendar=None, - missing_values=None, - ): - """Return a netCDF array instance. + def initialise_NetCDF4Array(self, **kwargs): + """Return a `NetCDF4Array` instance. :Parameters: - filename: `str` - - address: `str` - - dytpe: `numpy.dtype` - - shape: sequence of `int`, optional - - mask: `bool`, optional - - units: `str` or `None` or False, optional - The units of the netCDF variable. Set to `None` to - indicate that there are no units. If False (the - default) then the units are considered unset. - - .. versionadded:: (cfdm) 1.10.0.2 - - calendar: `str` or `None`, optional - The calendar of the netCDF variable. By default, or if - set to `None`, then the CF default calendar is - assumed, if applicable. - - .. versionadded:: (cfdm) 1.10.0.2 - - missing_values: `dict`, optional - The missing value indicators defined by the netCDF - variable attributes. + kwargs: optional + Initialisation parameters to pass to the new instance. - .. 
versionadded:: (cfdm) 1.10.0.3 + .. versionadded:: (cfdm) HDFVER :Returns: @@ -2343,47 +2308,17 @@ def initialise_NetCDF4Array( """ cls = self.get_class("NetCDF4Array") - return cls( - filename=filename, - address=address, - dtype=dtype, - shape=shape, - mask=mask, - units=units, - calendar=calendar, - missing_values=missing_values, - ) + return cls(**kwargs) def initialise_H5netcdfArray(self, **kwargs): """Return a `H5netcdfArray` instance. - :Parameters: - - filename: `str` - - address: `str` - - dytpe: `numpy.dtype` - - shape: sequence of `int`, optional - - mask: `bool`, optional + .. versionadded:: (cfdm) HDFVER - units: `str` or `None` or False, optional - The units of the variable. Set to `None` to indicate - that there are no units. If False (the default) then - the units are considered unset. - - calendar: `str` or `None`, optional - The calendar of the variable. By default, or if set to - `None`, then the CF default calendar is assumed, if - applicable. - - missing_values: `dict`, optional - The missing value indicators defined by the variable - attributes. + :Parameters: - s3 + kwargs: optional + Initialisation parameters to pass to the new instance. :Returns: diff --git a/cfdm/data/__init__.py b/cfdm/data/__init__.py index 6cc6b5bb9..cb647802c 100644 --- a/cfdm/data/__init__.py +++ b/cfdm/data/__init__.py @@ -20,6 +20,7 @@ from .gatheredarray import GatheredArray from .h5netcdfarray import H5netcdfArray from .netcdf4array import NetCDF4Array +from .netcdfindexer import NetCDFIndexer from .numpyarray import NumpyArray from .pointtopologyarray import PointTopologyArray from .raggedcontiguousarray import RaggedContiguousArray @@ -27,6 +28,5 @@ from .raggedindexedcontiguousarray import RaggedIndexedContiguousArray from .sparsearray import SparseArray from .subsampledarray import SubsampledArray -from .variableindexer import VariableIndexer from .data import Data diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index 8aee0823b..5ef6fd946 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -5,7 +5,7 @@ from . import abstract from .mixin import FileArrayMixin, NetCDFFileMixin -from .variableindexer import VariableIndexer +from .netcdfindexer import NetCDFIndexer _safecast = netCDF4.utils._safecast default_fillvals = netCDF4.default_fillvals.copy() @@ -28,6 +28,7 @@ def __init__( dtype=None, shape=None, mask=True, + unpack=True, units=False, calendar=False, missing_values=None, @@ -64,10 +65,18 @@ def __init__( If True (the default) then mask by convention when reading data from disk. - A netCDF array is masked depending on the values of any of - the netCDF variable attributes ``valid_min``, - ``valid_max``, ``valid_range``, ``_FillValue`` and - ``missing_value``. + A netCDF array is masked depending on the values of + any of the netCDF attributes ``_FillValue``, + ``missing_value``, ``_Unsigned``, ``valid_min``, + ``valid_max``, and ``valid_range``. + + unpack: `bool` + If True (the default) then unpack by convention when + reading data from disk. + + A netCDF array is unpacked depending on the values of + the netCDF attributes ``add_offset`` and + ``scale_factor``. units: `str` or `None`, optional The units of the variable. 
Set to `None` to indicate @@ -121,6 +130,11 @@ def __init__( except AttributeError: mask = True + try: + unpack = source._get_component("unpack", True) + except AttributeError: + unpack = True + try: units = source._get_component("units", False) except AttributeError: @@ -168,7 +182,8 @@ def __init__( ) self._set_component("dtype", dtype, copy=False) - self._set_component("mask", mask, copy=False) + self._set_component("mask", bool(mask), copy=False) + self._set_component("unpack", bool(unpack), copy=False) self._set_component("units", units, copy=False) self._set_component("calendar", calendar, copy=False) self._set_component("storage_options", storage_options, copy=False) @@ -187,9 +202,7 @@ def __getitem__(self, indices): dataset, address = self.open() dataset0 = dataset - mask = self.get_mask() groups, address = self.get_groups(address) - if groups: dataset = self._group(dataset, groups) @@ -197,8 +210,11 @@ def __getitem__(self, indices): variable = dataset.variables[address] # Get the data, applying masking and scaling as required. - array = VariableIndexer( - variable, mask=mask, scale=True, always_mask=False + array = NetCDFIndexer( + variable, + mask=self.get_mask(), + unpack=self.get_unpack(), + always_mask=False, ) array = array[indices] diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index 2024b6fd7..5190f790c 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -191,7 +191,7 @@ def get_formats(self): return (self.get_format(),) * len(self.get_filenames()) def get_storage_options( - self, endpoint_url=True, filename=None, parsed_filename=None + self, create_endpoint_url=True, filename=None, parsed_filename=None ): """Return `s3fs.S3FileSystem` options for accessing S3 files. @@ -199,8 +199,11 @@ def get_storage_options( :Parameters: - endpoint_url: `bool`, optional - TODOHDF + create_endpoint_url: `bool`, optional + If True, the default, then create an + ``'endpoint_url'`` if and only if one has not already + been provided. See *filename* and *parsed_filename* + for details. filename: `str`, optional Used to set the ``'endpoint_url'`` key if it has not @@ -215,27 +218,28 @@ def get_storage_options( :Returns: - `dict` + `dict` or `None` The `s3fs.S3FileSystem` options. - **Examples** >>> f.get_filename() 's3://store/data/file.nc' - >>> f.get_storage_options(endpoint_url=False) - {'anon': True} + >>> f.get_storage_options(create_endpoint_url=False) + {} >>> f.get_storage_options() - {'anon': True, 'endpoint_url': 'https://store'} + {'endpoint_url': 'https://store'} >>> f.get_storage_options(filename='s3://other-store/data/file.nc') - {'anon': True, 'endpoint_url': 'https://other-store'} + {'endpoint_url': 'https://other-store'} + >>> f.get_storage_options(create_endpoint_url=False, + ... 
filename='s3://other-store/data/file.nc') + {} >>> f.get_storage_options() - {'key": 'kjhsadf8756', - 'secret': '862t3gyebh', - 'endpoint_url': None, - 'client_kwargs': {'endpoint_url': 'http://some-s3.com', - 'config_kwargs': {'s3': {'addressing_style': 'virtual'}}}} + {'key: 'scaleway-api-key...', + 'secret': 'scaleway-secretkey...', + 'endpoint_url': 'https://s3.fr-par.scw.cloud', + 'client_kwargs': {'region_name': 'fr-par'}} """ out = self._get_component("storage_options", None) @@ -244,7 +248,7 @@ def get_storage_options( else: out = deepcopy(out) - if endpoint_url and "endpoint_url" not in out: + if create_endpoint_url and "endpoint_url" not in out: if parsed_filename is None: if filename is None: try: @@ -297,7 +301,7 @@ def open(self, func, *args, **kwargs): elif url.scheme == "s3": # Create an openable S3 file object storage_options = self.get_storage_options( - endpoint_url=True, parsed_filename=url + create_endpoint_url=True, parsed_filename=url ) fs = S3FileSystem(**storage_options) filename = fs.open(url.path[1:], "rb") diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py index 458c01b1c..652194ef4 100644 --- a/cfdm/data/mixin/netcdffilemixin.py +++ b/cfdm/data/mixin/netcdffilemixin.py @@ -208,6 +208,19 @@ def get_missing_values(self): return out.copy() + def get_unpack(self): + """Whether or not to automatically unpack the data. + + .. versionadded:: (cfdm) HDFVER + + **Examples** + + >>> a.get_unpack() + True + + """ + return self._get_component("unpack") + def to_memory(self): """Bring data on disk into memory. diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index 3da924756..1ae15b676 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -2,7 +2,7 @@ from . import abstract from .mixin import FileArrayMixin, NetCDFFileMixin -from .variableindexer import VariableIndexer +from .netcdfindexer import NetCDFIndexer # import numpy as np @@ -21,6 +21,7 @@ def __init__( dtype=None, shape=None, mask=True, + unpack=True, units=False, calendar=False, missing_values=None, @@ -59,13 +60,23 @@ def __init__( If True (the default) then mask by convention when reading data from disk. - A netCDF array is masked depending on the values of any of - the netCDF variable attributes ``valid_min``, - ``valid_max``, ``valid_range``, ``_FillValue`` and - ``missing_value``. + A netCDF array is masked depending on the values of + any of the netCDF attributes ``_FillValue``, + ``missing_value``, ``_Unsigned``, ``valid_min``, + ``valid_max``, and ``valid_range``. .. versionadded:: (cfdm) 1.8.2 + unpack: `bool` + If True (the default) then unpack by convention when + reading data from disk. + + A netCDF array is unpacked depending on the values of + the netCDF attributes ``add_offset`` and + ``scale_factor``. + + .. versionadded:: (cfdm) HDFVER + units: `str` or `None`, optional The units of the netCDF variable. Set to `None` to indicate that there are no units. 
If unset then the @@ -134,6 +145,11 @@ def __init__( except AttributeError: mask = True + try: + unpack = source._get_component("unpack", True) + except AttributeError: + unpack = True + try: units = source._get_component("units", False) except AttributeError: @@ -174,7 +190,8 @@ def __init__( ) self._set_component("dtype", dtype, copy=False) - self._set_component("mask", mask, copy=False) + self._set_component("mask", bool(mask), copy=False) + self._set_component("unpack", bool(unpack), copy=False) self._set_component("units", units, copy=False) self._set_component("calendar", calendar, copy=False) @@ -205,9 +222,7 @@ def __getitem__(self, indices): netcdf, address = self.open() dataset = netcdf - mask = self.get_mask() groups, address = self.get_groups(address) - if groups: # Traverse the group structure, if there is one (CF>=1.8). netcdf = self._group(netcdf, groups) @@ -222,8 +237,11 @@ def __getitem__(self, indices): break # Get the data, applying masking and scaling as required. - array = VariableIndexer( - variable, mask=mask, scale=True, always_mask=False + array = NetCDFIndexer( + variable, + mask=self.get_mask(), + unpack=self.get_unpack(), + always_mask=False, ) array = array[indices] diff --git a/cfdm/data/variableindexer.py b/cfdm/data/netcdfindexer.py similarity index 84% rename from cfdm/data/variableindexer.py rename to cfdm/data/netcdfindexer.py index 38b5c307b..4302b4de3 100644 --- a/cfdm/data/variableindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -9,15 +9,21 @@ logger = logging.getLogger(__name__) -class VariableIndexer: - """A data indexer that applies CF masking and scaling. +class NetCDFIndexer: + """A data indexer that applies netCDF masking and unpacking. - During indexing, masking and scaling is applied according to the - CF conventions, either or both of which may be disabled via + During indexing, masking and unpacking is applied according to the + netCDF conventions, either or both of which may be disabled via initialisation options. - String and character variables are converted to unicode arrays, - the latter with the last dimension concatenated. + The netCDF conventions assign special meaning to the following + variable attributes: ``_FillValue``, ``missing_value``, + ``_Unsigned``, ``valid_max``, ``valid_min``, and ``valid_range`` + (for masking); and ``add_offset`` and ``scale_factor`` (for + unpacking). + + In addition, string and character variables are converted to + unicode arrays, the latter with the last dimension concatenated. Adapted from `netCDF4`. 
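
The unpacking convention applied here is ``data * scale_factor +
add_offset``. A minimal doctest-style sketch of that convention,
using only the ``attrs`` keyword shown in the examples above:

>>> import numpy as np
>>> import cfdm
>>> packed = np.array([0, 50, 100], dtype='int16')  # Packed values
>>> x = cfdm.NetCDFIndexer(
...     packed, attrs={'scale_factor': 0.1, 'add_offset': 273.15}
... )
>>> print(x[...])  # Unpacked on access
[273.15 278.15 283.15]
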
@@ -27,7 +33,7 @@ class VariableIndexer: >>> import netCDF4 >>> nc = netCDF4.Dataset('file.nc', 'r') - >>> x = cfdm.VariableIndexer(nc.variables['x']) + >>> x = cfdm.{{class}}(nc.variables['x']) >>> x.shape (12, 64, 128) >>> print(x[0, 0:4, 0:3]) @@ -38,7 +44,7 @@ class VariableIndexer: >>> import h5netcdf >>> h5 = h5netcdf.File('file.nc', 'r') - >>> x = cfdm.VariableIndexer(h5.variables['x']) + >>> x = cfdm.{{class}}(h5.variables['x']) >>> x.shape (12, 64, 128) >>> print(x[0, 0:4, 0:3]) @@ -49,37 +55,38 @@ class VariableIndexer: >>> import numpy as np >>> n = np.arange(9) - >>> x = cfdm.VariableIndexer(n) + >>> x = cfdm.{{class}}(n) >>> x.shape (9,) >>> print(x[...]) [1 2 3 4 5 6 7 8] - >>> x = cfdm.VariableIndexer(n, attrs={'_FillValue': 4}) + >>> x = cfdm.{{class}}(n, attrs={'_FillValue': 4}) >>> print(x[...]) [1 2 3 -- 5 6 7 8] """ def __init__( - self, variable, mask=True, scale=True, always_mask=False, attrs=None + self, variable, mask=True, unpack=True, always_mask=False, attrs=None ): """**Initialisation** :Parameters: variable: - The variable to be indexed, one `netCDF4.Variable`, + The variable to be indexed, one of `netCDF4.Variable`, `h5netcdf.Variable`, or `numpy.ndarray`. Any masking - and scaling that may be applied by the *variable* - itself is disabled, i.e. Any masking and scaling is - always applied by the `VariableIndexer` instance. + and unpacking that could be implemented by applied by + the *variable* itself is disabled, i.e. Any masking + and unpacking is always applied by the + `NetCDFIndexer` instance. mask: `bool` If True, the default, then an array returned by indexing is automatically converted to a masked array when missing values or fill values are present. - scale: `bool` + unpack: `bool` If True, the default, then the ``scale_factor`` and ``add_offset`` are applied to an array returned by indexing, and signed integer data is automatically @@ -94,14 +101,17 @@ def __init__( no missing values. attrs: `dict`, optional + Provide the netCDF attributes of the *variable* as + dictionary key/value pairs. If *attrs* is set then any + netCDF attributes stored by *variable* itself are + ignored. """ self.variable = variable self.mask = mask - self.scale = scale + self.unpack = unpack self.always_mask = always_mask - self.attrs = attrs - + self._attrs = attrs self.shape = variable.shape def __getitem__(self, index): @@ -109,15 +119,14 @@ def __getitem__(self, index): v.__getitem__(index) <==> v[index] - Indexing follows rules defined by the variable. + Indexing follows the rules defined by the variable. .. 
versionadded:: (cfdm) HDFVER """ variable = self.variable - scale = self.scale - - attrs = self._attrs(variable) + unpack = self.unpack + attrs = self.attrs() dtype = variable.dtype netCDF4_scale = False @@ -149,8 +158,8 @@ def __getitem__(self, index): if dtype is str: dtype = data.dtype - if scale: - dtype_unsigned_int = None + dtype_unsigned_int = None + if unpack: is_unsigned_int = attrs.get("_Unsigned", False) in ("true", "True") if is_unsigned_int: data_dtype = data.dtype @@ -165,15 +174,14 @@ def __getitem__(self, index): data, dtype, attrs, - scale=scale, + unpack=unpack, dtype_unsigned_int=dtype_unsigned_int, ) - if scale: - data = self._scale(data, attrs) + if unpack: + data = self._unpack(data, attrs) if data.dtype.kind == "S": - # Assume that object arrays contain strings data = data.astype("U", copy=False) # Reset a netCDF4 variables's scale and mask behaviour @@ -185,40 +193,6 @@ def __getitem__(self, index): return data - def _attrs(self, variable): - """Return the variable attributes. - - .. versionadded:: (cfdm) HDFVER - - :Parameter: - - variable: - The variable to be indexed, one `netCDF4.Variable`, - `h5netcdf.Variable`, or `numpy.ndarray`. - - :Returns: - - `dict` - The attributes. - - """ - if self.attrs is not None: - return self.attrs.copy() - - try: - # h5netcdf - return dict(variable.attrs) - except AttributeError: - try: - # netCDF4 - return { - attr: variable.getncattr(attr) - for attr in variable.ncattrs() - } - except AttributeError: - # numpy - return {} - def _check_safecast(self, attname, dtype, attrs): """Check an attribute's data type. @@ -273,10 +247,10 @@ def _set_FillValue(self, variable, attrs): If the attributes already contain a ``_FillValue`` then nothing is done. - .. seealso:: `_default_FillValue` - .. versionadded:: (cfdm) HDFVER + .. seealso:: `_default_FillValue` + :Parameter: variable: `h5netcdf.Variable` @@ -309,10 +283,10 @@ def _set_FillValue(self, variable, attrs): def _default_FillValue(self, dtype): """Return the default ``_FillValue`` for the given data type. - .. seealso:: `_set_FillValue`, `netCDF4.default_fillvals` - .. versionadded:: (cfdm) HDFVER + .. seealso:: `_set_FillValue`, `netCDF4.default_fillvals` + :Parameter: dtype: `numpy.dtype` @@ -333,7 +307,7 @@ def _mask( data, dtype, attrs, - scale=True, + unpack=True, dtype_unsigned_int=None, ): """Mask the data. @@ -343,7 +317,7 @@ def _mask( :Parameter: data: `numpy.ndarray` - The unmasked and unscaled data indexed from the + The unmasked and unpacked data indexed from the variable. dtype: `numpy.dtype` @@ -353,8 +327,8 @@ def _mask( attrs: `dict` The variable attributes. - scale: `bool` - Whether the data is to be scaled. + unpack: `bool` + Whether the data is to be unpacked. dtype_unsigned_int: `dtype` or `None` The data type to which unsigned integer data has been @@ -364,7 +338,7 @@ def _mask( :Returns: `nump.ndarray` - The masked (but not scaled) data. + The masked (but not unpacked) data. """ totalmask = np.zeros(data.shape, np.bool_) @@ -375,7 +349,7 @@ def _mask( ) if safe_missval: mval = np.array(missing_value, dtype) - if scale and dtype_unsigned_int is not None: + if unpack and dtype_unsigned_int is not None: mval = mval.view(dtype_unsigned_int) # Create mask from missing values. @@ -384,7 +358,6 @@ def _mask( mval = (mval,) # Make into iterable. for m in mval: - # Is scalar missing value a NaN? 
try: mvalisnan = np.isnan(m) except TypeError: @@ -412,10 +385,9 @@ def _mask( if safe_fillval: fval = np.array(_FillValue, dtype) - if scale and dtype_unsigned_int is not None: + if unpack and dtype_unsigned_int is not None: fval = fval.view(dtype_unsigned_int) - # Is _FillValue a NaN? try: fvalisnan = np.isnan(fval) except Exception: @@ -461,7 +433,7 @@ def _mask( if safe_validmax: validmax = np.array(valid_max, dtype) - if scale: + if unpack: if validmin is not None and dtype_unsigned_int is not None: validmin = validmin.view(dtype_unsigned_int) @@ -493,15 +465,15 @@ def _mask( return data - def _scale(self, data, attrs): - """Scale the data.. + def _unpack(self, data, attrs): + """Unpack the data.. .. versionadded:: (cfdm) HDFVER :Parameter: data: `numpy.ndarray` - The unmasked and unscaled data indexed from the + The unmasked and unpacked data indexed from the variable. attrs: `dict` @@ -510,7 +482,7 @@ def _scale(self, data, attrs): :Returns: `nump.ndarray` - The scaled data. + The unpacked data. """ # If variable has scale_factor and add_offset attributes, @@ -543,3 +515,38 @@ def _scale(self, data, attrs): data = data + add_offset return data + + def attrs(self): + """Return the netCDF attributes of the variable. + + .. versionadded:: (cfdm) HDFVER + + :Returns: + + `dict` + The attributes. + + **Examples** + + >>> v.attrs() + {'standard_name': 'air_temperature', + 'missing_value': -999.0} + + """ + if self._attrs is not None: + return self._attrs.copy() + + variable = self.variable + try: + # h5netcdf + return dict(variable.attrs) + except AttributeError: + try: + # netCDF4 + return { + attr: variable.getncattr(attr) + for attr in variable.ncattrs() + } + except AttributeError: + # numpy + return {} diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index 140487a1e..5d1b9420d 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -402,32 +402,31 @@ "{{init cell_dimension: `int`}}": """cell_dimension: `int` The position of the *data* dimension that indexes the cells, either ``0`` or ``1``.""", - # storage_options + # init storage_options "{{init storage_options: `dict` or `None`, optional}}": """storage_options: `dict` or `None`, optional - Key/value pairs to be passed on to the `s3fs.S3FileSystem` - file-system backend to control the opening of files in an - S3 object store. By default, or if `None`, then a value of - ``{'anon': True}`` is used. Ignored for file names that - don't start with ``s3:``. - - If and only if *s3* has no ``'endpoint_url'`` key, then - one will be automatically derived from the file name and - included in the keyword parameters. For example, for a - file name of ``'s3://store/data/file.nc'``, an - ``'endpoint_url'`` key with value ``'https://store'`` - would be created. To disable this behaviour, assign `None` - to the ``'endpoint_url'`` key. - - *Parameter example:* - For a file name of ``'s3://store/data/file.nc'``, the - following are equivalent: ``{'anon': True}`` and - ``{'anon': True, 'endpoint_url': 'https://store'}``. - - *Parameter example:* - ``{'key": 'kjhsadf8756', 'secret': '862t3gyebh', - 'endpoint_url': None, 'client_kwargs': {'endpoint_url': - 'http://some-s3.com', 'config_kwargs': {'s3': - {'addressing_style': 'virtual'}}}}``""", + Key/value pairs to be passed on to the creation of an + `s3fs.S3FileSystem` file system to control the opening + of the file in an S3 object store. Ignored for a file + not in an S3 object store, i.e. one whose name does + not start with ``s3:``. 
+ + If an ``'endpoint_url'`` key is not in + *storage_options* then one will be automatically + derived for accessing an S3 file. For example, for a + file name of ``'s3://store/data/file.nc'``, an + ``'endpoint_url'`` key with value ``'https://store'`` + would be created. + + *Parameter example:* + For a file name of ``'s3://store/data/file.nc'``, + the following are equivalent: ``None``, ``{}`` and + ``{'endpoint_url': 'https://store'}``. + + *Parameter example:* + ``{'key: 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}``""", # ---------------------------------------------------------------- # Method description susbstitutions (4 levels of indentataion) # ---------------------------------------------------------------- diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index e5299e90d..9772bf680 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -897,6 +897,7 @@ def read( _scan_only=False, verbose=None, mask=True, + unpack=True, warnings=True, warn_valid=False, domain=False, @@ -937,6 +938,11 @@ def read( .. versionadded:: (cfdm) 1.8.2 + unpack: `bool`, optional + See `cfdm.read` for details + + .. versionadded:: (cfdm) HDFVER + warn_valid: `bool`, optional See `cfdm.read` for details @@ -1022,8 +1028,9 @@ def read( "vertical_crs": {}, # "version": {}, - # Auto mask? + # Auto mask and unpack? "mask": bool(mask), + "unpack": bool(unpack), # Warn for the presence of valid_[min|max|range] # attributes? "warn_valid": bool(warn_valid), @@ -1087,8 +1094,7 @@ def read( g["version"][version] = Version(version) if storage_options is None: - # Default storage options - g["storage_options"] = {"anon": True} + storage_options = {"anon": True} if _file_systems is not None: # Update S3 file systems with those passed in as keyword @@ -6211,6 +6217,7 @@ def _create_netcdfarray( "shape": shape, "dtype": dtype, "mask": g["mask"], + "unpack": g["unpack"], "units": units, "calendar": calendar, "missing_values": missing_values, diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 5d0ebe21c..27649f310 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -16,6 +16,7 @@ def read( warnings=False, warn_valid=False, mask=True, + unpack=True, domain=False, storage_options=None, netCDF_backend=None, @@ -232,10 +233,10 @@ def read( If True (the default) then mask by convention the data of field and metadata constructs. - The masking by convention of a netCDF array depends on the - values of any of the netCDF variable attributes - ``_FillValue``, ``missing_value``, ``valid_min``, - ``valid_max`` and ``valid_range``. + A netCDF array is masked depending on the values of any of + the netCDF attributes ``_FillValue``, ``missing_value``, + ``_Unsigned``, ``valid_min``, ``valid_max``, and + ``valid_range``. See https://ncas-cms.github.io/cfdm/tutorial.html#data-mask @@ -243,6 +244,15 @@ def read( .. versionadded:: (cfdm) 1.8.2 + unpack: `bool` + If True (the default) then unpack by convention when + reading data from disk. + + A netCDF array is unpacked depending on the values of the + netCDF attributes ``add_offset`` and ``scale_factor``. + + .. versionadded:: (cfdm) HDFVER + domain: `bool`, optional If True then return only the domain constructs that are explicitly defined by CF-netCDF domain variables, ignoring @@ -266,30 +276,31 @@ def read( .. 
versionadded:: (cfdm) 1.9.0.0 storage_options: `dict` or `None`, optional - Key/value pairs to be passed on to the `s3fs.S3FileSystem` - file-system backend to control the opening of files in an - S3 object store. By default, or if `None`, then a value of - ``{'anon': True}`` is used. Ignored for file names that - don't start with ``s3:``. - - If and only if *s3* has no ``'endpoint_url'`` key, then - one will be automatically derived from the file name and - included in the keyword parameters. For example, for a - file name of ``'s3://store/data/file.nc'``, an - ``'endpoint_url'`` key with value ``'https://store'`` - would be created. To disable this behaviour, assign `None` - to the ``'endpoint_url'`` key. + Key/value pairs to be passed on to the creation of + `s3fs.S3FileSystem` file systems to control the opening of + files in S3 object stores. Ignored for files not in an S3 + object store, i.e. those whose names do not start with + ``s3:``. + + By default, or if `None`, then a value of ``{'anon': + True}`` is used. + + If an ``'endpoint_url'`` key is not in *storage_options* + then one will be automatically derived for accessing each + S3 file. For example, for a file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key + with value ``'https://store'`` would be created. *Parameter example:* For a file name of ``'s3://store/data/file.nc'``, the - following are equivalent: ``{'anon': True}`` and - ``{'anon': True, 'endpoint_url': 'https://store'}``. + following are equivalent: ``None``, ``{'anon': True}``, + and ``{'anon': True, 'endpoint_url': 'https://store'}``. *Parameter example:* - ``{'key": 'kjhsadf8756', 'secret': '862t3gyebh', - 'endpoint_url': None, 'client_kwargs': {'endpoint_url': - 'http://some-s3.com', 'config_kwargs': {'s3': - {'addressing_style': 'virtual'}}}}`` + ``{'key: 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}`` .. versionadded:: (cfdm) HDFVER @@ -374,6 +385,7 @@ def read( warnings=warnings, warn_valid=warn_valid, mask=mask, + unpack=unpack, domain=domain, storage_options=storage_options, netCDF_backend=netCDF_backend, diff --git a/cfdm/test/test_NetCDFArray.py b/cfdm/test/test_NetCDF4Array.py similarity index 63% rename from cfdm/test/test_NetCDFArray.py rename to cfdm/test/test_NetCDF4Array.py index 80e46aa13..f9ef9062c 100644 --- a/cfdm/test/test_NetCDFArray.py +++ b/cfdm/test/test_NetCDF4Array.py @@ -7,6 +7,8 @@ faulthandler.enable() # to debug seg faults and timeouts +import numpy as np + import cfdm n_tmpfiles = 1 @@ -29,8 +31,8 @@ def _remove_tmpfiles(): atexit.register(_remove_tmpfiles) -class NetCDFTest(unittest.TestCase): - """Unit test for the NetCDF class.""" +class NetCDF4ArrayTest(unittest.TestCase): + """Unit test for the NetCDF4Array class.""" def setUp(self): """Preparations called immediately before each test method.""" @@ -95,6 +97,53 @@ def test_NetCDF4Array_get_missing_values(self): a = cfdm.NetCDF4Array("file.nc", "ncvar") self.assertIsNone(a.get_missing_values()) + def test_NetCDF4Array_mask(self): + """Test NetCDF4Array masking.""" + f = cfdm.example_field(0) + f.data[0] = np.ma.masked + cfdm.write(f, tmpfile) + array = f.array + + n = cfdm.NetCDF4Array(tmpfile, f.nc_get_variable(), shape=f.shape) + self.assertTrue(n.get_mask()) + n = n[...] + self.assertTrue((array.mask == n.mask).all()) + + n = cfdm.NetCDF4Array( + tmpfile, f.nc_get_variable(), shape=f.shape, mask=False + ) + self.assertFalse(n.get_mask()) + n = n[...] 
+ self.assertEqual(np.ma.count(n), n.size) + + def test_NetCDF4Array_unpack(self): + """Test NetCDF4Array unpacking.""" + add_offset = 10.0 + scale_factor = 3.14 + + f = cfdm.example_field(0) + f.data[0] = np.ma.masked + array0 = f.array + array1 = (array0 - add_offset) / scale_factor + + f.set_property("add_offset", add_offset) + f.set_property("scale_factor", scale_factor) + cfdm.write(f, tmpfile) + + n = cfdm.NetCDF4Array(tmpfile, f.nc_get_variable(), shape=f.shape) + self.assertTrue(n.get_unpack()) + n = n[...] + self.assertTrue((n.mask == array0.mask).all()) + self.assertTrue(np.ma.allclose(n, array0)) + + n = cfdm.NetCDF4Array( + tmpfile, f.nc_get_variable(), shape=f.shape, unpack=False + ) + self.assertFalse(n.get_unpack()) + n = n[...] + self.assertTrue((n.mask == array1.mask).all()) + self.assertTrue((n == array1).all()) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cfdm/test/test_VariableIndexer.py b/cfdm/test/test_NetCDFIndexer.py similarity index 85% rename from cfdm/test/test_VariableIndexer.py rename to cfdm/test/test_NetCDFIndexer.py index 53b5a9121..7cc1172f0 100644 --- a/cfdm/test/test_VariableIndexer.py +++ b/cfdm/test/test_NetCDFIndexer.py @@ -14,7 +14,7 @@ n_tmpfiles = 1 tmpfiles = [ - tempfile.mkstemp("_test_VariableIndxer.nc", dir=os.getcwd())[1] + tempfile.mkstemp("_test_NetCDFIndexer.nc", dir=os.getcwd())[1] for i in range(n_tmpfiles) ] (tmpfile,) = tmpfiles @@ -34,17 +34,17 @@ def _remove_tmpfiles(): netCDF_backends = ("netCDF4", "h5netcdf") -class VariableIndexerTest(unittest.TestCase): +class NetCDFIndexerTest(unittest.TestCase): """Test the masking and scaling of netCDF data.""" - def test_shape(self): - """Test VariableIndexer shape.""" + def test_NetCDFIndexer_shape(self): + """Test NetCDFIndexer shape.""" n = np.ma.arange(9) - x = cfdm.VariableIndexer(n) + x = cfdm.NetCDFIndexer(n) self.assertEqual(x.shape, n.shape) - def test_mask(self): - """Test VariableIndexer for CF masking.""" + def test_NetCDFIndexer_mask(self): + """Test NetCDFIndexer for CF masking.""" f0 = cfdm.example_field(0) f0.del_property("missing_value", None) f0.del_property("_FillValue", None) @@ -90,8 +90,10 @@ def test_mask(self): self.assertTrue((g.array == na).all()) self.assertTrue((g.data.mask.array == na.mask).all()) - def test_scale(self): - """Test VariableIndexer for CF scaling.""" + nc.close() + + def test_NetCDFIndexer_scale(self): + """Test NetCDFIndexer for CF scaling.""" f = cfdm.example_field(0) array = np.ma.arange(40, dtype="int32").reshape(f.shape) @@ -120,14 +122,16 @@ def test_scale(self): self.assertTrue((g.array == na).all()) self.assertTrue((g.data.mask.array == na.mask).all()) - def test_numpy(self): - """Test VariableIndexer for numpy.""" + nc.close() + + def test_NetCDFIndexer_numpy(self): + """Test NetCDFIndexer for numpy.""" array = np.ma.arange(9) - x = cfdm.VariableIndexer(array) + x = cfdm.NetCDFIndexer(array) x = x[...] self.assertTrue((x == array).all()) - x = cfdm.VariableIndexer( + x = cfdm.NetCDFIndexer( array.copy(), attrs={"_FillValue": 4, "missing_value": (0, 8)} ) x = x[...] 
From 0009c31dbf81926cd3768d1b507fefa618905743 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 7 Feb 2024 11:33:58 +0000 Subject: [PATCH 31/88] dev --- cfdm/data/netcdf4array.py | 13 ++++++++++ cfdm/data/netcdfindexer.py | 22 ++++++++++------ cfdm/test/test_NetCDF4Array.py | 46 ++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 8 deletions(-) diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index 1ae15b676..473020167 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -25,6 +25,7 @@ def __init__( units=False, calendar=False, missing_values=None, + storage_options=None, source=None, copy=True, ): @@ -99,6 +100,10 @@ def __init__( .. versionadded:: (cfdm) 1.10.0.3 + {{init storage_options: `dict` or `None`, optional}} + + .. versionadded:: (cfdm) HDFVER + {{init source: optional}} .. versionadded:: (cfdm) 1.10.0.0 @@ -165,6 +170,13 @@ def __init__( except AttributeError: missing_values = None + try: + storage_options = source._get_component( + "storage_options", None + ) + except AttributeError: + storage_options = None + if shape is not None: self._set_component("shape", shape, copy=False) @@ -194,6 +206,7 @@ def __init__( self._set_component("unpack", bool(unpack), copy=False) self._set_component("units", units, copy=False) self._set_component("calendar", calendar, copy=False) + self._set_component("storage_options", storage_options, copy=False) # By default, close the netCDF file after data array access self._set_component("close", True, copy=False) diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 4302b4de3..aa769bd19 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -16,14 +16,20 @@ class NetCDFIndexer: netCDF conventions, either or both of which may be disabled via initialisation options. - The netCDF conventions assign special meaning to the following - variable attributes: ``_FillValue``, ``missing_value``, - ``_Unsigned``, ``valid_max``, ``valid_min``, and ``valid_range`` - (for masking); and ``add_offset`` and ``scale_factor`` (for - unpacking). - - In addition, string and character variables are converted to - unicode arrays, the latter with the last dimension concatenated. + In addition, string and character variables are always converted + to unicode arrays, the latter with the last dimension + concatenated. + + Masking and unpacking operations are defined by netCDF attributes, + which are either provided as part of the input *data* object, or + given with the input *attrs* parameter. + + The netCDF attributes, all of which are optional, used are: + + * For masking: ``_FillValue``, ``missing_value``, ``_Unsigned``, + ``valid_max``, ``valid_min``, and ``valid_range`` + + * For unpacking: ``add_offset`` and ``scale_factor`` Adapted from `netCDF4`. 
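
``_Unsigned`` appears in both lists because signed integers are
reinterpreted as unsigned before either masking or unpacking is
applied. A minimal doctest-style sketch, again assuming the
``attrs`` keyword described above:

>>> import numpy as np
>>> import cfdm
>>> n = np.array([-1, -2], dtype='int8')  # Signed storage
>>> x = cfdm.NetCDFIndexer(n, attrs={'_Unsigned': 'true'})
>>> print(x[...])  # Viewed as uint8 on access
[255 254]
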
diff --git a/cfdm/test/test_NetCDF4Array.py b/cfdm/test/test_NetCDF4Array.py index f9ef9062c..3b6a2f887 100644 --- a/cfdm/test/test_NetCDF4Array.py +++ b/cfdm/test/test_NetCDF4Array.py @@ -4,6 +4,7 @@ import os import tempfile import unittest +from urllib.parse import urlparse faulthandler.enable() # to debug seg faults and timeouts @@ -144,6 +145,51 @@ def test_NetCDF4Array_unpack(self): self.assertTrue((n.mask == array1.mask).all()) self.assertTrue((n == array1).all()) + def test_NetCDF4Array_get_storage_options(self): + """Test NetCDF4Array get_storage_options.""" + n = cfdm.NetCDF4Array(filename="filename.nc") + self.assertEqual(n.get_storage_options(), {}) + + n = cfdm.NetCDF4Array( + filename="filename.nc", storage_options={"anon": True} + ) + self.assertEqual(n.get_storage_options(), {"anon": True}) + + n = cfdm.NetCDF4Array(filename="s3://store/filename.nc") + self.assertEqual( + n.get_storage_options(), {"endpoint_url": "https://store"} + ) + self.assertEqual(n.get_storage_options(create_endpoint_url=False), {}) + + n = cfdm.NetCDF4Array( + filename="s3://store/filename.nc", storage_options={"anon": True} + ) + self.assertEqual( + n.get_storage_options(), + {"anon": True, "endpoint_url": "https://store"}, + ) + self.assertEqual( + n.get_storage_options(create_endpoint_url=False), {"anon": True} + ) + other_file = "s3://other/file.nc" + self.assertEqual( + n.get_storage_options(filename=other_file), + {"anon": True, "endpoint_url": "https://other"}, + ) + self.assertEqual( + n.get_storage_options(parsed_filename=urlparse(other_file)), + {"anon": True, "endpoint_url": "https://other"}, + ) + + n = cfdm.NetCDF4Array( + filename="s3://store/filename.nc", + storage_options={"anon": True, "endpoint_url": None}, + ) + self.assertEqual( + n.get_storage_options(), + {"anon": True, "endpoint_url": None}, + ) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) From d19d6c3d415577457f9cc1e3c85508801c3aa9c8 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 8 Feb 2024 18:27:50 +0000 Subject: [PATCH 32/88] dev --- cfdm/data/h5netcdfarray.py | 48 +++++++----- cfdm/data/mixin/arraymixin.py | 16 ++++ cfdm/data/mixin/netcdffilemixin.py | 113 +++++++++++++++++---------- cfdm/data/netcdf4array.py | 47 ++++++----- cfdm/data/netcdfindexer.py | 4 +- cfdm/read_write/netcdf/netcdfread.py | 27 +------ cfdm/test/test_NetCDF4Array.py | 25 +++++- 7 files changed, 172 insertions(+), 108 deletions(-) diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index 5ef6fd946..bd245f757 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -31,7 +31,7 @@ def __init__( unpack=True, units=False, calendar=False, - missing_values=None, + attributes=None, storage_options=None, source=None, copy=True, @@ -93,6 +93,7 @@ def __init__( The missing value indicators defined by the variable attributes. See `get_missing_values` for details. + {{init storage_options: `dict` or `None`, optional}} .. 
versionadded:: (cfdm) HDFVER @@ -146,9 +147,9 @@ def __init__( calendar = False try: - missing_values = source._get_component("missing_values", None) + attributes = source._get_component("attributes", None) except AttributeError: - missing_values = None + attributes = None try: storage_options = source._get_component( @@ -176,17 +177,13 @@ def __init__( self._set_component("address", address, copy=False) - if missing_values is not None: - self._set_component( - "missing_values", missing_values.copy(), copy=False - ) - self._set_component("dtype", dtype, copy=False) self._set_component("mask", bool(mask), copy=False) self._set_component("unpack", bool(unpack), copy=False) self._set_component("units", units, copy=False) self._set_component("calendar", calendar, copy=False) self._set_component("storage_options", storage_options, copy=False) + self._set_component("attributes", attributes, copy=False) # By default, close the file after data array access self._set_component("close", True, copy=False) @@ -218,7 +215,11 @@ def __getitem__(self, indices): ) array = array[indices] - # Set the units, if they haven't been set already. + # Set the attributes, if they haven't been set already. + self._set_attributes(variable) + + # Set the units, if they haven't been set already (do this + # after setting the attributes). self._set_units(variable) self.close(dataset0) @@ -226,25 +227,34 @@ def __getitem__(self, indices): return array - def _get_attr(self, var, attr): - """Get a variable attribute. + def _set_attributes(self, var): + """TODOHDF The units and calendar properties. - .. versionadded:: (cfdm) HDFVER + These are set from the netCDF variable attributes, but only if + they have already not been defined, either during {{class}} + instantiation or by a previous call to `_set_units`. - :Parameters: + .. versionadded:: (cfdm) 1.10.0.1 - var: `h5netcdf.Variable` - The variable. + :Parameters: - attr: `str` - The attribute name. + var: `netCDF4.Variable` or `h5netcdf.Variable` + The variable containing the units and calendar + definitions. :Returns: - The attirbute value. + `tuple` + The units and calendar values, either of which may be + `None`. """ - return var.attrs[attr] + attributes = self._get_component("attributes", None) + if attributes is not None: + return + + attributes = dict(var.attrs) + self._set_component("attributes", attributes, copy=False) def close(self, dataset): """Close the dataset containing the data. diff --git a/cfdm/data/mixin/arraymixin.py b/cfdm/data/mixin/arraymixin.py index 901087744..5dc42a5a7 100644 --- a/cfdm/data/mixin/arraymixin.py +++ b/cfdm/data/mixin/arraymixin.py @@ -1,3 +1,5 @@ +from copy import deepcopy + import numpy as np @@ -101,6 +103,14 @@ def _set_units(self): return units, calendar + def get_attributes(self, default=ValueError()): + """TODOHDF.""" + attributes = self._get_component("attributes", None) + if attributes is not None: + attributes = deepcopy(attributes) + + return attributes + def get_calendar(self, default=ValueError()): """The calendar of the array. @@ -162,6 +172,12 @@ def get_compression_type(self): """ return self._get_component("compression_type", "") + def get_missing_values(self): + """TODOHDF.""" + raise NotImplementedError( + f"Must implement {self.__class__.__name__}.get_missing_values" + ) # pragma: no cover + @classmethod def get_subspace(cls, array, indices, copy=True): """Return a subspace, defined by indices, of a numpy array. 
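
With the new ``attributes`` component, a file array's netCDF
attributes are cached the first time its data is read, and
`get_attributes` returns `None` until then. A doctest-style sketch,
using a hypothetical file ``file.nc`` whose variable ``q`` carries a
``units`` attribute of ``'K'``:

>>> import cfdm
>>> a = cfdm.NetCDF4Array('file.nc', 'q', shape=(5, 8))
>>> print(a.get_attributes())
None
>>> _ = a[...]  # Reading the data caches the variable's attributes
>>> a.get_attributes()['units']
'K'
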
diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py
index 652194ef4..37461ec9b 100644
--- a/cfdm/data/mixin/netcdffilemixin.py
+++ b/cfdm/data/mixin/netcdffilemixin.py
@@ -1,3 +1,5 @@
+from copy import deepcopy
+
 from ..numpyarray import NumpyArray
 
 
@@ -8,28 +10,6 @@ class NetCDFFileMixin:
 
     """
 
-    def _get_attr(self, var, attr):
-        """Get a variable attribute.
-
-        .. versionadded:: (cfdm) HDFVER
-
-        :Parameters:
-
-            var:
-                The variable.
-
-            attr: `str`
-                The attribute name.
-
-        :Returns:
-
-            The attirbute value.
-
-        """
-        raise NotImplementedError(
-            f"Must implement {self.__class__.__name__}._get_attr"
-        )  # pragma: no cover
-
     def _group(self, dataset, groups):
         """Return the group object containing a variable.
 
@@ -58,6 +38,32 @@ def _group(self, dataset, groups):
 
         return dataset
 
+    def _set_attributes(self, var):
+        """Set the netCDF variable attributes.
+
+        They are stored from the netCDF variable, but only if they
+        have not already been defined, either during {{class}}
+        instantiation or by a previous call to `_set_attributes`.
+
+        .. versionadded:: (cfdm) HDFVER
+
+        :Parameters:
+
+            var: `netCDF4.Variable` or `h5netcdf.Variable`
+                The variable containing the attributes.
+
+        :Returns:
+
+            `None`
+
+        """
+        raise NotImplementedError(
+            f"Must implement {self.__class__.__name__}._set_attributes"
+        )  # pragma: no cover
+
     def _set_units(self, var):
         """The units and calendar properties.
 
@@ -80,26 +86,21 @@ def _set_units(self, var):
             `None`.
 
         """
+        # We assume that an attributes dictionary exists
+        attributes = self._get_component("attributes")
+
         # Note: Can't use None as the default since it is a valid
         #       `units` or 'calendar' value that indicates that the
         #       attribute has not been set in the dataset.
         units = self._get_component("units", False)
         if units is False:
-            try:
-                units = self._get_attr(var, "units")
-            except AttributeError:
-                units = None
-
-            self._set_component("units", units, copy=False)
+            self._set_component("units", attributes.get("units"), copy=False)
 
         calendar = self._get_component("calendar", False)
         if calendar is False:
-            try:
-                calendar = self._get_attr(var, "calendar")
-            except AttributeError:
-                calendar = None
-
-            self._set_component("calendar", calendar, copy=False)
+            self._set_component(
+                "calendar", attributes.get("calendar"), copy=False
+            )
 
         return units, calendar
 
@@ -173,11 +174,19 @@ def get_mask(self):
         """
         return self._get_component("mask")
 
-    def get_missing_values(self):
+    def get_missing_values(self, default=ValueError()):
         """The missing value indicators from the netCDF variable.
 
         .. versionadded:: (cfdm) 1.10.0.3
 
+        :Parameters:
+
+            default: optional
+                Return the value of the *default* parameter if no
+                missing values have yet been defined. 
+ + {{default Exception}} + :Returns: `dict` or `None` @@ -189,9 +198,12 @@ def get_missing_values(self): **Examples** - >>> a.get_missing_values() + >>> a.get_missing_values(None) None + >>> b.get_missing_values({}) + {} + >>> b.get_missing_values() {} @@ -202,11 +214,30 @@ def get_missing_values(self): {'valid_min': -999} """ - out = self._get_component("missing_values", None) - if out is None: - return - - return out.copy() + attributes = self._get_component("attributes", None) + if attributes is None: + if default is None: + return + + return self._default( + default, + f"{self.__class__.__name__} missing values have not been set", + ) + + missing = {} + for attr in ( + "_FillValue", + "_Unsigned", + "missing_value", + "valid_min", + "valid_max", + "valid_range", + ): + value = attributes.get(attr) + if value is not None: + missing[attr] = deepcopy(value) + + return missing def get_unpack(self): """Whether or not to automatically unpack the data. diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index 473020167..d60127658 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -24,7 +24,7 @@ def __init__( unpack=True, units=False, calendar=False, - missing_values=None, + attributes=None, storage_options=None, source=None, copy=True, @@ -166,9 +166,9 @@ def __init__( calendar = False try: - missing_values = source._get_component("missing_values", None) + attributes = source._get_component("attributes", None) except AttributeError: - missing_values = None + attributes = None try: storage_options = source._get_component( @@ -196,17 +196,13 @@ def __init__( self._set_component("address", address, copy=False) - if missing_values is not None: - self._set_component( - "missing_values", missing_values.copy(), copy=False - ) - self._set_component("dtype", dtype, copy=False) self._set_component("mask", bool(mask), copy=False) self._set_component("unpack", bool(unpack), copy=False) self._set_component("units", units, copy=False) self._set_component("calendar", calendar, copy=False) self._set_component("storage_options", storage_options, copy=False) + self._set_component("attributes", attributes, copy=False) # By default, close the netCDF file after data array access self._set_component("close", True, copy=False) @@ -258,6 +254,9 @@ def __getitem__(self, indices): ) array = array[indices] + # Set the units, if they haven't been set already. + self._set_attributes(variable) + # Set the units, if they haven't been set already. self._set_units(variable) @@ -286,25 +285,37 @@ def __str__(self): """ return f"{self.get_filename(None)}, {self.get_address()}" - def _get_attr(self, var, attr): - """Get a variable attribute. + def _set_attributes(self, var): + """The units and calendar properties. - .. versionadded:: (cfdm) HDFVER + These are set from the netCDF variable attributes, but only if + they have already not been defined, either during {{class}} + instantiation or by a previous call to `_set_units`. - :Parameters: + .. versionadded:: (cfdm) 1.10.0.1 - var: `netCDF.Variable` - The variable + :Parameters: - attr: `str` - The attribute name. + var: `netCDF4.Variable` or `h5netcdf.Variable` + The variable containing the units and calendar + definitions. :Returns: - The attirbute value. + `tuple` + The units and calendar values, either of which may be + `None`. """ - return var.getncattr(attr) + # Note: Can't use None as the default since it is a valid + # `units` or 'calendar' value that indicates that the + # attribute has not been set in the dataset. 
+ attributes = self._get_component("attributes", None) + if attributes is not None: + return + + attributes = {attr: var.getncattr(attr) for attr in var.ncattrs()} + self._set_component("attributes", attributes, copy=False) def get_groups(self, address): """The netCDF4 group structure of a netCDF variable. diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index aa769bd19..4c26cde69 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -175,7 +175,7 @@ def __getitem__(self, index): data = data.view(dtype_unsigned_int) if self.mask: - attrs = self._set_FillValue(variable, attrs) + # attrs = self._set_FillValue(variable, attrs) data = self._mask( data, dtype, @@ -291,7 +291,7 @@ def _default_FillValue(self, dtype): .. versionadded:: (cfdm) HDFVER - .. seealso:: `_set_FillValue`, `netCDF4.default_fillvals` + .. seealso:: `netCDF4.default_fillvals` :Parameter: diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 9772bf680..42b03f934 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -6191,26 +6191,6 @@ def _create_netcdfarray( "calendar" ) - # Store the missing value indicators - missing_values = {} - for attr in ( - "missing_value", - "_FillValue", - "valid_min", - "valid_max", - "valid_range", - ): - value = getattr(variable, attr, None) - if value is not None: - missing_values[attr] = value - - valid_range = missing_values.get("valid_range") - if valid_range is not None: - try: - missing_values["valid_range"] = tuple(valid_range) - except TypeError: - pass - kwargs = { "filename": filename, "address": ncvar, @@ -6220,13 +6200,10 @@ def _create_netcdfarray( "unpack": g["unpack"], "units": units, "calendar": calendar, - "missing_values": missing_values, + "attributes": g["variable_attributes"][ncvar], + "storage_options": g["file_system_storage_options"].get(filename), } - storage_options = g["file_system_storage_options"].get(filename) - if storage_options is not None: - kwargs["storage_options"] = storage_options - if return_kwargs_only: return kwargs diff --git a/cfdm/test/test_NetCDF4Array.py b/cfdm/test/test_NetCDF4Array.py index 3b6a2f887..6008b546f 100644 --- a/cfdm/test/test_NetCDF4Array.py +++ b/cfdm/test/test_NetCDF4Array.py @@ -79,7 +79,7 @@ def test_NetCDF4Array_get_missing_values(self): f.set_property("missing_value", -999) f.set_property("_FillValue", -3) - f.set_property("valid_range", [-111, 222]) + f.set_property("valid_min", -111) cfdm.write(f, tmpfile) g = cfdm.read(tmpfile)[0] @@ -88,7 +88,7 @@ def test_NetCDF4Array_get_missing_values(self): { "missing_value": -999.0, "_FillValue": -3, - "valid_range": (-111, 222), + "valid_min": -111, }, ) @@ -96,7 +96,7 @@ def test_NetCDF4Array_get_missing_values(self): self.assertEqual(c.data.source().get_missing_values(), {}) a = cfdm.NetCDF4Array("file.nc", "ncvar") - self.assertIsNone(a.get_missing_values()) + self.assertIsNone(a.get_missing_values(None)) def test_NetCDF4Array_mask(self): """Test NetCDF4Array masking.""" @@ -190,6 +190,25 @@ def test_NetCDF4Array_get_storage_options(self): {"anon": True, "endpoint_url": None}, ) + def test_NetCDF4Array_get_attributes(self): + """Test NetCDF4Array get_attributes.""" + f = cfdm.example_field(0) + cfdm.write(f, tmpfile) + n = cfdm.NetCDF4Array(tmpfile, f.nc_get_variable(), shape=f.shape) + self.assertIsNone(n.get_attributes()) + + _ = n[...] 
+ self.assertEqual( + n.get_attributes(), + { + "cell_methods": "area: mean", + "coordinates": "time", + "project": "research", + "standard_name": "specific_humidity", + "units": "1", + }, + ) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) From 92f59eb17e51e6c8b3e88aad95866f328f0c72d0 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 9 Feb 2024 16:32:28 +0000 Subject: [PATCH 33/88] dev --- cfdm/data/netcdfindexer.py | 253 +++++++++++++++++++------------------ 1 file changed, 129 insertions(+), 124 deletions(-) diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 4c26cde69..0403e109f 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -1,3 +1,22 @@ +"""License information: + +Substantial portions of this code were adapted from the `netCDF4` +library, which carries MIT License as follows: + +Copyright 2008 Jeffrey Whitaker + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +""" import logging import netCDF4 @@ -24,12 +43,12 @@ class NetCDFIndexer: which are either provided as part of the input *data* object, or given with the input *attrs* parameter. - The netCDF attributes, all of which are optional, used are: + The relevant netCDF attributes that may be used are: - * For masking: ``_FillValue``, ``missing_value``, ``_Unsigned``, - ``valid_max``, ``valid_min``, and ``valid_range`` + * For masking: ``missing_value``, ``valid_max``, ``valid_min``, + ``valid_range``, ``_FillValue``, ``_Unsigned`` - * For unpacking: ``add_offset`` and ``scale_factor`` + * For unpacking: ``add_offset``, ``scale_factor``, ``_Unsigned`` Adapted from `netCDF4`. @@ -60,15 +79,18 @@ class NetCDFIndexer: [243.1, 241.7, 240.4]] >>> import numpy as np - >>> n = np.arange(9) + >>> n = np.arange(7) >>> x = cfdm.{{class}}(n) >>> x.shape (9,) >>> print(x[...]) - [1 2 3 4 5 6 7 8] + [0 1 2 3 4 5 6] >>> x = cfdm.{{class}}(n, attrs={'_FillValue': 4}) >>> print(x[...]) - [1 2 3 -- 5 6 7 8] + [0 1 2 3 -- 5 6] + >>> x = cfdm.{{class}}(n, mask=False, attrs={'_FillValue': 4}) + >>> print(x[...]) + [0 1 2 3 4 5 6] """ @@ -79,25 +101,24 @@ def __init__( :Parameters: - variable: - The variable to be indexed, one of `netCDF4.Variable`, - `h5netcdf.Variable`, or `numpy.ndarray`. Any masking - and unpacking that could be implemented by applied by - the *variable* itself is disabled, i.e. Any masking - and unpacking is always applied by the - `NetCDFIndexer` instance. + variable: `netCDF4.Variable` or `h5netcdf.Variable` or `numpy.ndarray` + The variable to be indexed. Any masking and unpacking + that could be applied by applied by the *variable* + itself is disabled, i.e. Any masking and unpacking is + always done by the `NetCDFIndexer` instance. mask: `bool` If True, the default, then an array returned by - indexing is automatically converted to a masked array - when missing values or fill values are present. + indexing is automatically masked. 
Masking is governed + by the ``missing_value``, ``valid_max``, + ``valid_min``, ``valid_range``, ``_FillValue``, and + ``_Unsigned`` attributes. unpack: `bool` - If True, the default, then the ``scale_factor`` and - ``add_offset`` are applied to an array returned by - indexing, and signed integer data is automatically - converted to unsigned integer data if the - ``_Unsigned`` attribute is set to "true" or "True". + If True, the default, then an array returned by + indexing is automatically unpacked. Unpacking is + governed by the ``_Unsigned``, ``add_offset``, and + ``scale_factor`` attributes. always_mask: `bool` If False, the default, then an array returned by @@ -110,6 +131,8 @@ def __init__( Provide the netCDF attributes of the *variable* as dictionary key/value pairs. If *attrs* is set then any netCDF attributes stored by *variable* itself are + ignored. Only the attributes relevant to masking and + unpacking are considers, and all other attributes are ignored. """ @@ -143,7 +166,7 @@ def __getitem__(self, index): pass else: netCDF4_mask = variable.mask - # Prevent netCDF4 from doing any masking and scaling + # Prevent netCDF4 from doing any masking and unpacking variable.set_auto_maskandscale(False) # Index the variable @@ -175,14 +198,7 @@ def __getitem__(self, index): data = data.view(dtype_unsigned_int) if self.mask: - # attrs = self._set_FillValue(variable, attrs) - data = self._mask( - data, - dtype, - attrs, - unpack=unpack, - dtype_unsigned_int=dtype_unsigned_int, - ) + data = self._mask(data, dtype, attrs, dtype_unsigned_int) if unpack: data = self._unpack(data, attrs) @@ -247,45 +263,6 @@ def _check_safecast(self, attname, dtype, attrs): return is_safe, attvalue - def _set_FillValue(self, variable, attrs): - """Set the ``_FillValue`` from a `h5netcdf.Variable`. - - If the attributes already contain a ``_FillValue`` then - nothing is done. - - .. versionadded:: (cfdm) HDFVER - - .. seealso:: `_default_FillValue` - - :Parameter: - - variable: `h5netcdf.Variable` - The variable. - - attrs: `dict` - The variable attributes. Will get updated in-place if - a ``_FillValue`` is found. - - :Returns: - - `dict` - The variable attributes, updated with ``_FillValue`` - if present and not previously set. - - """ - if "_FillValue" not in attrs: - try: - # h5netcdf - _FillValue = getattr(variable._h5ds, "fillvalue", None) - except AttributeError: - # netCDf4 - pass - else: - if _FillValue is not None: - attrs["_FillValue"] = _FillValue - - return attrs - def _default_FillValue(self, dtype): """Return the default ``_FillValue`` for the given data type. @@ -308,14 +285,7 @@ def _default_FillValue(self, dtype): else: return _default_fillvals[dtype.str[1:]] - def _mask( - self, - data, - dtype, - attrs, - unpack=True, - dtype_unsigned_int=None, - ): + def _mask(self, data, dtype, attrs, dtype_unsigned_int): """Mask the data. .. versionadded:: (cfdm) HDFVER @@ -333,35 +303,34 @@ def _mask( attrs: `dict` The variable attributes. - unpack: `bool` - Whether the data is to be unpacked. - dtype_unsigned_int: `dtype` or `None` - The data type to which unsigned integer data has been - cast. Should be `None` for data that are not unsigned - integers. + The data type when the data have been cast to unsigned + integers, otherwise `None`. :Returns: `nump.ndarray` - The masked (but not unpacked) data. + The masked data. 
""" - totalmask = np.zeros(data.shape, np.bool_) + # + totalmask = None + # The fill value for the returned numpy array fill_value = None safe_missval, missing_value = self._check_safecast( "missing_value", dtype, attrs ) if safe_missval: + # -------------------------------------------------------- + # Create mask from missing_value + # -------------------------------------------------------- mval = np.array(missing_value, dtype) - if unpack and dtype_unsigned_int is not None: + if dtype_unsigned_int is not None: mval = mval.view(dtype_unsigned_int) - # Create mask from missing values. - mvalmask = np.zeros(data.shape, np.bool_) - if not mval.ndim: # mval a scalar. - mval = (mval,) # Make into iterable. + if not mval.ndim: + mval = (mval,) for m in mval: try: @@ -371,15 +340,18 @@ def _mask( mvalisnan = False if mvalisnan: - mvalmask += np.isnan(data) + mask = np.isnan(data) else: - mvalmask += data == m + mask = data == m + + if mask.any(): + if totalmask is None: + totalmask = mask + else: + totalmask += mask - if mvalmask.any(): - # Set fill_value for masked array to missing_value (or - # first element if missing_value is a vector). + if totalmask is not None: fill_value = mval[0] - totalmask += mvalmask # Set mask=True for data == fill value safe_fillval, _FillValue = self._check_safecast( @@ -390,8 +362,11 @@ def _mask( safe_fillval = True if safe_fillval: + # -------------------------------------------------------- + # Create mask from _FillValue + # -------------------------------------------------------- fval = np.array(_FillValue, dtype) - if unpack and dtype_unsigned_int is not None: + if dtype_unsigned_int is not None: fval = fval.view(dtype_unsigned_int) try: @@ -402,16 +377,17 @@ def _mask( if fvalisnan: mask = np.isnan(data) - elif (data == fval).any(): - mask = data == fval else: - mask = None + mask = data == fval - if mask is not None: + if mask.any(): if fill_value is None: fill_value = fval - totalmask += mask + if totalmask is None: + totalmask = mask + else: + totalmask += mask # Set mask=True for data outside [valid_min, valid_max] # @@ -439,14 +415,17 @@ def _mask( if safe_validmax: validmax = np.array(valid_max, dtype) - if unpack: - if validmin is not None and dtype_unsigned_int is not None: + if dtype_unsigned_int is not None: + if validmin is not None: validmin = validmin.view(dtype_unsigned_int) - if validmax is not None and dtype_unsigned_int is not None: + if validmax is not None: validmax = validmax.view(dtype_unsigned_int) if dtype.kind != "S": + # -------------------------------------------------------- + # Create mask from valid_min. valid_max, valid_range + # -------------------------------------------------------- # Don't set validmin/validmax mask for character data # # Setting valid_min/valid_max to the _FillVaue is too @@ -454,19 +433,32 @@ def _mask( # attribute best practices suggesting clients should do # this). 
             if validmin is not None:
-                totalmask += data < validmin
+                mask = data < validmin
+                if totalmask is None:
+                    totalmask = mask
+                else:
+                    totalmask += mask
 
             if validmax is not None:
-                totalmask += data > validmax
+                mask = data > validmax
+                if totalmask is None:
+                    totalmask = mask
+                else:
+                    totalmask += mask
 
+        # ------------------------------------------------------------
         # Mask the data
-        if totalmask.any():
-            data = np.ma.masked_array(data, mask=totalmask, fill_value=fval)
+        # ------------------------------------------------------------
+        if totalmask is not None and totalmask.any():
+            data = np.ma.masked_array(
+                data, mask=totalmask, fill_value=fill_value
+            )
             if not data.ndim:
                 # Return a scalar numpy masked constant not a 0-d
                 # masked array, so that data == np.ma.masked.
                 data = data[()]
-        elif self.always_mask:
+        elif self.always_mask and not np.ma.isMA(data):
+            # Return a masked array when there are no masked elements
             data = np.ma.masked_array(data)
 
         return data
@@ -491,34 +483,47 @@ def _unpack(self, data, attrs):
         The unpacked data.
 
         """
-        # If variable has scale_factor and add_offset attributes,
-        # apply them.
         scale_factor = attrs.get("scale_factor")
         add_offset = attrs.get("add_offset")
         try:
             if scale_factor is not None:
                 float(scale_factor)
+        except ValueError:
+            logging.warn(
+                "No unpacking done: 'scale_factor' attribute "
+                f"{scale_factor!r} can't be converted to a float"
+            )  # pragma: no cover
+            return data
 
+        try:
             if add_offset is not None:
                 float(add_offset)
         except ValueError:
             logging.warn(
-                "Invalid scale_factor or add_offset attribute, "
-                "no unpacking done."
+                "No unpacking done: 'add_offset' attribute "
+                f"{add_offset!r} can't be converted to a float"
             )  # pragma: no cover
             return data
 
-        if scale_factor is not None and add_offset is not None:
-            if add_offset != 0.0 or scale_factor != 1.0:
-                data = data * scale_factor + add_offset
+        if scale_factor is not None:
+            if add_offset is not None:
+                # scale_factor and add_offset
+                if add_offset != 0.0 or scale_factor != 1.0:
+                    data = data * scale_factor + add_offset
+                else:
+                    data = data.astype(np.array(scale_factor).dtype)
+            else:
+                # scale_factor with no add_offset. Note: scale_factor
+                # may be a Python float, so go via np.array to find
+                # its dtype.
+                if scale_factor != 1.0:
+                    data = data * scale_factor
+                else:
+                    data = data.astype(np.array(scale_factor).dtype)
+        elif add_offset is not None:
+            # add_offset with no scale_factor
+            if add_offset != 0.0:
+                data = data + add_offset
             else:
-                data = data.astype(scale_factor.dtype)
-        elif scale_factor is not None and scale_factor != 1.0:
-            # If variable has only scale_factor attribute, rescale.
-            data = data * scale_factor
-        elif add_offset is not None and add_offset != 0.0:
-            # If variable has only add_offset attribute, add offset. 
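
A minimal sketch of the CF unpacking arithmetic that the branching
above implements: unpacked = packed * scale_factor + add_offset, with
scale_factor defaulting to 1.0 and add_offset to 0.0 when absent. The
variable names here are illustrative only and not part of the patch:

    import numpy as np

    # Data packed into 16-bit integers, with CF packing attributes
    packed = np.array([0, 100, 200], dtype="int16")
    scale_factor = 0.01
    add_offset = 273.15

    # CF unpacking: unpacked = packed * scale_factor + add_offset
    unpacked = packed * scale_factor + add_offset
    print(unpacked)  # [273.15 274.15 275.15]

When both attributes take their default values no arithmetic is done,
but, as in the new code above, the data are still cast to the
attribute's type so that the returned array has a consistent dtype.
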
- data = data + add_offset + data = data.astype(np.array(add_offset).dtype) return data From 59669a1c8536be0a6b8509458225ca4958f4d5cd Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 12 Feb 2024 17:49:19 +0000 Subject: [PATCH 34/88] dev --- Changelog.rst | 10 +- cfdm/__init__.py | 1 + cfdm/cfdmimplementation.py | 4 +- cfdm/data/__init__.py | 1 + cfdm/data/data.py | 4 +- cfdm/data/h5netcdfarray.py | 33 +- cfdm/data/mixin/arraymixin.py | 16 - cfdm/data/mixin/filearraymixin.py | 49 +- cfdm/data/mixin/netcdffilemixin.py | 61 +- cfdm/data/netcdf4array.py | 58 +- cfdm/data/netcdfarray.py | 287 +-------- cfdm/data/netcdfindexer.py | 114 ++-- cfdm/docstring/docstring.py | 17 + cfdm/functions.py | 3 - cfdm/read_write/netcdf/flatten/config.py | 115 ++-- cfdm/read_write/netcdf/flatten/flatten.py | 719 ++++++++++++++++------ cfdm/read_write/netcdf/netcdfread.py | 93 +-- cfdm/read_write/read.py | 26 +- cfdm/test/test_NetCDF4Array.py | 31 +- cfdm/test/test_NetCDFIndexer.py | 2 +- docs/source/class.rst | 4 +- docs/source/class/cfdm.H5netcdfArray.rst | 123 ++++ docs/source/class/cfdm.NetCDF4Array.rst | 123 ++++ docs/source/class/cfdm.NetCDFArray.rst | 108 ---- docs/source/class/cfdm.NetCDFIndexer.rst | 42 ++ docs/source/conf.py | 1 + docs/source/extensions.rst | 2 +- docs/source/introduction.rst | 46 +- docs/source/spelling_false_positives.txt | 8 +- docs/source/tutorial.py | 4 +- docs/source/tutorial.rst | 23 +- release_docs | 6 + setup.py | 1 + 33 files changed, 1233 insertions(+), 902 deletions(-) create mode 100644 docs/source/class/cfdm.H5netcdfArray.rst create mode 100644 docs/source/class/cfdm.NetCDF4Array.rst delete mode 100644 docs/source/class/cfdm.NetCDFArray.rst create mode 100644 docs/source/class/cfdm.NetCDFIndexer.rst diff --git a/Changelog.rst b/Changelog.rst index 12502a606..1e0b0f08c 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -3,13 +3,17 @@ Version 1.11.1.0 **2024-??-??** +* Allow access to netCDF-4 files in S3 object stores + (https://github.com/NCAS-CMS/cfdm/issues/285) +* Refactored the flattening of netCDF-4 groups + (https://github.com/NCAS-CMS/cfdm/issues/286) * New dependency: ``h5netcdf>=1.3.0`` -* New dependency: ``h5py>=3.10.`` +* New dependency: ``h5py>=3.10.0`` * New dependency: ``s3fs>=2024.2.0`` * Removed dependency: ``netcdf_flattener`` ----- - +---- + Version 1.11.0.0 ---------------- diff --git a/cfdm/__init__.py b/cfdm/__init__.py index 23eb0e53e..db85621a9 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -184,6 +184,7 @@ Data, GatheredArray, H5netcdfArray, + NetCDFArray, NetCDF4Array, NetCDFIndexer, NumpyArray, diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index 7eda9fcd6..e60bbf0b4 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -2300,7 +2300,7 @@ def initialise_NetCDF4Array(self, **kwargs): kwargs: optional Initialisation parameters to pass to the new instance. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Returns: @@ -2313,7 +2313,7 @@ def initialise_NetCDF4Array(self, **kwargs): def initialise_H5netcdfArray(self, **kwargs): """Return a `H5netcdfArray` instance. - .. versionadded:: (cfdm) HDFVER + .. 
versionadded:: (cfdm) 1.11.1.0 :Parameters: diff --git a/cfdm/data/__init__.py b/cfdm/data/__init__.py index cb647802c..bcbece15c 100644 --- a/cfdm/data/__init__.py +++ b/cfdm/data/__init__.py @@ -19,6 +19,7 @@ from .cellconnectivityarray import CellConnectivityArray from .gatheredarray import GatheredArray from .h5netcdfarray import H5netcdfArray +from .netcdfarray import NetCDFArray from .netcdf4array import NetCDF4Array from .netcdfindexer import NetCDFIndexer from .numpyarray import NumpyArray diff --git a/cfdm/data/data.py b/cfdm/data/data.py index 3f3315096..479e97d3c 100644 --- a/cfdm/data/data.py +++ b/cfdm/data/data.py @@ -2217,7 +2217,7 @@ def maximum(self, axes=None, squeeze=False): size one. With this option, the result will broadcast correctly against the original data. - .. versionaded:: (cfdm) HDFVER + .. versionaded:: (cfdm) 1.11.1.0 :Returns: @@ -2443,7 +2443,7 @@ def sum(self, axes=None, squeeze=False): size one. With this option, the result will broadcast correctly against the original data. - .. versionaded:: (cfdm) HDFVER + .. versionaded:: (cfdm) 1.11.1.0 :Returns: diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index bd245f757..8d3645def 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -15,9 +15,9 @@ class H5netcdfArray(NetCDFFileMixin, FileArrayMixin, abstract.Array): - """An underlying array stored in a netCDF HDF file. + """A netCDF array accessed with `h5netcdf`. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 """ @@ -89,15 +89,8 @@ def __init__( applicable. If unset then the calendar will be set during the first `__getitem__` call. - missing_values: `dict`, optional - The missing value indicators defined by the variable - attributes. See `get_missing_values` for details. - - {{init storage_options: `dict` or `None`, optional}} - .. versionadded:: (cfdm) HDFVER - {{init source: optional}} {{init copy: `bool`, optional}} @@ -193,7 +186,7 @@ def __getitem__(self, indices): x.__getitem__(indices) <==> x[indices] - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 """ dataset, address = self.open() @@ -228,25 +221,23 @@ def __getitem__(self, indices): return array def _set_attributes(self, var): - """TODOHDF The units and calendar properties. + """Set the netCDF variable attributes. These are set from the netCDF variable attributes, but only if - they have already not been defined, either during {{class}} - instantiation or by a previous call to `_set_units`. + they have not already been defined, either during {{class}} + instantiation or by a previous call to `_set_attributes`. - .. versionadded:: (cfdm) 1.10.0.1 + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: - var: `netCDF4.Variable` or `h5netcdf.Variable` - The variable containing the units and calendar - definitions. + var: `h5netcdf.Variable` + The netCDF variable. :Returns: - `tuple` - The units and calendar values, either of which may be - `None`. + `dict` + The attributes. """ attributes = self._get_component("attributes", None) @@ -259,7 +250,7 @@ def _set_attributes(self, var): def close(self, dataset): """Close the dataset containing the data. - .. versionadded:: (cfdm) HDFVER + .. 
versionadded:: (cfdm) 1.11.1.0
 
         :Parameters:
 
diff --git a/cfdm/data/mixin/arraymixin.py b/cfdm/data/mixin/arraymixin.py
index 5dc42a5a7..901087744 100644
--- a/cfdm/data/mixin/arraymixin.py
+++ b/cfdm/data/mixin/arraymixin.py
@@ -1,5 +1,3 @@
-from copy import deepcopy
-
 import numpy as np
 
 
@@ -103,14 +101,6 @@ def _set_units(self):
 
         return units, calendar
 
-    def get_attributes(self, default=ValueError()):
-        """TODOHDF."""
-        attributes = self._get_component("attributes", None)
-        if attributes is not None:
-            attributes = deepcopy(attributes)
-
-        return attributes
-
     def get_calendar(self, default=ValueError()):
         """The calendar of the array.
 
@@ -172,12 +162,6 @@ def get_compression_type(self):
         """
         return self._get_component("compression_type", "")
 
-    def get_missing_values(self):
-        """TODOHDF."""
-        raise NotImplementedError(
-            f"Must implement {self.__class__.__name__}.get_missing_values"
-        )  # pragma: no cover
-
     @classmethod
     def get_subspace(cls, array, indices, copy=True):
         """Return a subspace, defined by indices, of a numpy array.
diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py
index 5190f790c..67ee7a305 100644
--- a/cfdm/data/mixin/filearraymixin.py
+++ b/cfdm/data/mixin/filearraymixin.py
@@ -6,6 +6,12 @@
 from ...functions import abspath
 
 
+class DeprecationError(Exception):
+    """Deprecation error."""
+
+    pass
+
+
 class FileArrayMixin:
     """Mixin class for a file container of an array.
 
@@ -105,6 +111,35 @@ def get_addresses(self):
         """
         return self._get_component("address", ())
 
+    def get_attributes(self, default=ValueError()):
+        """The attributes of the array.
+
+        .. versionadded:: (cfdm) 1.11.1.0
+
+        :Parameters:
+
+            default: optional
+                Return the value of the *default* parameter if the
+                attributes have not been set. If set to an `Exception`
+                instance then it will be raised instead.
+
+        :Returns:
+
+                The attributes.
+
+        """
+        attributes = self._get_component("attributes", None)
+        if attributes is None:
+            if default is None:
+                return
+
+            return self._default(
+                default,
+                f"{self.__class__.__name__} attributes have not yet been set",
+            )
+
+        return deepcopy(attributes)
+
     def get_filename(self, default=AttributeError()):
         """The name of the file containing the array.
 
@@ -190,12 +225,24 @@ def get_formats(self):
         """
         return (self.get_format(),) * len(self.get_filenames())
 
+    def get_missing_values(self):
+        """The missing values of the data.
+
+        Deprecated at version 1.11.1.0. Use `get_attributes` instead.
+
+        """
+        raise DeprecationError(
+            f"{self.__class__.__name__}.get_missing_values was deprecated "
+            "at version 1.11.1.0 and is no longer available. "
+            f"Use {self.__class__.__name__}.get_attributes instead."
+        )  # pragma: no cover
+
     def get_storage_options(
         self, create_endpoint_url=True, filename=None, parsed_filename=None
     ):
         """Return `s3fs.S3FileSystem` options for accessing S3 files.
 
-        .. versionadded:: (cfdm) HDFVER
+        .. versionadded:: (cfdm) 1.11.1.0
 
         :Parameters:
 
diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py
index 37461ec9b..376acf809 100644
--- a/cfdm/data/mixin/netcdffilemixin.py
+++ b/cfdm/data/mixin/netcdffilemixin.py
@@ -1,19 +1,23 @@
-from copy import deepcopy
-
 from ..numpyarray import NumpyArray
 
 
+class DeprecationError(Exception):
+    """Deprecation error."""
+
+    pass
+
+
 class NetCDFFileMixin:
     """Mixin class for netCDF file arrays.
 
-    .. versionadded:: (cfdm) HDFVER
+    .. versionadded:: (cfdm) 1.11.1.0
 
     """
 
     def _group(self, dataset, groups):
         """Return the group object containing a variable.
 
-        .. versionadded:: (cfdm) HDFVER
+        .. versionadded:: (cfdm) 1.11.1.0
 
         :Parameters:
 
@@ -39,25 +43,23 @@ def _set_attributes(self, var):
-        """The units and calendar properties.
+        """Set the netCDF variable attributes.
 
         These are set from the netCDF variable attributes, but only if
-        they have already not been defined, either during {{class}}
-        instantiation or by a previous call to `_set_units`.
+        they have not already been defined, either during {{class}}
+        instantiation or by a previous call to `_set_attributes`.
 
-        .. versionadded:: (cfdm) 1.10.0.1
+        .. versionadded:: (cfdm) 1.11.1.0
 
         :Parameters:
 
             var: `netCDF4.Variable` or `h5netcdf.Variable`
-                The variable containing the units and calendar
-                definitions.
+                The netCDF variable.
 
         :Returns:
 
-            `tuple`
-                The units and calendar values, either of which may be
-                `None`.
+            `dict`
+                The attributes.
 
         """
         raise NotImplementedError(
@@ -177,6 +179,8 @@ def get_mask(self):
     def get_missing_values(self, default=ValueError()):
         """The missing value indicators from the netCDF variable.
 
+        Deprecated at version 1.11.1.0. Use `get_attributes` instead.
+
         .. versionadded:: (cfdm) 1.10.0.3
 
         :Parameters:
 
@@ -214,35 +218,16 @@ def get_missing_values(self, default=ValueError()):
             {'valid_min': -999}
 
         """
-        attributes = self._get_component("attributes", None)
-        if attributes is None:
-            if default is None:
-                return
-
-            return self._default(
-                default,
-                f"{self.__class__.__name__} missing values have not been set",
-            )
-
-        missing = {}
-        for attr in (
-            "_FillValue",
-            "_Unsigned",
-            "missing_value",
-            "valid_min",
-            "valid_max",
-            "valid_range",
-        ):
-            value = attributes.get(attr)
-            if value is not None:
-                missing[attr] = deepcopy(value)
-
-        return missing
+        raise DeprecationError(
+            f"{self.__class__.__name__}.get_missing_values was deprecated "
+            "at version 1.11.1.0 and is no longer available. "
+            f"Use {self.__class__.__name__}.get_attributes instead."
+        )
 
     def get_unpack(self):
         """Whether or not to automatically unpack the data.
 
-        .. versionadded:: (cfdm) HDFVER
+        .. versionadded:: (cfdm) 1.11.1.0
 
         **Examples**
 
diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py
index d60127658..72dffbf79 100644
--- a/cfdm/data/netcdf4array.py
+++ b/cfdm/data/netcdf4array.py
@@ -4,11 +4,9 @@
 from .mixin import FileArrayMixin, NetCDFFileMixin
 from .netcdfindexer import NetCDFIndexer
 
-# import numpy as np
-
 
 class NetCDF4Array(NetCDFFileMixin, FileArrayMixin, abstract.Array):
-    """An underlying array stored in a netCDF file.
+    """A netCDF array accessed with `netCDF4`.
 
     .. versionadded:: (cfdm) 1.7.0
 
@@ -57,26 +55,13 @@ def __init__(
             ndim: `int`
                 The number of array dimensions in the netCDF file.
 
-            mask: `bool`
-                If True (the default) then mask by convention when
-                reading data from disk.
-
-                A netCDF array is masked depending on the values of
-                any of the netCDF attributes ``_FillValue``,
-                ``missing_value``, ``_Unsigned``, ``valid_min``,
-                ``valid_max``, and ``valid_range``.
+            {{init mask: `bool`, optional}}
 
                 .. versionadded:: (cfdm) 1.8.2
 
-            unpack: `bool`
-                If True (the default) then unpack by convention when
-                reading data from disk.
-
-                A netCDF array is unpacked depending on the values of
-                the netCDF attributes ``add_offset`` and
-                ``scale_factor``.
-
-                .. versionadded:: (cfdm) HDFVER
+            {{init unpack: `bool`, optional}}
+
+                .. versionadded:: (cfdm) 1.11.1.0
 
             units: `str` or `None`, optional
                 The units of the netCDF variable. Set to `None` to
@@ -93,16 +78,9 @@ def __init__(
 
                 .. versionadded:: (cfdm) 1.10.0.1
 
-            missing_values: `dict`, optional
-                The missing value indicators defined by the netCDF
-                variable attributes. See `get_missing_values` for
-                details.
-
-                .. versionadded:: (cfdm) 1.10.0.3
-
             {{init storage_options: `dict` or `None`, optional}}
 
-                .. versionadded:: (cfdm) HDFVER
+                .. versionadded:: (cfdm) 1.11.1.0
 
             {{init source: optional}}
 
@@ -112,6 +90,11 @@ def __init__(
 
                 .. versionadded:: (cfdm) 1.10.0.0
 
+            missing_values: Deprecated at version 1.11.1.0
+                The missing value indicators defined by the netCDF
+                variable attributes. They may now be recorded via the
+                *attributes* parameter.
+
             ncvar: Deprecated at version 1.10.1.0
                 Use the *address* parameter instead.
 
@@ -286,30 +269,25 @@ def __str__(self):
         return f"{self.get_filename(None)}, {self.get_address()}"
 
     def _set_attributes(self, var):
-        """The units and calendar properties.
+        """Set the netCDF variable attributes.
 
         These are set from the netCDF variable attributes, but only if
-        they have already not been defined, either during {{class}}
-        instantiation or by a previous call to `_set_units`.
+        they have not already been defined, either during {{class}}
+        instantiation or by a previous call to `_set_attributes`.
 
-        .. versionadded:: (cfdm) 1.10.0.1
+        .. versionadded:: (cfdm) 1.11.1.0
 
         :Parameters:
 
-            var: `netCDF4.Variable` or `h5netcdf.Variable`
-                The variable containing the units and calendar
-                definitions.
+            var: `netCDF4.Variable`
+                The netCDF variable.
 
         :Returns:
 
-            `tuple`
-                The units and calendar values, either of which may be
-                `None`.
+            `dict`
+                The attributes.
 
         """
-        # Note: Can't use None as the default since it is a valid
-        #       `units` or 'calendar' value that indicates that the
-        #       attribute has not been set in the dataset.
         attributes = self._get_component("attributes", None)
         if attributes is not None:
             return
diff --git a/cfdm/data/netcdfarray.py b/cfdm/data/netcdfarray.py
index 3da924756..5ab97ba8e 100644
--- a/cfdm/data/netcdfarray.py
+++ b/cfdm/data/netcdfarray.py
@@ -1,14 +1,14 @@
-import netCDF4
+class DeprecationError(Exception):
+    """Deprecation error."""
 
-from . import abstract
-from .mixin import FileArrayMixin, NetCDFFileMixin
-from .variableindexer import VariableIndexer
+    pass
 
-# import numpy as np
 
+class NetCDFArray:
+    """A netCDF array accessed with `netCDF4`.
 
-class NetCDF4Array(NetCDFFileMixin, FileArrayMixin, abstract.Array):
-    """An underlying array stored in a netCDF file.
+    Deprecated at version 1.11.1.0 and no longer available. Use
+    `cfdm.NetCDF4Array` instead.
 
     .. versionadded:: (cfdm) 1.7.0
 
@@ -21,9 +21,11 @@ def __init__(
         dtype=None,
         shape=None,
         mask=True,
+        unpack=True,
         units=False,
         calendar=False,
-        missing_values=None,
+        attributes=None,
+        storage_options=None,
         source=None,
         copy=True,
     ):
@@ -55,14 +57,7 @@ def __init__(
             ndim: `int`
                 The number of array dimensions in the netCDF file.
 
-            mask: `bool`
-                If True (the default) then mask by convention when
-                reading data from disk.
-
-                A netCDF array is masked depending on the values of any of
-                the netCDF variable attributes ``valid_min``,
-                ``valid_max``, ``valid_range``, ``_FillValue`` and
-                ``missing_value``.
+            {{init mask: `bool`, optional}}
 
                 .. versionadded:: (cfdm) 1.8.2
 
@@ -81,13 +76,6 @@ def __init__(
 
                 .. versionadded:: (cfdm) 1.10.0.1
 
-            missing_values: `dict`, optional
-                The missing value indicators defined by the netCDF
-                variable attributes. See `get_missing_values` for
-                details.
-
-                .. versionadded:: (cfdm) 1.10.0.3
-
             {{init source: optional}}
 
                 .. versionadded:: (cfdm) 1.10.0.0
 
@@ -96,6 +84,11 @@ def __init__(
 
                 .. 
versionadded:: (cfdm) 1.10.0.0 + missing_values: Deprecated at version 1.11.1.0 + The missing value indicators defined by the netCDF + variable attributes. The may now be recorded via the + *attributes* parameter + ncvar: Deprecated at version 1.10.1.0 Use the *address* parameter instead. @@ -106,249 +99,7 @@ def __init__( Use the *address* parameter instead. """ - super().__init__(source=source, copy=copy) - - if source is not None: - try: - shape = source._get_component("shape", None) - except AttributeError: - shape = None - - try: - filename = source._get_component("filename", None) - except AttributeError: - filename = None - - try: - address = source._get_component("address", None) - except AttributeError: - address = None - - try: - dtype = source._get_component("dtype", None) - except AttributeError: - dtype = None - - try: - mask = source._get_component("mask", True) - except AttributeError: - mask = True - - try: - units = source._get_component("units", False) - except AttributeError: - units = False - - try: - calendar = source._get_component("calendar", False) - except AttributeError: - calendar = False - - try: - missing_values = source._get_component("missing_values", None) - except AttributeError: - missing_values = None - - if shape is not None: - self._set_component("shape", shape, copy=False) - - if filename is not None: - if isinstance(filename, str): - filename = (filename,) - else: - filename = tuple(filename) - - self._set_component("filename", filename, copy=False) - - if address is not None: - if isinstance(address, (str, int)): - address = (address,) - else: - address = tuple(address) - - self._set_component("address", address, copy=False) - - if missing_values is not None: - self._set_component( - "missing_values", missing_values.copy(), copy=False - ) - - self._set_component("dtype", dtype, copy=False) - self._set_component("mask", mask, copy=False) - self._set_component("units", units, copy=False) - self._set_component("calendar", calendar, copy=False) - - # By default, close the netCDF file after data array access - self._set_component("close", True, copy=False) - - def __getitem__(self, indices): - """Returns a subspace of the array as a numpy array. - - x.__getitem__(indices) <==> x[indices] - - The indices that define the subspace must be either `Ellipsis` or - a sequence that contains an index for each dimension. In the - latter case, each dimension's index must either be a `slice` - object or a sequence of two or more integers. - - Indexing is similar to numpy indexing. The only difference to - numpy indexing (given the restrictions on the type of indices - allowed) is: - - * When two or more dimension's indices are sequences of integers - then these indices work independently along each dimension - (similar to the way vector subscripts work in Fortran). - - .. versionadded:: (cfdm) 1.7.0 - - """ - netcdf, address = self.open() - dataset = netcdf - - mask = self.get_mask() - groups, address = self.get_groups(address) - - if groups: - # Traverse the group structure, if there is one (CF>=1.8). - netcdf = self._group(netcdf, groups) - - if isinstance(address, str): - # Get the variable by netCDF name - variable = netcdf.variables[address] - else: - # Get the variable by netCDF integer ID - for variable in netcdf.variables.values(): - if variable._varid == address: - break - - # Get the data, applying masking and scaling as required. 
- array = VariableIndexer( - variable, mask=mask, scale=True, always_mask=False + raise DeprecationError( + f"{self.__class__.__name__} was deprecated at version 1.11.1.0 " + "and is no longer available. Use cfdm.NetCDF4Array instead." ) - array = array[indices] - - # Set the units, if they haven't been set already. - self._set_units(variable) - - self.close(dataset) - del netcdf, dataset - - if not self.ndim: - # Hmm netCDF4 has a thing for making scalar size 1, 1d - array = array.squeeze() - - return array - - def __repr__(self): - """Called by the `repr` built-in function. - - x.__repr__() <==> repr(x) - - """ - return f"<{self.__class__.__name__}{self.shape}: {self}>" - - def __str__(self): - """Called by the `str` built-in function. - - x.__str__() <==> str(x) - - """ - return f"{self.get_filename(None)}, {self.get_address()}" - - def _get_attr(self, var, attr): - """Get a variable attribute. - - .. versionadded:: (cfdm) HDFVER - - :Parameters: - - var: `netCDF.Variable` - The variable - - attr: `str` - The attribute name. - - :Returns: - - The attirbute value. - - """ - return var.getncattr(attr) - - def get_groups(self, address): - """The netCDF4 group structure of a netCDF variable. - - .. versionadded:: (cfdm) 1.8.6.0 - - :Parameters: - - address: `str` or `int` - The netCDF variable name, or integer varid, from which - to get the groups. - - .. versionadded:: (cfdm) 1.10.1.0 - - :Returns: - - (`list`, `str`) or (`list`, `int`) - The group structure and the name within the group. If - *address* is a varid then an empty list and the varid - are returned. - - **Examples** - - >>> n.get_groups('tas') - ([], 'tas') - - >>> n.get_groups('/tas') - ([], 'tas') - - >>> n.get_groups('/data/model/tas') - (['data', 'model'], 'tas') - - >>> n.get_groups(9) - ([], 9) - - """ - try: - if "/" not in address: - return [], address - except TypeError: - return [], address - - out = address.split("/")[1:] - return out[:-1], out[-1] - - def close(self, dataset): - """Close the dataset containing the data. - - .. versionadded:: (cfdm) 1.7.0 - - :Parameters: - - dataset: `netCDF4.Dataset` - The netCDF dataset to be be closed. - - :Returns: - - `None` - - """ - if self._get_component("close"): - dataset.close() - - def open(self): - """Return a dataset file object and address. - - When multiple files have been provided an attempt is made to - open each one, in the order stored, and a file object is - returned from the first file that exists. - - :Returns: - - (`netCDF4.Dataset`, `str`) - The open file object, and the address of the data - within the file. 
-
-
-        """
-        return super().open(netCDF4.Dataset, mode="r")
diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py
index 0403e109f..11c96bb42 100644
--- a/cfdm/data/netcdfindexer.py
+++ b/cfdm/data/netcdfindexer.py
@@ -1,10 +1,12 @@
 """License information:
 
-Substantial portions of this code were adapted from the `netCDF4`
-library, which carries MIT License as follows:
+Portions of this code were adapted from the `netCDF4` library, which
+is licensed under the MIT License:
 
 Copyright 2008 Jeffrey Whitaker
 
+https://opensource.org/license/mit
+
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
@@ -41,7 +43,7 @@ class NetCDFIndexer:
 
     Masking and unpacking operations are defined by netCDF attributes,
     which are either provided as part of the input *data* object, or
-    given with the input *attrs* parameter.
+    given with the input *attributes* parameter.
 
     The relevant netCDF attributes that may be used are:
 
@@ -50,15 +52,13 @@ class NetCDFIndexer:
 
     * For unpacking: ``add_offset``, ``scale_factor``, ``_Unsigned``
 
-    Adapted from `netCDF4`.
-
-    .. versionadded:: (cfdm) HDFVER
+    .. versionadded:: (cfdm) 1.11.1.0
 
     **Examples**
 
     >>> import netCDF4
     >>> nc = netCDF4.Dataset('file.nc', 'r')
-    >>> x = cfdm.{{class}}(nc.variables['x'])
+    >>> x = cfdm.NetCDFIndexer(nc.variables['x'])
     >>> x.shape
     (12, 64, 128)
     >>> print(x[0, 0:4, 0:3])
@@ -69,7 +69,7 @@ class NetCDFIndexer:
 
     >>> import h5netcdf
     >>> h5 = h5netcdf.File('file.nc', 'r')
-    >>> x = cfdm.{{class}}(h5.variables['x'])
+    >>> x = cfdm.NetCDFIndexer(h5.variables['x'])
     >>> x.shape
     (12, 64, 128)
     >>> print(x[0, 0:4, 0:3])
@@ -80,22 +80,27 @@ class NetCDFIndexer:
 
     >>> import numpy as np
     >>> n = np.arange(7)
-    >>> x = cfdm.{{class}}(n)
+    >>> x = cfdm.NetCDFIndexer(n)
     >>> x.shape
     (7,)
     >>> print(x[...])
     [0 1 2 3 4 5 6]
-    >>> x = cfdm.{{class}}(n, attrs={'_FillValue': 4})
+    >>> x = cfdm.NetCDFIndexer(n, attributes={'_FillValue': 4})
     >>> print(x[...])
     [0 1 2 3 -- 5 6]
-    >>> x = cfdm.{{class}}(n, mask=False, attrs={'_FillValue': 4})
+    >>> x = cfdm.NetCDFIndexer(n, mask=False, attributes={'_FillValue': 4})
     >>> print(x[...])
     [0 1 2 3 4 5 6]
 
     """
 
     def __init__(
-        self, variable, mask=True, unpack=True, always_mask=False, attrs=None
+        self,
+        variable,
+        mask=True,
+        unpack=True,
+        always_mask=False,
+        attributes=None,
     ):
         """**Initialisation**
 
@@ -127,21 +132,20 @@ def __init__(
                 indexing is always a masked array, even if there are
                 no missing values.
 
-            attrs: `dict`, optional
+            attributes: `dict`, optional
                 Provide the netCDF attributes of the *variable* as
-                dictionary key/value pairs. If *attrs* is set then any
-                netCDF attributes stored by *variable* itself are
-                ignored. Only the attributes relevant to masking and
-                unpacking are considers, and all other attributes are
-                ignored.
+                dictionary key/value pairs. If *attributes* is set
+                then any netCDF attributes stored by *variable* itself
+                are ignored. Only the attributes relevant to masking
+                and unpacking are considered, and all other attributes
+                are ignored.
 
         """
         self.variable = variable
         self.mask = mask
         self.unpack = unpack
         self.always_mask = always_mask
-        self._attrs = attrs
-        self.shape = variable.shape
+        self._attributes = attributes
 
     def __getitem__(self, index):
         """Return a subspace of the variable as a `numpy` array.
@@ -150,12 +154,12 @@ def __getitem__(self, index):
 
         Indexing follows the rules defined by the variable.
 
-        .. 
versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 """ variable = self.variable unpack = self.unpack - attrs = self.attrs() + attributes = self.attributes() dtype = variable.dtype netCDF4_scale = False @@ -189,7 +193,10 @@ def __getitem__(self, index): dtype_unsigned_int = None if unpack: - is_unsigned_int = attrs.get("_Unsigned", False) in ("true", "True") + is_unsigned_int = attributes.get("_Unsigned", False) in ( + "true", + "True", + ) if is_unsigned_int: data_dtype = data.dtype dtype_unsigned_int = ( @@ -198,10 +205,10 @@ def __getitem__(self, index): data = data.view(dtype_unsigned_int) if self.mask: - data = self._mask(data, dtype, attrs, dtype_unsigned_int) + data = self._mask(data, dtype, attributes, dtype_unsigned_int) if unpack: - data = self._unpack(data, attrs) + data = self._unpack(data, attributes) if data.dtype.kind == "S": data = data.astype("U", copy=False) @@ -215,13 +222,22 @@ def __getitem__(self, index): return data - def _check_safecast(self, attname, dtype, attrs): + @property + def shape(self): + """Tuple of the data dimension sizes. + + .. versionadded:: (cfdm) 1.11.1.0 + + """ + return self.variable.shape + + def _check_safecast(self, attname, dtype, attributes): """Check an attribute's data type. Checks to see that variable attribute exists and can be safely cast to variable data type. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameter: @@ -231,7 +247,7 @@ def _check_safecast(self, attname, dtype, attrs): dtype: `numpy.dtype` The variable data type. - attrs: `dict` + attributes: `dict` The variable attributes. :Returns: @@ -241,8 +257,8 @@ def _check_safecast(self, attname, dtype, attrs): with the variable data type, and the attribute value. """ - if attname in attrs: - attvalue = attrs[attname] + if attname in attributes: + attvalue = attributes[attname] att = np.array(attvalue) else: return False, None @@ -266,7 +282,7 @@ def _check_safecast(self, attname, dtype, attrs): def _default_FillValue(self, dtype): """Return the default ``_FillValue`` for the given data type. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 .. seealso:: `netCDF4.default_fillvals` @@ -285,10 +301,10 @@ def _default_FillValue(self, dtype): else: return _default_fillvals[dtype.str[1:]] - def _mask(self, data, dtype, attrs, dtype_unsigned_int): + def _mask(self, data, dtype, attributes, dtype_unsigned_int): """Mask the data. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameter: @@ -300,7 +316,7 @@ def _mask(self, data, dtype, attrs, dtype_unsigned_int): The data type of the variable (which may be different to that of *data*). - attrs: `dict` + attributes: `dict` The variable attributes. 
dtype_unsigned_int: `dtype` or `None` @@ -319,7 +335,7 @@ def _mask(self, data, dtype, attrs, dtype_unsigned_int): fill_value = None safe_missval, missing_value = self._check_safecast( - "missing_value", dtype, attrs + "missing_value", dtype, attributes ) if safe_missval: # -------------------------------------------------------- @@ -355,7 +371,7 @@ def _mask(self, data, dtype, attrs, dtype_unsigned_int): # Set mask=True for data == fill value safe_fillval, _FillValue = self._check_safecast( - "_FillValue", dtype, attrs + "_FillValue", dtype, attributes ) if not safe_fillval: _FillValue = self._default_FillValue(dtype) @@ -397,13 +413,13 @@ def _mask(self, data, dtype, attrs, dtype_unsigned_int): validmin = None validmax = None safe_validrange, valid_range = self._check_safecast( - "valid_range", dtype, attrs + "valid_range", dtype, attributes ) safe_validmin, valid_min = self._check_safecast( - "valid_min", dtype, attrs + "valid_min", dtype, attributes ) safe_validmax, valid_max = self._check_safecast( - "valid_max", dtype, attrs + "valid_max", dtype, attributes ) if safe_validrange and valid_range.size == 2: validmin = np.array(valid_range[0], dtype) @@ -463,10 +479,10 @@ def _mask(self, data, dtype, attrs, dtype_unsigned_int): return data - def _unpack(self, data, attrs): + def _unpack(self, data, attributes): """Unpack the data.. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameter: @@ -474,7 +490,7 @@ def _unpack(self, data, attrs): The unmasked and unpacked data indexed from the variable. - attrs: `dict` + attributes: `dict` The variable attributes. :Returns: @@ -483,8 +499,8 @@ def _unpack(self, data, attrs): The unpacked data. """ - scale_factor = attrs.get("scale_factor") - add_offset = attrs.get("add_offset") + scale_factor = attributes.get("scale_factor") + add_offset = attributes.get("add_offset") try: if scale_factor is not None: float(scale_factor) @@ -527,10 +543,10 @@ def _unpack(self, data, attrs): return data - def attrs(self): + def attributes(self): """Return the netCDF attributes of the variable. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Returns: @@ -539,13 +555,13 @@ def attrs(self): **Examples** - >>> v.attrs() + >>> v.attributes() {'standard_name': 'air_temperature', 'missing_value': -999.0} """ - if self._attrs is not None: - return self._attrs.copy() + if self._attributes is not None: + return self._attributes.copy() variable = self.variable try: diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index 5d1b9420d..46728c3e8 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -402,6 +402,23 @@ "{{init cell_dimension: `int`}}": """cell_dimension: `int` The position of the *data* dimension that indexes the cells, either ``0`` or ``1``.""", + # init mask + "{{init mask: `bool`, optional}}": """mask: `bool`, optional + If True (the default) then mask by convention when + reading data from disk. + + A netCDF array is masked depending on the values of + any of the netCDF attributes ``_FillValue``, + ``missing_value``, ``_Unsigned``, ``valid_min``, + ``valid_max``, and ``valid_range``.""", + # init unpack + "{{init unpack: `bool`, optional}}": """unpack: `bool`, optional + If True (the default) then unpack by convention when + reading data from disk. 
+ + A netCDF array is unpacked depending on the values of + the netCDF attributes ``add_offset`` and + ``scale_factor``.""", # init storage_options "{{init storage_options: `dict` or `None`, optional}}": """storage_options: `dict` or `None`, optional Key/value pairs to be passed on to the creation of an diff --git a/cfdm/functions.py b/cfdm/functions.py index 380671232..4d2880f3b 100644 --- a/cfdm/functions.py +++ b/cfdm/functions.py @@ -989,9 +989,6 @@ def __str__(self): """Called by the `str` built-in function.""" return str(self.value) - # ---------------------------------------------------------------- - # Methods - # ---------------------------------------------------------------- def copy(self): """Return a deep copy. diff --git a/cfdm/read_write/netcdf/flatten/config.py b/cfdm/read_write/netcdf/flatten/config.py index 427665fc4..b754e1030 100644 --- a/cfdm/read_write/netcdf/flatten/config.py +++ b/cfdm/read_write/netcdf/flatten/config.py @@ -1,6 +1,6 @@ """Configuration for netCDF group flattening. -.. versionadded:: (cfdm) HDFVER +.. versionadded:: (cfdm) 1.11.1.0 """ from dataclasses import dataclass @@ -35,17 +35,18 @@ @dataclass() -class AttributeFeatures: - """Data class that defines attribute flattening features. +class FlatteningRules: + """Define the flattening rules for a netCDF attribute. - For a named netCDF attribute, the features a define how the - contents of the attribute are flattened. + For a named netCDF attribute, the rules a define how the contents + of the attribute are flattened. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 """ - # name: The attribute name + # name: The name of attribute containing the reference to be + # flattened name: str # ref_to_dim: Positive integer if contains references to # dimensions (higher values have priority) @@ -76,22 +77,26 @@ class AttributeFeatures: # -------------------------------------------------------------------- -# Set flattening features for named CF attributes +# Set the flattening rules for named CF attributes # -------------------------------------------------------------------- -attribute_features = { +flattening_rules = { attr.name: attr for attr in ( + # ------------------------------------------------------------ # Coordinates - AttributeFeatures( + # ------------------------------------------------------------ + FlatteningRules( name="coordinates", ref_to_var=1, resolve_key=True, stop_at_local_apex=True, ), - AttributeFeatures(name="bounds", ref_to_var=1, resolve_key=True), - AttributeFeatures(name="climatology", ref_to_var=1, resolve_key=True), + FlatteningRules(name="bounds", ref_to_var=1, resolve_key=True), + FlatteningRules(name="climatology", ref_to_var=1, resolve_key=True), + # ------------------------------------------------------------ # Cell methods - AttributeFeatures( + # ------------------------------------------------------------ + FlatteningRules( name="cell_methods", ref_to_dim=2, ref_to_var=1, @@ -99,99 +104,113 @@ class AttributeFeatures: accept_standard_names=True, limit_to_scalar_coordinates=True, ), + # ------------------------------------------------------------ # Cell measures - AttributeFeatures( + # ------------------------------------------------------------ + FlatteningRules( name="cell_measures", ref_to_var=1, resolve_value=True ), + # ------------------------------------------------------------ # Coordinate references - AttributeFeatures( + # ------------------------------------------------------------ + FlatteningRules( name="formula_terms", ref_to_var=1, 
resolve_value=True ), - AttributeFeatures( + FlatteningRules( name="grid_mapping", ref_to_var=1, resolve_key=True, resolve_value=True, ), + # ------------------------------------------------------------ # Ancillary variables - AttributeFeatures( + # ------------------------------------------------------------ + FlatteningRules( name="ancillary_variables", ref_to_var=1, resolve_key=True ), + # ------------------------------------------------------------ # Compression by gathering - AttributeFeatures(name="compress", ref_to_dim=1, resolve_key=True), + # ------------------------------------------------------------ + FlatteningRules(name="compress", ref_to_dim=1, resolve_key=True), + # ------------------------------------------------------------ # Discrete sampling geometries - AttributeFeatures( + # ------------------------------------------------------------ + FlatteningRules( name="instance_dimension", ref_to_dim=1, resolve_key=True ), - AttributeFeatures( + FlatteningRules( name="sample_dimension", ref_to_dim=1, resolve_key=True ), + # ------------------------------------------------------------ # Domain variables - AttributeFeatures(name="dimensions", ref_to_dim=1, resolve_key=True), + # ------------------------------------------------------------ + FlatteningRules(name="dimensions", ref_to_dim=1, resolve_key=True), + # ------------------------------------------------------------ # Aggregation variables - AttributeFeatures( + # ------------------------------------------------------------ + FlatteningRules( name="aggregated_dimensions", ref_to_dim=1, resolve_key=True ), - AttributeFeatures( + FlatteningRules( name="aggregated_data", ref_to_var=1, resolve_value=True ), + # ------------------------------------------------------------ # Cell geometries - AttributeFeatures(name="geometry", ref_to_var=1, resolve_key=True), - AttributeFeatures( - name="interior_ring", ref_to_var=1, resolve_key=True - ), - AttributeFeatures( + # ------------------------------------------------------------ + FlatteningRules(name="geometry", ref_to_var=1, resolve_key=True), + FlatteningRules(name="interior_ring", ref_to_var=1, resolve_key=True), + FlatteningRules( name="node_coordinates", ref_to_var=1, resolve_key=True ), - AttributeFeatures(name="node_count", ref_to_var=1, resolve_key=True), - AttributeFeatures(name="nodes", ref_to_var=1, resolve_key=True), - AttributeFeatures( + FlatteningRules(name="node_count", ref_to_var=1, resolve_key=True), + FlatteningRules(name="nodes", ref_to_var=1, resolve_key=True), + FlatteningRules( name="part_node_count", ref_to_var=1, resolve_key=True ), + # ------------------------------------------------------------ # UGRID variables - AttributeFeatures(name="mesh", ref_to_var=1, resolve_key=True), - AttributeFeatures( + # ------------------------------------------------------------ + FlatteningRules(name="mesh", ref_to_var=1, resolve_key=True), + FlatteningRules( name="edge_coordinates", ref_to_var=1, resolve_key=True ), - AttributeFeatures( + FlatteningRules( name="face_coordinates", ref_to_var=1, resolve_key=True ), - AttributeFeatures( + FlatteningRules( name="edge_node_connectivity", ref_to_var=1, resolve_key=True ), - AttributeFeatures( + FlatteningRules( name="face_node_connectivity", ref_to_var=1, resolve_key=True ), - AttributeFeatures( + FlatteningRules( name="face_face_connectivity", ref_to_var=1, resolve_key=True ), - AttributeFeatures( + FlatteningRules( name="edge_face_connectivity", ref_to_var=1, resolve_key=True ), - AttributeFeatures( + FlatteningRules( 
name="face_edge_connectivity", ref_to_var=1, resolve_key=True ), - AttributeFeatures( - name="edge_dimension", ref_to_dim=1, resolve_key=True - ), - AttributeFeatures( - name="face_dimension", ref_to_dim=1, resolve_key=True - ), + FlatteningRules(name="edge_dimension", ref_to_dim=1, resolve_key=True), + FlatteningRules(name="face_dimension", ref_to_dim=1, resolve_key=True), + # ------------------------------------------------------------ # Compression by coordinate subsampling - AttributeFeatures( + # ------------------------------------------------------------ + FlatteningRules( name="coordinate_interpolation", ref_to_var=1, resolve_key=True, resolve_value=True, ), - AttributeFeatures( + FlatteningRules( name="tie_point_mapping", ref_to_dim=2, ref_to_var=1, resolve_key=True, resolve_value=True, ), - AttributeFeatures( + FlatteningRules( name="interpolation_parameters", ref_to_var=1, resolve_value=True ), ) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 584568535..fb2e50600 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -1,8 +1,15 @@ -"""Netcdf flattener. +"""Flatten NetCDF groups. -This code has been adapted from the code found in the `netcdf_flattener` -package, which is licensed with Apache License 2.0 -(http://www.apache.org/licenses/LICENSE-2.0) +Portions of this code were adapted from the `netcdf_flattener` +library, which carries the Apache 2.0 License: + +Copyright (c) 2020 EUMETSAT + +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. The ASF licenses this file to you +under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy +of the License at http://www.apache.org/licenses/LICENSE-2.0. """ @@ -12,12 +19,12 @@ import warnings from .config import ( - attribute_features, default_copy_slice_size, flattener_attribute_map, flattener_dimension_map, flattener_separator, flattener_variable_map, + flattening_rules, group_separator, max_name_len, ref_not_found_error, @@ -32,11 +39,13 @@ None: "native", } -special_attributes = set(attribute_features) +# Set of netCDF attributes that may contain references to dimensions +# or variables +special_attributes = set(flattening_rules) def netcdf_flatten( - input_ds, output_ds, lax_mode=False, _copy_data=True, copy_slices=None + input_ds, output_ds, lax_mode=False, copy_data=True, copy_slices=None ): """Create a flattened version of a netCDF dataset. @@ -44,34 +53,67 @@ def netcdf_flatten( "copy_slices" input allows to copy some or all of the variables in slices. - :param input_ds: input netcdf4 dataset - :param output_ds: output netcdf4 dataset - :param lax_mode: if false (default), not resolving a reference halts the execution. If true, continue with warning. - :param _copy_data: if true (default), then all data arrays are copied from the input to the output dataset. - If false, then this does not happen. - Use this option *only* if the data arrays of the flattened dataset are never to be accessed. - If false then consider setting the fill mode for the output netcd4 dataset to "off" for improved performance. 
- :param copy_slices: dictionary containing variable_name: shape pairs, where variable_name is the path to the - variable name in the original Dataset (for instance /group1/group2/my_variable), and shape is either None for - using default slice value, or a custom slicing shap in the form of a tuple of the same dimension as the variable - (for instance (1000,2000,1500,) for a 3-dimensional variable). If a variable from the Dataset is not contained - in the dict, it will not be sliced and copied normally. + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + input_ds: `netCDF4.Dataset` or `h5netcdf.File` + The dataset to be falttened. + + output_ds: `netCDF4.Dataset` + A container for the flattened dataset. + + lax_mode: `bool`, optional + If False, the default, the not resolving a reference + halts the execution. If True, then continue with a + warning. + + copy_data: `bool`, optional + If True, the default, then all data arrays are copied + from the input to the output dataset. If False, then + this does not happen. Use this option only if the data + arrays of the flattened dataset are never to be + accessed. + + copy_slices: `dict`, optional + Dictionary containing variable_name/shape key/value + pairs, where variable_name is the path to the variable + name in the original dataset (for instance + ``/group1/group2/my_variable``), and shape is either + `None` for using default slice value, or a custom + slicing shape in the form of a tuple of the same + dimension as the variable (for instance ``(1000, 2000, + 1500)`` for a 3-dimensional variable). If a variable + from the dataset is not contained in the dictionary + then it will not be sliced and copied normally. """ _Flattener( - input_ds, lax_mode, _copy_data=_copy_data, copy_slices=copy_slices + input_ds, lax_mode, copy_data=copy_data, copy_slices=copy_slices ).flatten(output_ds) -def parse_var_attr(input_str): +def parse_var_attr(attribute): """Parse variable attribute of any form into a dict: - * 'time' -> OrderedDict([('time', [])]) - * 'lat lon' -> OrderedDict([('lat', []), ('lon', [])]) - * 'area: time volume: lat lon' -> OrderedDict([('area', ['time']), ('volume', ['lat', 'lon'])]) + * 'time' -> {'time': []} + + * 'lat lon' -> {'lat': [], 'lon': []} + + * 'area: time volume: lat lon' -> {'area': ['time'], 'volume': + ['lat', 'lon']} + + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + attribute: `str` + The attribute value to parse. - :param input_str: string to parse - :return: parsed string in an OrderedDict + :Returns: + + `dict` + The parsed string. """ @@ -100,7 +142,7 @@ def subst(s): ) ) - m = re.match(pat_all, input_str) + m = re.match(pat_all, attribute) # Output is always a dict. If input form is a list, dict values # are set as empty lists @@ -128,7 +170,7 @@ def subst(s): out[term] = values else: raise ReferenceException( - f"Error while parsing attribute value: {input_str!r}" + f"Error while parsing attribute value: {attribute!r}" ) return out @@ -137,8 +179,17 @@ def subst(s): def generate_var_attr_str(d): """Re-generate the attribute string from a dictionary. - :param d: dictionary - :return: valid attribute string + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + d: `dict` + A resolved and parsed attribute. + + :Returns: + + `str` + The flattened attribute value. 
""" parsed_list = [] @@ -148,7 +199,6 @@ def generate_var_attr_str(d): elif not v: parsed_list.append(f"{k}:") else: - # parsed_list.append(k + ": " + (" ".join(v))) parsed_list.append(f"{k}: {' '.join(v)}") return " ".join(parsed_list) @@ -160,21 +210,41 @@ class _Flattener: Contains the input file, the output file being flattened, and all the logic of the flattening process. + .. versionadded:: (cfdm) 1.11.1.0 + """ - def __init__(self, input_ds, lax_mode, _copy_data=True, copy_slices=None): + def __init__(self, input_ds, lax_mode, copy_data=True, copy_slices=None): """**Initialisation** - :param input_ds: input netcdf dataset - :param lax_mode: if false (default), not resolving a reference halts the execution. If true, continue with warning. - :param _copy_data: if true (default), then all data arrays are copied from the input to the output dataset - If false, then this does not happen. - Use this option *only* if the data arrays of the flattened dataset are never to be accessed. - :param copy_slices: dictionary containing variable_name: shape pairs, where variable_name is the path to the - variable name in the original Dataset (for instance /group1/group2/my_variable), and shape is either None - for using default slice value, or a custom slicing shape in the form of a tuple of the same dimension as the - variable (for instance (1000,2000,1500,) for a 3-dimensional variable). If a variable from the Dataset is - not contained in the dict, it will not be sliced and copied normally. + :Parameters: + + input_ds: `netCDF4.Dataset` or `h5netcdf.File` + The dataset to be falttened. + + lax_mode: `bool`, optional + If False, the default, the not resolving a reference + halts the execution. If True, then continue with a + warning. + + copy_data: `bool`, optional + If True, the default, then all data arrays are copied + from the input to the output dataset. If False, then + this does not happen. Use this option only if the data + arrays of the flattened dataset are never to be + accessed. + + copy_slices: `dict`, optional + Dictionary containing variable_name/shape key/value + pairs, where variable_name is the path to the variable + name in the original dataset (for instance + ``/group1/group2/my_variable``), and shape is either + `None` for using default slice value, or a custom + slicing shape in the form of a tuple of the same + dimension as the variable (for instance ``(1000, 2000, + 1500)`` for a 3-dimensional variable). If a variable + from the dataset is not contained in the dictionary + then it will not be sliced and copied normally. """ self.__attr_map_value = [] @@ -186,16 +256,17 @@ def __init__(self, input_ds, lax_mode, _copy_data=True, copy_slices=None): self.__lax_mode = lax_mode - self.__copy_data = _copy_data + self.__copy_data = copy_data self.__copy_slices = copy_slices self.__input_file = input_ds self.__output_file = None def attrs(self, variable): + """TODOHDF.""" try: # h5netcdf - return variable.attrs + return dict(variable.attrs) except AttributeError: # netCDF4 return { @@ -205,7 +276,7 @@ def attrs(self, variable): def chunksizes(self, variable): """Return the variable chunk sizes. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -241,7 +312,7 @@ def chunksizes(self, variable): def contiguous(self, variable): """Whether or not the variable data is contiguous on disk. - .. versionadded:: (cfdm) HDFVER + .. 
versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -270,7 +341,7 @@ def contiguous(self, variable): def dtype(self, variable): """Return the data type of a variable. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -300,7 +371,7 @@ def dtype(self, variable): def endian(self, variable): """Return the endian-ness of a variable. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -330,7 +401,7 @@ def endian(self, variable): def filepath(self, dataset): """Return the file path for the dataset. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -340,8 +411,8 @@ def filepath(self, dataset): :Returns: `str` - The file system path, or the opendap URL, for the - dataset. + The file system path, or the opendap URL, for the + dataset. **Examples** @@ -359,7 +430,7 @@ def filepath(self, dataset): def get_dims(self, variable): """Return. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Returns: @@ -393,7 +464,7 @@ def get_dims(self, variable): def getncattr(self, x, attr): """Retrieve a netCDF attribute. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -414,7 +485,7 @@ def getncattr(self, x, attr): def group(self, x): """Return a. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Returns: @@ -431,7 +502,7 @@ def group(self, x): def name(self, x): """Return the netCDF name, without its groups. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Returns: @@ -448,7 +519,7 @@ def name(self, x): def ncattrs(self, x): """Return netCDF attribute names. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -469,7 +540,7 @@ def ncattrs(self, x): def parent(self, group): """Return a simulated unix directory path to a group. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Returns: @@ -484,7 +555,7 @@ def parent(self, group): def path(self, group): """Return a simulated unix directory path to a group. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Returns: @@ -504,7 +575,16 @@ def path(self, group): def flatten(self, output_ds): """Flattens and write to output file. - :param output_ds: The dataset in which to store the flattened result. + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + output_ds: `netCDF4.Dataset` + A container for the flattened dataset. + + :Return: + + `None` """ logging.info( @@ -548,7 +628,16 @@ def flatten(self, output_ds): def process_group(self, input_group): """Flattens a given group to the output file. - :param input_group: group to flatten + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + input_group: `str` + The group to faltten. + + :Returns: + + `None` """ logging.info(f" Browsing group {self.path(input_group)}") @@ -568,8 +657,19 @@ def process_group(self, input_group): def flatten_attribute(self, input_group, attr_name): """Flattens a given attribute from a group to the output file. - :param input_group: group containing the attribute to flatten - :param attr_name: name of the attribute to flatten + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + input_group: `str` + The group containing the attribute to flatten. + + attr_name: `str` + The anme of the attribute. + + :Returns: + + `None` """ logging.info( @@ -593,7 +693,16 @@ def flatten_attribute(self, input_group, attr_name): def flatten_dimension(self, dim): """Flattens a given dimension to the output file. 
- :param dim: dimension to flatten + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + dim: `netCDF4.Dimension` or `h5netcdf.Dimension` + The dimension to flatten. + + :Returns: + + `None` """ logging.info( @@ -626,7 +735,16 @@ def flatten_dimension(self, dim): def flatten_variable(self, var): """Flattens a given variable to the output file. - :param var: variable to flatten + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + var: `netCDF4.Variable` or `h5netcdf.Variable` + The variable to flatten. + + :Returns: + + `None` """ logging.info( @@ -653,6 +771,14 @@ def flatten_variable(self, var): fullname = self.pathname(self.group(var), self.name(var)) logging.info(f" Creating variable {new_name} from {fullname}") + attributes = self.attrs(var) + + copy_data = self.__copy_data + if copy_data: + fill_value = attributes.pop("_FillValue", None) + else: + fill_value = False + new_var = self.__output_file.createVariable( new_name, self.dtype(var), @@ -665,18 +791,16 @@ def flatten_variable(self, var): chunksizes=self.chunksizes(var), endian=self.endian(var), least_significant_digit=None, - fill_value=None, + fill_value=fill_value, ) - if self.__copy_data: + if copy_data: # Find out slice method for variable and copy data - if ( - self.__copy_slices is None - or fullname not in self.__copy_slices - ): + copy_slices = self.__copy_slices + if copy_slices is None or fullname not in copy_slices: # Copy data as a whole - new_var[:] = var[:] - elif self.__copy_slices[fullname] is None: + new_var[...] = var[...] + elif copy_slices[fullname] is None: # Copy with default slice size copy_slice = tuple( default_copy_slice_size // len(var.shape) @@ -685,11 +809,11 @@ def flatten_variable(self, var): self.copy_var_by_slices(new_var, var, copy_slice) else: # Copy in slices - copy_slice = self.__copy_slices[fullname] + copy_slice = copy_slices[fullname] self.copy_var_by_slices(new_var, var, copy_slice) # Copy attributes - new_var.setncatts(self.attrs(var)) + new_var.setncatts(attributes) # Store new name in dict for resolving references later self.__var_map[ @@ -715,11 +839,28 @@ def increment_pos(self, pos, dim, copy_slice_shape, var_shape): dimension is reached, recursively increment the next dimensions until a valid position is found. - :param pos: current position - :param dim: dimension to be incremented - :param copy_slice_shape: shape of the slice - :param var_shape: shape of the variable - :return True if a valid position is found within the variable, False otherwise + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + pos: `list` + The current slice position along each dimension of the + array. + + dim: `int` + The position of the array dimension to be incremented. + + copy_slice_shape: `list` + The shape of the copy slice. + + var_shape: `tuple` + The shape of the whole variable. + + :Returns: + + `bool` + `True` if a valid position is found within the + variable, `False` otherwise. """ # Try to increment dimension @@ -749,9 +890,22 @@ def increment_pos(self, pos, dim, copy_slice_shape, var_shape): def copy_var_by_slices(self, new_var, old_var, copy_slice_shape): """Copy the data of a variable to a new one by slice. - :param new_var: new variable where to copy data - :param old_var: variable where data should be copied from - :param copy_slice_shape: shape of the slice + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + new_var: `netCDF4.Variable` + The new variable where to copy dataf. 
+ + old_var: `netCDF4.Variable` or `h5netcdf.Variable` + The variable where data should be copied from. + + copy_slice_shape: `tuple` + The shape of the slice + + :Returns: + + `None` """ logging.info( @@ -781,16 +935,29 @@ def copy_var_by_slices(self, new_var, old_var, copy_slice_shape): pos, 0, copy_slice_shape, old_var.shape ) - def resolve_reference(self, orig_ref, orig_var, attr): + def resolve_reference(self, orig_ref, orig_var, rules): """Resolve a refrence. Resolves the absolute path to a coordinate variable within the group structure. - :param orig_ref: reference to resolve - :param orig_var: variable originally containing the reference - :param attr: AttributeFeatures object item to know if ref to dim or var - :return: absolute path to the reference + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + orig_ref: `str` + The reference to resolve. + + orig_var: `netCDF4.Variable` or `h5netcdf.Variable` + The original variable containing the reference. + + rules: `FlatteningRules` + The flattening rules that apply to the reference. + + :Returns: + + `str` + The absolute path to the reference. """ ref = orig_ref @@ -798,10 +965,10 @@ def resolve_reference(self, orig_ref, orig_var, attr): ref_type = "" # Resolve first as dim (True), or var (False) - resolve_dim_or_var = attr.ref_to_dim > attr.ref_to_var + resolve_dim_or_var = rules.ref_to_dim > rules.ref_to_var # Resolve var (resp. dim) if resolving as dim (resp. var) failed - resolve_alt = attr.ref_to_dim and attr.ref_to_var + resolve_alt = rules.ref_to_dim and rules.ref_to_var # Reference is already given by absolute path if ref.startswith(group_separator): @@ -837,18 +1004,57 @@ def resolve_reference(self, orig_ref, orig_var, attr): else: method = "Proximity" absolute_ref, ref_type = self.resolve_reference_proximity( - ref, resolve_dim_or_var, resolve_alt, orig_var, attr + ref, + resolve_dim_or_var, + resolve_alt, + orig_var, + rules, ) # Post-search checks and return result return self.resolve_reference_post_processing( - absolute_ref, orig_ref, orig_var, attr, ref_type, method + absolute_ref, + orig_ref, + orig_var, + rules, + ref_type, + method, ) def resolve_reference_proximity( - self, ref, resolve_dim_or_var, resolve_alt, orig_var, attr + self, ref, resolve_dim_or_var, resolve_alt, orig_var, rules ): - """Resolve reference: search by proximity.""" + """Resolve reference: search by proximity. + + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + ref: `str` + The reference to resolve. + + resolve_dim_or_var: `bool` + Try to resolve first as dimension (True), or else as + variable (False). + + resolve_alt: `bool` + Resolve as variable if resolving as dimension failed, + and vice versa. + + orig_var: `netCDF4.Variable` or `h5netcdf.Variable` + The original variable containing the reference. + + rules: `FlatteningRules` + The flattening rules that apply to the reference. + + :Returns: + + (`str` or `None, str) + The resolved reference (or `None` if unresolved), and + the type of reference (either ``'dimension'`` or + ``'variable'``). 
+ + """ # First tentative as dim OR var if resolve_dim_or_var: ref_type = "dimension" @@ -860,7 +1066,7 @@ def resolve_reference_proximity( self.group(orig_var), resolve_dim_or_var, False, - attr.stop_at_local_apex, + rules.stop_at_local_apex, ) # If failed and alternative possible, second tentative @@ -875,7 +1081,7 @@ def resolve_reference_proximity( self.group(orig_var), not resolve_dim_or_var, False, - attr.stop_at_local_apex, + rules.stop_at_local_apex, ) # If found, create ref string @@ -890,11 +1096,42 @@ def resolve_reference_proximity( return None, "" def resolve_reference_post_processing( - self, absolute_ref, orig_ref, orig_var, attr, ref_type, method + self, absolute_ref, orig_ref, orig_var, rules, ref_type, method ): - """Post-processing operations after resolving reference.""" + """Post-processing operations after resolving reference. + + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + absolute_ref: `str` + The absolute path of the reference. + + orig_ref: `str` + The original reference. + + orig_var: `netCDF4.Variable` or `h5netcdf.Variable` + The original variable containing the reference. + + rules: `FlatteningRules` + The flattening rules that apply to the reference. + + ref_type: `str` + the type of reference (either ``'dimension'`` or + ``'variable'``). + + method: `str` + The method of reference resolution (either + ``'proximity'`` or ``'absolute'``). + + :Returns: + + `str` + The absolute reference. + + """ # If not found and accept standard name, assume standard name - if absolute_ref is None and attr.accept_standard_names: + if absolute_ref is None and rules.accept_standard_names: logging.info( f" Reference to {orig_ref!r} not " "resolved. Assumed to be a standard name." @@ -917,7 +1154,7 @@ def resolve_reference_post_processing( # additional check if ( ref_type == "variable" - and attr.limit_to_scalar_coordinates + and rules.limit_to_scalar_coordinates and ( ( "coordinates" not in self.ncattrs(orig_var) @@ -941,10 +1178,24 @@ def search_by_relative_path(self, ref, current_group, search_dim): Resolves the absolute path to a reference within the group structure, using search by relative path. - :param ref: reference to resolve - :param current_group: current group where searching - :param search_dim: if true, search references to dimensions, if false, search references to variables - :return: absolute path to the coordinate + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + ref: `str` + The reference to resolve. + + current_group: `str` + The current group of the reference. + + search_dim: `bool` + If True then search for a dimension, otherwise a + variable. + + :Returns: + + `str` + The absolute path to the variable. """ # Go up parent groups @@ -989,12 +1240,31 @@ def search_by_proximity( group is reached. If coordinate variable, search until local apex is reached, Then search down in siblings. - :param ref: reference to resolve - :param current_group: current group where searching - :param search_dim: if true, search references to dimensions, if false, search references to variables - :param local_apex_reached: False initially, until apex is reached. - :param is_coordinate_variable: true, if looking for a coordinate variable - :return: absolute path to the coordinate + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + ref: `str` + The reference to resolve. + + current_group: + The current group where searching. + + search_dim: `bool` + If True then search for a dimension, otherwise a + variable. 
+ + local_apex_reached: `bool` + Whether or not the apex is previously been reached. + + is_coordinate_variable: `bool` + Whether the search is for a coordiante variable. + + :Returns: + + `str` or `None` + The absolute path to the variable, if found, otherwise + `None`. """ if search_dim: @@ -1049,32 +1319,26 @@ def search_by_proximity( # Did not find return None - # def __escape_index_error(self, match, group_name): - # """ - # - # :param match: regex match - # :param group_name: group name - # - # :Returns: - # - # `str` - # The group in a match if it exists, an empty string - # otherwise. - # - # """ - # try: - # return match.group(group_name) - # except IndexError: - # return "" - def resolve_references(self, var, old_var): """Resolve references. In a given variable, replace all references to other variables in its attributes by absolute references. - :param var: flattened variable in which references should be renamed with absolute references - :param old_var: original variable (in group structure) + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + var: `netCDF4.Variable` or `h5netcdf.Variable` + The flattened variable in which references should be + renamed with absolute references. + + old_var: `netCDF4.Variable` or `h5netcdf.Variable` + The original variable (in group structure). + + :Returns: + + `None` """ var_attrs = self.attrs(var) @@ -1086,20 +1350,20 @@ def resolve_references(self, var, old_var): # properties resolved_parsed_attr = {} - attr = attribute_features.get(name) + rules = flattening_rules.get(name) for k, v in parsed_attr.items(): - if attr.resolve_key: - k = self.resolve_reference(k, old_var, attr) + if rules.resolve_key: + k = self.resolve_reference(k, old_var, rules) - if attr.resolve_value and v is not None: - v = [self.resolve_reference(x, old_var, attr) for x in v] + if rules.resolve_value and v is not None: + v = [self.resolve_reference(x, old_var, rules) for x in v] resolved_parsed_attr[k] = v # Re-generate attribute value string with resolved # references var.setncattr( - attr.name, generate_var_attr_str(resolved_parsed_attr) + rules.name, generate_var_attr_str(resolved_parsed_attr) ) def adapt_references(self, var): @@ -1110,7 +1374,17 @@ def adapt_references(self, var): netCDF. All references have to be already resolved as absolute references. - :param var: flattened variable in which references should be renamed with new names + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + var: `netCDF4.Variable` or `h5netcdf.Variable` + The flattened variable in which references should be + renamed with new names. 
+ + :Returns: + + `None` """ var_attrs = self.attrs(var) @@ -1121,33 +1395,42 @@ def adapt_references(self, var): adapted_parsed_attr = {} - attr = attribute_features.get(name) + rules = flattening_rules.get(name) for k, v in parsed_attr.items(): - if attr.resolve_key: - k = self.adapt_name(k, attr) + if rules.resolve_key: + k = self.adapt_name(k, rules) - if attr.resolve_value and v is not None: - v = [self.adapt_name(x, attr) for x in v] + if rules.resolve_value and v is not None: + v = [self.adapt_name(x, rules) for x in v] adapted_parsed_attr[k] = v new_attr_value = generate_var_attr_str(adapted_parsed_attr) - var.setncattr(attr.name, new_attr_value) + var.setncattr(rules.name, new_attr_value) logging.info( - f" Value of {self.name(var)}.{attr.name} changed from " - f"{attr_value!r} to {new_attr_value!r}" + f" Value of {self.name(var)}.{rules.name} changed " + f"from {attr_value!r} to {new_attr_value!r}" ) - def adapt_name(self, resolved_ref, attr): + def adapt_name(self, resolved_ref, rules): """Apapt the name. Return name of flattened reference. If not found, raise exception or continue warning. - :param resolved_ref: resolved reference to adapt - :param attr: AttributeFeatures object item to know in which dict to look for name mapping - :return: adapted reference + .. versionadded:: (cfdm) 1.11.1.0 + + resolved_ref: `str` + The resolved reference. + + rules: `FlatteningRules` + The flattening rules that apply to the reference. + + :Returns: + + `str` + The adapted reference. """ # If ref contains Error message, leave as such @@ -1155,10 +1438,10 @@ def adapt_name(self, resolved_ref, attr): return resolved_ref # Select highest priority map - if attr.ref_to_dim > attr.ref_to_var: + if rules.ref_to_dim > rules.ref_to_var: name_mapping = self.__dim_map - if attr.ref_to_dim < attr.ref_to_var: + if rules.ref_to_dim < rules.ref_to_var: name_mapping = self.__var_map # Try to find mapping @@ -1167,19 +1450,19 @@ def adapt_name(self, resolved_ref, attr): # If not found, look in other map if allowed except KeyError: - if attr.ref_to_dim and attr.ref_to_var: - name_mapping = ( - self.__dim_map - if attr.ref_to_dim < attr.ref_to_var - else self.__var_map - ) + if rules.ref_to_dim and rules.ref_to_var: + if rules.ref_to_dim < rules.ref_to_var: + name_mapping = self.__dim_map + else: + name_mapping = self.__var_map + try: return name_mapping[resolved_ref] except KeyError: pass # If still not found, check if any standard name is allowed - if attr.accept_standard_names: + if rules.accept_standard_names: return resolved_ref else: @@ -1189,47 +1472,73 @@ def adapt_name(self, resolved_ref, attr): def pathname(self, group, name): """Compose full path name to an element in a group structure: - /path/to/group/elt. + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + current_group: + The group containing the dimension or variable. - :param group: group containing element - :param name: name of the element - :return: pathname + name: `str` + The name of the dimension or variable. + + :Returns: + + `str` + The absolute path to the dimension or variable """ if self.parent(group) is None: return group_separator + name - # return pathname_format.format(self.path(group), name) - # return group_separator.join((self.path(group), name)) - # return f"{self.path(group)}{group_separator}{name}") - # pathname_format.format(self.path(group), name) - def generate_mapping_str(self, input_group, name, new_name): """Generate string mapping. 
Generates a string representing the name mapping of an element before and after flattening. - :param input_group: group containing the non-flattened element - :param name: name of the non-flattened element - :param new_name: name of the flattened element - :return: string representing the name mapping for the element + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + input_group: + The group containing the non-flattened dimension or + variable. + + name: `str` + The name of the non-flattened dimension or variable. + + new_name: `str` + The name of the flattened dimension or variable. + + :Returns: + + `str` + A string representing the name mapping for the + dimension or variable. """ original_pathname = self.pathname(input_group, name) - mapping_str = f"{new_name}: {original_pathname}" - # mapping_str_format.format( - # new_name, original_pathname - # ) - return mapping_str + return f"{new_name}: {original_pathname}" def convert_path_to_valid_name(self, pathname): """Generate valid name from path. - :param pathname: pathname - :return: valid NetCDF name + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + pathname: `str` + The non-flattened namepath to a dimension or variable. + + new_name: `str` + A flattened version of *pathname*. + :Returns: + + `str` + The valid netCDF name. """ return pathname.replace(group_separator, "", 1).replace( @@ -1237,16 +1546,32 @@ def convert_path_to_valid_name(self, pathname): ) def generate_flattened_name(self, input_group, orig_name): - """Convert full path of an element to a valid NetCDF name: + """Convert full path of an element to a valid NetCDF name. + + * The name of an element is the concatenation of its + containing group and its name; - - the name of an element is the concatenation of its containing group and its name, - - replaces / from paths (forbidden as NetCDF name), - - if name is longer than 255 characters, replace path to group by hash, - - if name is still too long, replace complete name by hash. + * replaces ``/`` from paths (forbidden as NetCDF name); - :param input_group: group containing element - :param orig_name: original name of the element - :return: new valid name of the element + * if name is longer than 255 characters, replace path to group + by hash; + + * if name is still too long, replace complete name by hash. + + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + input_group: + The group containing the dimension or variable. + + orig_name: `str` + The original name of the dimension or variable. + + :Returns: + + `str` + The new valid name of the dimension or variable. """ # If element is at root: no change @@ -1284,9 +1609,21 @@ def handle_reference_error(self, ref, context=None): Depending on lax/strict mode, either raise exception or log warning. If lax, return reference placeholder. - :param ref: reference - :param context: additional context info to add to message - :return: if continue with warning, error replacement name for reference + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + ref: `str` + The reference + + context: `str` + Additional context information to add to message. + + :Returns: + + `str` + The error message, or if *lax_mode* is True then a + `ReferenceException` is raised. """ message = f"Reference {ref!r} could not be resolved" @@ -1301,6 +1638,10 @@ def handle_reference_error(self, ref, context=None): class ReferenceException(Exception): - """Exception for unresolvable references in attributes.""" + """Exception for unresolvable references in attributes. + + .. 
versionadded:: (cfdm) 1.11.1.0 + + """ pass diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 42b03f934..df0d7606b 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -592,7 +592,7 @@ def file_open(self, filename, flatten=True, verbose=None): flat_nc.set_fill_off() # Flatten the file - netcdf_flatten(nc, flat_nc, lax_mode=True, _copy_data=False) + netcdf_flatten(nc, flat_nc, lax_mode=True, copy_data=False) # Store the original grouped file. This is primarily # because the unlimited dimensions in the flattened @@ -616,7 +616,7 @@ def file_open(self, filename, flatten=True, verbose=None): def _open_netCDF4(self, filename): """Return an open `netCDF4.Dataset`. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -633,7 +633,7 @@ def _open_netCDF4(self, filename): def _open_h5netcdf(self, filename): """Return an open `h5netcdf.File`. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -941,7 +941,7 @@ def read( unpack: `bool`, optional See `cfdm.read` for details - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 warn_valid: `bool`, optional See `cfdm.read` for details @@ -956,17 +956,17 @@ def read( storage_options: `bool`, optional See `cfdm.read` for details - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 netCDF_backend: `None` or `str`, optional See `cfdm.read` for details - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 _file_systems: `dict`, optional Provide any already-open S3 file systems. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Returns: @@ -1094,7 +1094,7 @@ def read( g["version"][version] = Version(version) if storage_options is None: - storage_options = {"anon": True} + g["storage_options"] = {"anon": True} if _file_systems is not None: # Update S3 file systems with those passed in as keyword @@ -1331,8 +1331,7 @@ def read( group_attr = x[-1] flattener_attributes.setdefault(tuple(groups), {})[ group_attr - ] = self._file_global_attribute(flat_attr) - # ] = nc.getncattr(flat_attr) + ] = self._file_global_attribute(nc, flat_attr) # Remove flattener attributes from the global attributes for attr in ( @@ -1347,7 +1346,6 @@ def read( groups = () group_attributes = {} - # variable = nc.variables[ncvar] variable = self._file_variable(nc, ncvar) # -------------------------------------------------------- @@ -1390,8 +1388,8 @@ def read( flattener_attributes[hierarchy] ) else: - # Remove the leading / from the absolute netCDF - # variable path + # Remove the leading / (slash) from the absolute + # netCDF variable path ncvar = ncvar[1:] flattener_variables[ncvar] = ncvar @@ -1407,9 +1405,7 @@ def read( value = value.decode(errors="ignore") variable_attributes[ncvar][attr] = value - # print (attr, value, type(value)) - # variable_dimensions[ncvar] = tuple(variable.dimensions) variable_dimensions[ncvar] = tuple( self._file_variable_dimensions(variable) ) @@ -1421,9 +1417,8 @@ def read( variable_groups[ncvar] = groups variable_group_attributes[ncvar] = group_attributes - # Populate dimensions_groups abd dimension_basename + # Populate dimensions_groups and dimension_basename # dictionaries - # for ncdim in nc.dimensions: for ncdim in self._file_dimensions(nc): ncdim_org = ncdim ncdim_basename = ncdim @@ -1449,9 +1444,6 @@ def read( dimension_groups[ncdim] = groups dimension_basename[ncdim] = ncdim_basename - # dimension_isunlimited[ncdim] = 
nc.dimensions[ - # ncdim_org - # ].isunlimited() dimension_isunlimited[ncdim] = self._file_dimension_isunlimited( nc, ncdim_org ) @@ -6119,18 +6111,18 @@ def _create_netcdfarray( return_kwargs_only: `bool`, optional Only return the kwargs dictionary, without - instantiating a new `NetCDFArray`. + instantiating a new `NetCDF4Array` or `H5netcdfArray`. .. versionadded:: (cfdm) 1.10.0.1 :Returns: - (`NetCDFArray`, `dict`) or (`None`, `dict`) or `dict` - The new `NetCDFArray` instance and a dictionary of the - kwargs used to create it. If the array could not be - created then `None` is returned in its place. If - *return_kwargs_only* then only the dictionary is - returned. + (array, `dict`) or (`None`, `dict`) or `dict` + The new `NetCDF4Array` or `H5netcdfArray` instance and + a dictionary of the kwargs used to create it. If the + array could not be created then `None` is returned in + its place. If *return_kwargs_only* then only the + dictionary is returned. """ g = self.read_vars @@ -7310,7 +7302,7 @@ def _create_gathered_array( :Parameters: - gathered_array: `NetCDFArray` + gathered_array: `NetCDF4Array` or `H5netcdfArray` compressed_dimensions: sequence of `int` The position of the compressed dimension in the @@ -10046,10 +10038,35 @@ def _ugrid_check_connectivity_variable( return ok + def _file_global_attribute(self, nc, attr): + """Return a global attribute from a dataset. + + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + nc: `netCDF4.Dataset` or `h5netcdf.File` + The dataset. + + attr: `str` + The global attribute name. + + :Returns: + + The global attribute value + + """ + try: + # netCDF4 + return nc.getncattr(attr) + except AttributeError: + # h5netcdf + return nc.attrs[attr] + def _file_global_attributes(self, nc): """Return the global attributes from a dataset. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -10073,7 +10090,7 @@ def _file_global_attributes(self, nc): def _file_dimensions(self, nc): """Return all dimensions in the root group. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Returns: @@ -10086,7 +10103,7 @@ def _file_dimensions(self, nc): def _file_dimension(self, nc, dim_name): """Return a dimension from the root group of a dataset. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -10107,7 +10124,7 @@ def _file_dimension(self, nc, dim_name): def _file_dimension_isunlimited(self, nc, dim_name): """Return a whether a dimension is unlimited. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -10128,7 +10145,7 @@ def _file_dimension_isunlimited(self, nc, dim_name): def _file_dimension_size(self, nc, dim_name): """Return a dimension is size. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -10149,7 +10166,7 @@ def _file_dimension_size(self, nc, dim_name): def _file_variables(self, nc): """Return all variables in the root group. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -10167,7 +10184,7 @@ def _file_variables(self, nc): def _file_variable(self, nc, var_name): """Return a variable. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -10188,7 +10205,7 @@ def _file_variable(self, nc, var_name): def _file_variable_attributes(self, var): """Return the variable attribute names. - .. versionadded:: (cfdm) HDFVER + .. 
versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -10212,7 +10229,7 @@ def _file_variable_attributes(self, var): def _file_variable_dimensions(self, var): """Return the variable dimension names. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -10230,7 +10247,7 @@ def _file_variable_dimensions(self, var): def _file_variable_size(self, var): """Return the size of a variable's array. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 27649f310..54694821f 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -18,8 +18,8 @@ def read( mask=True, unpack=True, domain=False, - storage_options=None, netCDF_backend=None, + storage_options=None, _implementation=_implementation, ): """Read field or domain constructs from a dataset. @@ -251,7 +251,7 @@ def read( A netCDF array is unpacked depending on the values of the netCDF attributes ``add_offset`` and ``scale_factor``. - .. versionadded:: (cfdm) HDFVER + .. versionadded:: (cfdm) 1.11.1.0 domain: `bool`, optional If True then return only the domain constructs that are @@ -275,6 +275,16 @@ def read( .. versionadded:: (cfdm) 1.9.0.0 + netCDF_backend: `None` or `str`, optional + Specify which library to use for opening netCDF files. By + default, or if `None`, then `netCDF4` will used unless it + fails to open a given file, in which case `h5netcdf` will + be used instead. Setting *netCDF_backend* to ``'netCDF4'`` + or ``'h5netcdf'`` will force the use of the `netCDF4` or + `h5netcdf` libraries respectively. + + .. versionadded:: (cfdm) 1.11.1.0 + storage_options: `dict` or `None`, optional Key/value pairs to be passed on to the creation of `s3fs.S3FileSystem` file systems to control the opening of @@ -302,17 +312,7 @@ def read( 'https://s3.fr-par.scw.cloud', 'client_kwargs': {'region_name': 'fr-par'}}`` - .. versionadded:: (cfdm) HDFVER - - netCDF_backend: `None` or `str`, optional - Specify which library to use for opening netCDF files. By - default, or if `None`, then `netCDF4` will used unless it - fails to open a given file, in which case `h5netcdf` will - be used instead. Setting *netCDF_backend* to ``'netCDF4'`` - or ``'h5netcdf'`` will force the use of the `netCDF4` or - `h5netcdf` libraries respectively. - - .. versionadded:: (cfdm) HDFVER + .. 
versionadded:: (cfdm) 1.11.1.0 _implementation: (subclass of) `CFDMImplementation`, optional Define the CF data model implementation that provides the diff --git a/cfdm/test/test_NetCDF4Array.py b/cfdm/test/test_NetCDF4Array.py index 6008b546f..bc42e587f 100644 --- a/cfdm/test/test_NetCDF4Array.py +++ b/cfdm/test/test_NetCDF4Array.py @@ -73,31 +73,6 @@ def test_NetCDF4Array_get_filenames(self): a = cfdm.NetCDF4Array() self.assertEqual(a.get_filenames(), ()) - def test_NetCDF4Array_get_missing_values(self): - """Test NetCDF4Array.get_missing_values.""" - f = cfdm.example_field(0) - - f.set_property("missing_value", -999) - f.set_property("_FillValue", -3) - f.set_property("valid_min", -111) - cfdm.write(f, tmpfile) - - g = cfdm.read(tmpfile)[0] - self.assertEqual( - g.data.source().get_missing_values(), - { - "missing_value": -999.0, - "_FillValue": -3, - "valid_min": -111, - }, - ) - - c = g.coordinate("latitude") - self.assertEqual(c.data.source().get_missing_values(), {}) - - a = cfdm.NetCDF4Array("file.nc", "ncvar") - self.assertIsNone(a.get_missing_values(None)) - def test_NetCDF4Array_mask(self): """Test NetCDF4Array masking.""" f = cfdm.example_field(0) @@ -195,8 +170,12 @@ def test_NetCDF4Array_get_attributes(self): f = cfdm.example_field(0) cfdm.write(f, tmpfile) n = cfdm.NetCDF4Array(tmpfile, f.nc_get_variable(), shape=f.shape) - self.assertIsNone(n.get_attributes()) + self.assertIsNone(n.get_attributes(None)) + + with self.assertRaises(ValueError): + n.get_attributes() + # Set attributes via indexing _ = n[...] self.assertEqual( n.get_attributes(), diff --git a/cfdm/test/test_NetCDFIndexer.py b/cfdm/test/test_NetCDFIndexer.py index 7cc1172f0..ea44ecfbc 100644 --- a/cfdm/test/test_NetCDFIndexer.py +++ b/cfdm/test/test_NetCDFIndexer.py @@ -132,7 +132,7 @@ def test_NetCDFIndexer_numpy(self): self.assertTrue((x == array).all()) x = cfdm.NetCDFIndexer( - array.copy(), attrs={"_FillValue": 4, "missing_value": (0, 8)} + array.copy(), attributes={"_FillValue": 4, "missing_value": (0, 8)} ) x = x[...] array[[0, 4, 8]] = np.ma.masked diff --git a/docs/source/class.rst b/docs/source/class.rst index f08e8e6c9..1059dc244 100644 --- a/docs/source/class.rst +++ b/docs/source/class.rst @@ -74,9 +74,11 @@ Data classes :toctree: class/ cfdm.Data - cfdm.NetCDFArray + cfdm.NetCDF4Array + cfdm.H5netcdfArray cfdm.NumpyArray cfdm.Array + cfdm.NetCDFIndexer Data compression classes ------------------------ diff --git a/docs/source/class/cfdm.H5netcdfArray.rst b/docs/source/class/cfdm.H5netcdfArray.rst new file mode 100644 index 000000000..506fc3c3b --- /dev/null +++ b/docs/source/class/cfdm.H5netcdfArray.rst @@ -0,0 +1,123 @@ +.. currentmodule:: cfdm +.. default-role:: obj + +cfdm.H5netcdfArray +================== + +---- + +.. autoclass:: cfdm.H5netcdfArray + :no-members: + :no-inherited-members: + +Inspection +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.H5netcdfArray.get_compression_type + ~cfdm.H5netcdfArray.get_subspace + ~cfdm.H5netcdfArray.get_attributes + + +.. rubric:: Attributes + +.. autosummary:: + :nosignatures: + :toctree: ../attribute/ + :template: attribute.rst + + ~cfdm.H5netcdfArray.array + ~cfdm.H5netcdfArray.dtype + ~cfdm.H5netcdfArray.ndim + ~cfdm.H5netcdfArray.shape + ~cfdm.H5netcdfArray.size + +Units +----- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.H5netcdfArray.get_calendar + ~cfdm.H5netcdfArray.get_units + +File +---- + +.. 
rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.H5netcdfArray.get_address + ~cfdm.H5netcdfArray.get_addresses + ~cfdm.H5netcdfArray.close + ~cfdm.H5netcdfArray.open + ~cfdm.H5netcdfArray.get_filename + ~cfdm.H5netcdfArray.get_filenames + ~cfdm.H5netcdfArray.get_format + ~cfdm.H5netcdfArray.get_formats + ~cfdm.H5netcdfArray.get_groups + ~cfdm.H5netcdfArray.get_mask + ~cfdm.H5netcdfArray.get_unpack + ~cfdm.H5netcdfArray.get_storage_options + +Miscellaneous +------------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.H5netcdfArray.copy + ~cfdm.H5netcdfArray.to_memory + +Special +------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.H5netcdfArray.__getitem__ + +Docstring substitutions +----------------------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.H5netcdfArray._docstring_special_substitutions + ~cfdm.H5netcdfArray._docstring_substitutions + ~cfdm.H5netcdfArray._docstring_package_depth + ~cfdm.H5netcdfArray._docstring_method_exclusions + +Deprecated +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.H5netcdfArray.get_missing_values diff --git a/docs/source/class/cfdm.NetCDF4Array.rst b/docs/source/class/cfdm.NetCDF4Array.rst new file mode 100644 index 000000000..0b2e22668 --- /dev/null +++ b/docs/source/class/cfdm.NetCDF4Array.rst @@ -0,0 +1,123 @@ +.. currentmodule:: cfdm +.. default-role:: obj + +cfdm.NetCDF4Array +================= + +---- + +.. autoclass:: cfdm.NetCDF4Array + :no-members: + :no-inherited-members: + +Inspection +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDF4Array.get_compression_type + ~cfdm.NetCDF4Array.get_subspace + ~cfdm.NetCDF4Array.get_attributes + + +.. rubric:: Attributes + +.. autosummary:: + :nosignatures: + :toctree: ../attribute/ + :template: attribute.rst + + ~cfdm.NetCDF4Array.array + ~cfdm.NetCDF4Array.dtype + ~cfdm.NetCDF4Array.ndim + ~cfdm.NetCDF4Array.shape + ~cfdm.NetCDF4Array.size + +Units +----- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDF4Array.get_calendar + ~cfdm.NetCDF4Array.get_units + +File +---- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDF4Array.get_address + ~cfdm.NetCDF4Array.get_addresses + ~cfdm.NetCDF4Array.close + ~cfdm.NetCDF4Array.open + ~cfdm.NetCDF4Array.get_filename + ~cfdm.NetCDF4Array.get_filenames + ~cfdm.NetCDF4Array.get_format + ~cfdm.NetCDF4Array.get_formats + ~cfdm.NetCDF4Array.get_groups + ~cfdm.NetCDF4Array.get_mask + ~cfdm.NetCDF4Array.get_unpack + ~cfdm.NetCDF4Array.get_storage_options + +Miscellaneous +------------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDF4Array.copy + ~cfdm.NetCDF4Array.to_memory + +Special +------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDF4Array.__getitem__ + +Docstring substitutions +----------------------- + +.. rubric:: Methods + +.. 
autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDF4Array._docstring_special_substitutions + ~cfdm.NetCDF4Array._docstring_substitutions + ~cfdm.NetCDF4Array._docstring_package_depth + ~cfdm.NetCDF4Array._docstring_method_exclusions + +Deprecated +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDF4Array.get_missing_values diff --git a/docs/source/class/cfdm.NetCDFArray.rst b/docs/source/class/cfdm.NetCDFArray.rst deleted file mode 100644 index 989e90cae..000000000 --- a/docs/source/class/cfdm.NetCDFArray.rst +++ /dev/null @@ -1,108 +0,0 @@ -.. currentmodule:: cfdm -.. default-role:: obj - -cfdm.NetCDFArray -================ - ----- - -.. autoclass:: cfdm.NetCDFArray - :no-members: - :no-inherited-members: - -Inspection ----------- - -.. rubric:: Methods - -.. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - - ~cfdm.NetCDFArray.get_compression_type - ~cfdm.NetCDFArray.get_subspace - ~cfdm.NetCDFArray.get_missing_values - -.. rubric:: Attributes - -.. autosummary:: - :nosignatures: - :toctree: ../attribute/ - :template: attribute.rst - - ~cfdm.NetCDFArray.array - ~cfdm.NetCDFArray.dtype - ~cfdm.NetCDFArray.ndim - ~cfdm.NetCDFArray.shape - ~cfdm.NetCDFArray.size - -Units ------ - -.. rubric:: Methods - -.. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - - ~cfdm.NetCDFArray.get_calendar - ~cfdm.NetCDFArray.get_units - -File ----- - -.. rubric:: Methods - -.. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - - ~cfdm.NetCDFArray.get_address - ~cfdm.NetCDFArray.get_addresses - ~cfdm.NetCDFArray.close - ~cfdm.NetCDFArray.open - ~cfdm.NetCDFArray.get_filename - ~cfdm.NetCDFArray.get_filenames - ~cfdm.NetCDFArray.get_format - ~cfdm.NetCDFArray.get_formats - ~cfdm.NetCDFArray.get_groups - ~cfdm.NetCDFArray.get_mask - -Miscellaneous -------------- - -.. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - - ~cfdm.NetCDFArray.copy - ~cfdm.NetCDFArray.to_memory - -Special -------- - -.. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - - ~cfdm.NetCDFArray.__getitem__ - -Docstring substitutions ------------------------ - -.. rubric:: Methods - -.. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - - ~cfdm.NetCDFArray._docstring_special_substitutions - ~cfdm.NetCDFArray._docstring_substitutions - ~cfdm.NetCDFArray._docstring_package_depth - ~cfdm.NetCDFArray._docstring_method_exclusions diff --git a/docs/source/class/cfdm.NetCDFIndexer.rst b/docs/source/class/cfdm.NetCDFIndexer.rst new file mode 100644 index 000000000..5e236b0f8 --- /dev/null +++ b/docs/source/class/cfdm.NetCDFIndexer.rst @@ -0,0 +1,42 @@ +.. currentmodule:: cfdm +.. default-role:: obj + +cfdm.NetCDFIndexer +================== + +---- + +.. autoclass:: cfdm.NetCDFIndexer + :no-members: + :no-inherited-members: + +Inspection +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDFIndexer.attributes + +.. rubric:: Attributes + +.. autosummary:: + :nosignatures: + :toctree: ../attribute/ + :template: attribute.rst + + ~cfdm.NetCDFIndexer.shape + +Special +------- + +.. 
autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDFIndexer.__getitem__ diff --git a/docs/source/conf.py b/docs/source/conf.py index eecf98b9f..d01b55777 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -131,6 +131,7 @@ def _get_date(): "numpy": ("https://numpy.org/doc/stable", None), # 'netCDF4': ('https://unidata.github.io/netcdf4-python/', None), "cftime": ("https://unidata.github.io/cftime", None), + "h5netcdf": ("https://h5netcdf.org", None), } # This extension is meant to help with the common pattern of having diff --git a/docs/source/extensions.rst b/docs/source/extensions.rst index 3b1848007..aaac4d3b2 100644 --- a/docs/source/extensions.rst +++ b/docs/source/extensions.rst @@ -150,7 +150,7 @@ in overridden methods. Data=cfdm.Data, GatheredArray=cfdm.GatheredArray, - NetCDFArray=cfdm.NetCDFArray, + NetCDF4Array=cfdm.NetCDF4Array, RaggedContiguousArray=cfdm.RaggedContiguousArray, RaggedIndexedArray=cfdm.RaggedIndexedArray, RaggedIndexedContiguousArray=cfdm.RaggedIndexedContiguousArray, diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst index 8cb2242ce..4fc6b1e14 100644 --- a/docs/source/introduction.rst +++ b/docs/source/introduction.rst @@ -76,12 +76,15 @@ to add more sophisticated methods. The cfdm package can * read :term:`field constructs ` and :term:`domain - constructs ` from netCDF and CDL datasets, + constructs ` from netCDF and CDL datasets with a + choice of netCDF backends, * create new field and domain constructs in memory, * write field and domain constructs to netCDF datasets on disk, +* read, write, and manipulate UGRID mesh topologies, + * read, write, and create coordinates defined by geometry cells, * read and write netCDF4 string data-type variables, @@ -144,9 +147,10 @@ If you use cfdm, either as a stand-alone application or to provide a CF data model implementation to another software library, please consider including the reference: -Hassell, D., and Bartholomew, S. L. (2020). cfdm: A Python reference - implementation of the CF data model. Journal of Open Source - Software, 5(54), 2717, https://doi.org/10.21105/joss.02717 +Hassell, D., and Bartholomew, S. L. (2020). + cfdm: A Python reference implementation of the CF data + model. Journal of Open Source Software, 5(54), 2717, + https://doi.org/10.21105/joss.02717 .. code-block:: bibtex @@ -168,28 +172,30 @@ Hassell, D., and Bartholomew, S. L. (2020). cfdm: A Python reference **References** -------------- -Eaton, B., Gregory, J., Drach, B., Taylor, K., Hankin, S., Caron, J., - Signell, R., et al. (2020). NetCDF Climate and Forecast (CF) +Eaton, B., Gregory, J., Drach, B., Taylor, K., Hankin, S., Caron, J., Signell, R., et al. (2020). + NetCDF Climate and Forecast (CF) Metadata Conventions. CF Conventions Committee. Retrieved from https://cfconventions.org/cf-conventions/cf-conventions.html -Hassell, D., and Bartholomew, S. L. (2020). cfdm: A Python reference - implementation of the CF data model. Journal of Open Source - Software, 5(54), 2717, https://doi.org/10.21105/joss.02717 +Hassell, D., and Bartholomew, S. L. (2020). + cfdm: A Python reference implementation of the CF data + model. Journal of Open Source Software, 5(54), 2717, + https://doi.org/10.21105/joss.02717 -Hassell, D., Gregory, J., Blower, J., Lawrence, B. N., and - Taylor, K. E. (2017). A data model of the Climate and Forecast - metadata conventions (CF-1.6) with a software implementation - (cf-python v2.1), Geosci. 
Model Dev., 10, 4619-4646, +Hassell, D., Gregory, J., Blower, J., Lawrence, B. N., and Taylor, K. E. (2017). + A data model of the Climate and Forecast metadata conventions + (CF-1.6) with a software implementation (cf-python v2.1), + Geosci. Model Dev., 10, 4619-4646, https://doi.org/10.5194/gmd-10-4619-2017 -Rew, R., and Davis, G. (1990). NetCDF: An Interface for Scientific - Data Access. IEEE Computer Graphics and Applications, 10(4), +Rew, R., and Davis, G. (1990). + NetCDF: An Interface for Scientific Data Access. IEEE Computer + Graphics and Applications, 10(4), 76–82. https://doi.org/10.1109/38.56302 -Rew, R., Hartnett, E., and Caron, J. (2006). NetCDF-4: Software - Implementing an Enhanced Data Model for the Geosciences. In 22nd - International Conference on Interactive Information Processing - Systems for Meteorology, Oceanography, and Hydrology. AMS. Retrieved - from +Rew, R., Hartnett, E., and Caron, J. (2006). + NetCDF-4: Software Implementing an Enhanced Data Model for the + Geosciences. In 22nd International Conference on Interactive + Information Processing Systems for Meteorology, Oceanography, and + Hydrology. AMS. Retrieved from https://www.unidata.ucar.edu/software/netcdf/papers/2006-ams.pdf diff --git a/docs/source/spelling_false_positives.txt b/docs/source/spelling_false_positives.txt index 013dff0ae..4525de3c6 100644 --- a/docs/source/spelling_false_positives.txt +++ b/docs/source/spelling_false_positives.txt @@ -8,6 +8,8 @@ atol ATOL AuxiliaryCoordinate auxiliarycoordinate +backend +backends basenames Booleans bool @@ -77,6 +79,7 @@ hashable Hassell hdf indexable +init initio inplace instantiation @@ -105,9 +108,11 @@ ncvars nd ndim ness -netcdf netCDF +netcdf NetCDFArray +netcdfArray +NetCDFIndexer Nino nonzero numpy @@ -168,6 +173,7 @@ uncompresses uncompressing unicode unfilter +url varid verboseness versionadded diff --git a/docs/source/tutorial.py b/docs/source/tutorial.py index 50c2f209f..923e6e918 100644 --- a/docs/source/tutorial.py +++ b/docs/source/tutorial.py @@ -538,8 +538,8 @@ import netCDF4 nc = netCDF4.Dataset('file.nc', 'r') v = nc.variables['ta'] -netcdf_array = cfdm.NetCDFArray(filename='file.nc', address='ta', - dtype=v.dtype, shape=v.shape) +netcdf_array = cfdm.NetCDF4Array(filename='file.nc', address='ta', + dtype=v.dtype, shape=v.shape) data_disk = cfdm.Data(netcdf_array) numpy_array = v[...] data_memory = cfdm.Data(numpy_array) diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index f8487d613..c0bfe963e 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -113,11 +113,11 @@ instance or `cfdm.Domain` instance respectively. Henceforth the phrase ---------------------------------------------------- The `cfdm.read` function reads a `netCDF -`_ file from disk, or -from an `OPeNDAP `_ URL [#dap]_, and by -default returns the contents as a Python list of zero or more field -constructs. This list contains a field construct to represent each of -the CF-netCDF data variables in the file. +`_ file from disk, from +an `OPeNDAP `_ URL [#dap]_, or from an S3 +object store, and by default returns the contents as a Python list of +zero or more field constructs. This list contains a field construct to +represent each of the CF-netCDF data variables in the file. Datasets of any version of CF up to and including CF-|version| can be read. @@ -2855,19 +2855,20 @@ All the of above examples use arrays in memory to construct the data instances for the field and metadata constructs. 
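
A sketch of how the new reading options documented in this patch
series might be used together; the ``netCDF_backend`` and
``storage_options`` keywords are those added to `cfdm.read`, while the
file names, S3 URI and endpoint here are hypothetical:

    import cfdm

    # Force the h5netcdf library to be used to open the file
    fields = cfdm.read("file.nc", netCDF_backend="h5netcdf")

    # Read from an S3 object store; the dictionary is passed on to
    # the creation of an s3fs.S3FileSystem file system
    fields = cfdm.read(
        "s3://my-bucket/file.nc",
        storage_options={"endpoint_url": "https://s3.example.com"},
    )
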
It is, however,
 possible to create data from arrays that reside on disk. The
 `cfdm.read` function creates data in this manner. A pointer to an
-array in a netCDF file can be stored in a `~cfdm.NetCDFArray`
-instance, which is is used to initialise a `~cfdm.Data` instance.
+array in a netCDF file can be stored in a `~cfdm.NetCDF4Array` or
+`~cfdm.H5netcdfArray` instance, which is used to initialise a
+`~cfdm.Data` instance.
 
 .. code-block:: python
    :caption: *Define a variable from a dataset with the netCDF package
-             and use it to create a NetCDFArray instance with which to
+             and use it to create a NetCDF4Array instance with which to
              initialise a Data instance.*
 
    >>> import netCDF4
    >>> nc = netCDF4.Dataset('file.nc', 'r')
    >>> v = nc.variables['ta']
-   >>> netcdf_array = cfdm.NetCDFArray(filename='file.nc', address='ta',
-   ...                                 dtype=v.dtype, shape=v.shape)
+   >>> netcdf_array = cfdm.NetCDF4Array(filename='file.nc', address='ta',
+   ...                                  dtype=v.dtype, shape=v.shape)
    >>> data_disk = cfdm.Data(netcdf_array)
 
@@ -2883,7 +2884,7 @@ instance, which is is used to initialise a `~cfdm.Data` instance.
 
 Note that data type, number of dimensions, dimension sizes and number
 of elements of the array on disk that are used to initialise the
-`~cfdm.NetCDFArray` instance are those expected by the CF data model,
+`~cfdm.NetCDF4Array` instance are those expected by the CF data model,
 which may be different to those of the netCDF variable in the file
 (although they are the same in the above example). For example, a
 netCDF character array of shape ``(12, 9)`` is viewed in cfdm as a
diff --git a/release_docs b/release_docs
index 95ebf52ad..b4ea15309 100755
--- a/release_docs
+++ b/release_docs
@@ -9,6 +9,12 @@ fi
 
 version=`python -c "import cfdm; print(cfdm.__version__)"`
 
+sphinx_version=`python -c "import sphinx; print(sphinx.__version__)"`
+if [[ $sphinx_version != "2.4.5" ]] ; then
+    echo "ERROR: Must (sadly) use sphinx version 2.4.5. 
Got $sphinx_version"
+    exit 3
+fi
+
 if [[ $1 = "latest" ]] ; then
     dir=$PWD/docs
 elif [[ $1 = "archive" ]] ; then
diff --git a/setup.py b/setup.py
index 937fdd1e7..19e3c96db 100755
--- a/setup.py
+++ b/setup.py
@@ -198,6 +198,7 @@ def _get_version():
         "cfdm.read_write",
         "cfdm.read_write.abstract",
         "cfdm.read_write.netcdf",
+        "cfdm.read_write.netcdf.flatten",
         "cfdm.test",
     ],
     scripts=["scripts/cfdump"],

From 589dc1384ff44552e04273dad3070464ce239057 Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Tue, 13 Feb 2024 18:35:40 +0000
Subject: [PATCH 35/88] dev

---
 Changelog.rst                              |   6 +-
 cfdm/__init__.py                           |   3 +-
 cfdm/data/netcdfindexer.py                 |  16 +-
 cfdm/read_write/netcdf/flatten/__init__.py |  14 +
 cfdm/read_write/netcdf/flatten/config.py   |   5 +-
 cfdm/read_write/netcdf/flatten/flatten.py  | 394 ++++++++++-----------
 cfdm/read_write/netcdf/netcdfread.py       |   8 +-
 cfdm/read_write/write.py                   |   2 +-
 docs/source/functions.rst                  |   1 +
 docs/source/introduction.rst               |   2 +
 docs/source/tutorial.rst                   |   3 +-
 11 files changed, 231 insertions(+), 223 deletions(-)

diff --git a/Changelog.rst b/Changelog.rst
index 1e0b0f08c..9666afe0d 100644
--- a/Changelog.rst
+++ b/Changelog.rst
@@ -3,10 +3,12 @@ Version 1.11.1.0
 
 **2024-??-??**
 
+* New function `cfdm.netcdf_flatten`
+  (https://github.com/NCAS-CMS/cfdm/issues/286)
 * Allow access to netCDF-4 files in S3 object stores
   (https://github.com/NCAS-CMS/cfdm/issues/285)
+* New class `cfdm.H5netcdfArray`
+* New class `cfdm.NetCDFIndexer`
 * New dependency: ``h5netcdf>=1.3.0``
 * New dependency: ``h5py>=3.10.0``
 * New dependency: ``s3fs>=2024.2.0``
diff --git a/cfdm/__init__.py b/cfdm/__init__.py
index db85621a9..84d863b25 100644
--- a/cfdm/__init__.py
+++ b/cfdm/__init__.py
@@ -77,7 +77,7 @@
 except ImportError as error1:
     raise ImportError(_error0 + str(error1))
 
-minimum_vn = "1.5.4"
+_minimum_vn = "1.5.4"
 if Version(netCDF4.__version__) < Version(_minimum_vn):
     raise ValueError(
         f"Bad netCDF4 version: cfdm requires netCDF4>={_minimum_vn}. "
@@ -242,6 +242,7 @@
 
 from .cfdmimplementation import CFDMImplementation, implementation
 from .read_write import read, write
+from .read_write.netcdf.flatten import netcdf_flatten
 
 from .examplefield import example_field, example_fields, example_domain
 
diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py
index 11c96bb42..fced5e0b2 100644
--- a/cfdm/data/netcdfindexer.py
+++ b/cfdm/data/netcdfindexer.py
@@ -1,7 +1,7 @@
-"""License information:
+"""A data indexer that applies netCDF masking and unpacking. 
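
An illustrative aside, not part of the patch itself: the changes to
test_NetCDFIndexer.py earlier in the series suggest usage along these
lines, with made-up array values:

    import numpy as np
    import cfdm

    # Wrap a numpy array together with netCDF-style attributes; the
    # masking conventions are applied when the wrapper is indexed
    array = np.arange(10)
    x = cfdm.NetCDFIndexer(array, attributes={"_FillValue": 4})
    masked = x[...]  # the element equal to 4 comes back masked
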
Portions of this code were adapted from the `netCDF4` library, which -is lcensed with the carries MIT License: +carries the MIT License: Copyright 2008 Jeffrey Whitaker @@ -176,6 +176,7 @@ def __getitem__(self, index): # Index the variable data = variable[index] + # Convert str, char, and object data to byte strings if isinstance(data, str): data = np.array(data, dtype="S") elif data.dtype.kind in "OSU": @@ -193,10 +194,7 @@ def __getitem__(self, index): dtype_unsigned_int = None if unpack: - is_unsigned_int = attributes.get("_Unsigned", False) in ( - "true", - "True", - ) + is_unsigned_int = attributes.get("_Unsigned") in ("true", "True") if is_unsigned_int: data_dtype = data.dtype dtype_unsigned_int = ( @@ -272,8 +270,8 @@ def _check_safecast(self, attname, dtype, attributes): is_safe = _safecast(att, atta) if not is_safe: - logger.warn( - f"WARNING: Attribute {attname} not used since it can't " + logger.info( + f"Mask attribute {attname!r} not used since it can't " f"be safely cast to variable data type {dtype!r}" ) # pragma: no cover @@ -329,7 +327,7 @@ def _mask(self, data, dtype, attributes, dtype_unsigned_int): The masked data. """ - # + # The Boolean mask accounting for all methods of specification totalmask = None # The fill value for the returned numpy array fill_value = None diff --git a/cfdm/read_write/netcdf/flatten/__init__.py b/cfdm/read_write/netcdf/flatten/__init__.py index d09591bec..5f73659eb 100644 --- a/cfdm/read_write/netcdf/flatten/__init__.py +++ b/cfdm/read_write/netcdf/flatten/__init__.py @@ -1 +1,15 @@ +"""Flatten NetCDF groups. + +Portions of this package were adapted from the `netcdf_flattener` +library, which carries the Apache 2.0 License: + +Copyright (c) 2020 EUMETSAT + +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. The ASF licenses this file to you +under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy +of the License at http://www.apache.org/licenses/LICENSE-2.0. + +""" from .flatten import netcdf_flatten diff --git a/cfdm/read_write/netcdf/flatten/config.py b/cfdm/read_write/netcdf/flatten/config.py index b754e1030..1c2451106 100644 --- a/cfdm/read_write/netcdf/flatten/config.py +++ b/cfdm/read_write/netcdf/flatten/config.py @@ -18,9 +18,6 @@ # 'lax_mode=True' in `flatten`. ref_not_found_error = "REF_NOT_FOUND" -# Default size, in bytes, of slice to use when copying data arrays -default_copy_slice_size = 134217728 - # NetCDF global attribute in the flattened dataset containing the # mapping of flattened attribute names to grouped attribute names flattener_attribute_map = "__flattener_attribute_map" @@ -52,7 +49,7 @@ class FlatteningRules: # dimensions (higher values have priority) ref_to_dim: int = 0 # ref_to_var: Positive integer if contains references to variables - # (highest int have priority) + # (highest values have priority) ref_to_var: int = 0 # resolve_key: True if 'keys' have to be resolved in 'key1: value1 # key2: value2 value3' or 'key1 key2' diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index fb2e50600..251112776 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -1,25 +1,9 @@ -"""Flatten NetCDF groups. 
- -Portions of this code were adapted from the `netcdf_flattener` -library, which carries the Apache 2.0 License: - -Copyright (c) 2020 EUMETSAT - -Licensed to the Apache Software Foundation (ASF) under one or more -contributor license agreements. The ASF licenses this file to you -under the Apache License, Version 2.0 (the "License"); you may not use -this file except in compliance with the License. You may obtain a copy -of the License at http://www.apache.org/licenses/LICENSE-2.0. - -""" - import hashlib import logging import re import warnings from .config import ( - default_copy_slice_size, flattener_attribute_map, flattener_dimension_map, flattener_separator, @@ -45,20 +29,20 @@ def netcdf_flatten( - input_ds, output_ds, lax_mode=False, copy_data=True, copy_slices=None + input_ds, + output_ds, + lax_mode=False, + omit_data=False, + write_chunksize=134217728, ): """Create a flattened version of a netCDF dataset. - For variable that are too big to fit in memory, the optional - "copy_slices" input allows to copy some or all of the variables in - slices. - .. versionadded:: (cfdm) 1.11.1.0 :Parameters: input_ds: `netCDF4.Dataset` or `h5netcdf.File` - The dataset to be falttened. + The dataset to be flattened. output_ds: `netCDF4.Dataset` A container for the flattened dataset. @@ -68,38 +52,40 @@ def netcdf_flatten( halts the execution. If True, then continue with a warning. - copy_data: `bool`, optional - If True, the default, then all data arrays are copied - from the input to the output dataset. If False, then - this does not happen. Use this option only if the data - arrays of the flattened dataset are never to be - accessed. - - copy_slices: `dict`, optional - Dictionary containing variable_name/shape key/value - pairs, where variable_name is the path to the variable - name in the original dataset (for instance - ``/group1/group2/my_variable``), and shape is either - `None` for using default slice value, or a custom - slicing shape in the form of a tuple of the same - dimension as the variable (for instance ``(1000, 2000, - 1500)`` for a 3-dimensional variable). If a variable - from the dataset is not contained in the dictionary - then it will not be sliced and copied normally. + omit_data: `bool`, optional + If True then do not copy the data of any variables from + *input_ds* to *output_ds*. This does not affect the amount + of netCDF variables and dimensions that are written to the + file, nor the netCDF variables' attributes, but for all + variables it does not create data on disk or in + memory. The resulting dataset will be smaller than it + otherwise would have been, and when the new dataset is + accessed the data of these variables will be represented + by an array of all missing data. If False, the default, + then all data arrays are copied. + + write_chunksize: `int`, optional + When *omit_data* is False, the copying of data is done + piecewise to keep memory usage down. *write_chunksize* is + the size in bytes of how much data is copied from + *input_ds* to *output_ds* for each piece. Ignored if + *omit_data* is True. 
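
    (For scale, the default *write_chunksize* of 134217728 bytes is
    2**27, that is 128 MiB copied per piece.) A minimal usage sketch
    of this function, with hypothetical file names; note the
    requirement, checked in `_Flattener.__init__` below, that the
    output be a NETCDF4-format dataset distinct from the input:

        import netCDF4
        import cfdm

        # Open the grouped input (a netCDF4.Dataset or h5netcdf.File)
        input_ds = netCDF4.Dataset("grouped.nc", "r")

        # The flattened output must be a writable NETCDF4-format
        # dataset that is different from the input
        output_ds = netCDF4.Dataset("flat.nc", "w", format="NETCDF4")

        # Flatten, downgrading unresolvable references to warnings
        cfdm.netcdf_flatten(input_ds, output_ds, lax_mode=True)

        output_ds.close()
        input_ds.close()
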
""" _Flattener( - input_ds, lax_mode, copy_data=copy_data, copy_slices=copy_slices - ).flatten(output_ds) + input_ds, + output_ds, + lax_mode, + omit_data=omit_data, + write_chunksize=write_chunksize, + ).flatten() -def parse_var_attr(attribute): +def parse_attribute(name, attribute): """Parse variable attribute of any form into a dict: * 'time' -> {'time': []} - * 'lat lon' -> {'lat': [], 'lon': []} - * 'area: time volume: lat lon' -> {'area': ['time'], 'volume': ['lat', 'lon']} @@ -107,6 +93,9 @@ def parse_var_attr(attribute): :Parameters: + name: `str` + The attribute name (e.g. ``'cell_methods'```). + attribute: `str` The attribute value to parse. @@ -169,8 +158,8 @@ def subst(s): ] out[term] = values else: - raise ReferenceException( - f"Error while parsing attribute value: {attribute!r}" + raise AttributeParsingException( + f"Error parsing {name!r} attribute with value {attribute!r}" ) return out @@ -181,15 +170,15 @@ def generate_var_attr_str(d): .. versionadded:: (cfdm) 1.11.1.0 - :Parameters: + :Parameters: - d: `dict` - A resolved and parsed attribute. + d: `dict` + A resolved and parsed attribute. - :Returns: + :Returns: - `str` - The flattened attribute value. + `str` + The flattened attribute value. """ parsed_list = [] @@ -214,56 +203,74 @@ class _Flattener: """ - def __init__(self, input_ds, lax_mode, copy_data=True, copy_slices=None): + def __init__( + self, + input_ds, + output_ds, + lax_mode, + omit_data=False, + write_chunksize=134217728, + ): """**Initialisation** :Parameters: input_ds: `netCDF4.Dataset` or `h5netcdf.File` - The dataset to be falttened. + See `netcdf_flatten`. + + output_ds: `netCDF4.Dataset` + See `netcdf_flatten`. lax_mode: `bool`, optional - If False, the default, the not resolving a reference - halts the execution. If True, then continue with a - warning. - - copy_data: `bool`, optional - If True, the default, then all data arrays are copied - from the input to the output dataset. If False, then - this does not happen. Use this option only if the data - arrays of the flattened dataset are never to be - accessed. - - copy_slices: `dict`, optional - Dictionary containing variable_name/shape key/value - pairs, where variable_name is the path to the variable - name in the original dataset (for instance - ``/group1/group2/my_variable``), and shape is either - `None` for using default slice value, or a custom - slicing shape in the form of a tuple of the same - dimension as the variable (for instance ``(1000, 2000, - 1500)`` for a 3-dimensional variable). If a variable - from the dataset is not contained in the dictionary - then it will not be sliced and copied normally. + See `netcdf_flatten`. - """ - self.__attr_map_value = [] - self.__dim_map_value = [] - self.__var_map_value = [] + omit_data: `bool`, optional + See `netcdf_flatten`. - self.__dim_map = {} - self.__var_map = {} + write_chunksize: `int`, optional + See `netcdf_flatten`. - self.__lax_mode = lax_mode + """ + self._attr_map_value = [] + self._dim_map_value = [] + self._var_map_value = [] - self.__copy_data = copy_data - self.__copy_slices = copy_slices + self._dim_map = {} + self._var_map = {} - self.__input_file = input_ds - self.__output_file = None + self._input_ds = input_ds + self._output_ds = output_ds + self._lax_mode = lax_mode + self._omit_data = omit_data + self._write_chunksize = write_chunksize + + if ( + output_ds == input_ds + or output_ds.filepath() == self.filepath(input_ds) + or output_ds.data_model != "NETCDF4" + ): + raise ValueError( + "Invalid inputs. 
Input and output datasets should " + "be different, and output should be of the 'NETCDF4' format." + ) def attrs(self, variable): - """TODOHDF.""" + """Return the variable attributes. + + .. versionadded:: (cfdm) 1.11.1.0 + + :Parameters: + + var: + The dataset variable. + + :Returns: + + `dict` + A dictionary of the attribute values keyed by their + names. + + """ try: # h5netcdf return dict(variable.attrs) @@ -572,57 +579,35 @@ def path(self, group): except AttributeError: return group_separator - def flatten(self, output_ds): - """Flattens and write to output file. + def flatten(self): + """Flattens and writes to output file. .. versionadded:: (cfdm) 1.11.1.0 - :Parameters: - - output_ds: `netCDF4.Dataset` - A container for the flattened dataset. - :Return: `None` """ - logging.info( - f"Flattening the groups of {self.filepath(self.__input_file)}" - ) + input_ds = self._input_ds + output_ds = self._output_ds - if ( - output_ds == self.__input_file - or output_ds.filepath() == self.filepath(self.__input_file) - or output_ds.data_model != "NETCDF4" - ): - raise ValueError( - "Invalid inputs. Input and output datasets should " - "be different, and output should be of the 'NETCDF4' format." - ) - - self.__output_file = output_ds + logging.info(f"Flattening the groups of {self.filepath(input_ds)}") # Flatten product - self.process_group(self.__input_file) + self.process_group(input_ds) # Add name mapping attributes - self.__output_file.setncattr( - flattener_attribute_map, self.__attr_map_value - ) - self.__output_file.setncattr( - flattener_dimension_map, self.__dim_map_value - ) - self.__output_file.setncattr( - flattener_variable_map, self.__var_map_value - ) + output_ds.setncattr(flattener_attribute_map, self._attr_map_value) + output_ds.setncattr(flattener_dimension_map, self._dim_map_value) + output_ds.setncattr(flattener_variable_map, self._var_map_value) # Browse flattened variables to rename references: logging.info( " Browsing flattened variables to rename references " "in attributes" ) - for var in self.__output_file.variables.values(): + for var in output_ds.variables.values(): self.adapt_references(var) def process_group(self, input_group): @@ -681,12 +666,12 @@ def flatten_attribute(self, input_group, attr_name): new_attr_name = self.generate_flattened_name(input_group, attr_name) # Write attribute - self.__output_file.setncattr( + self._output_ds.setncattr( new_attr_name, self.getncattr(input_group, attr_name) ) # Store new naming for later and in mapping attribute - self.__attr_map_value.append( + self._attr_map_value.append( self.generate_mapping_str(input_group, attr_name, new_attr_name) ) @@ -716,17 +701,17 @@ def flatten_dimension(self, dim): ) # Write dimension - self.__output_file.createDimension( + self._output_ds.createDimension( new_name, (len(dim), None)[dim.isunlimited()] ) # Store new name in dict for resolving references later - self.__dim_map[ + self._dim_map[ self.pathname(self.group(dim), self.name(dim)) ] = new_name # Add to name mapping attribute - self.__dim_map_value.append( + self._dim_map_value.append( self.generate_mapping_str( self.group(dim), self.name(dim), new_name ) @@ -760,7 +745,7 @@ def flatten_variable(self, var): # Replace old by new dimension names new_dims = list( map( - lambda x: self.__dim_map[ + lambda x: self._dim_map[ self.pathname(self.group(x), self.name(x)) ], self.get_dims(var), @@ -773,13 +758,13 @@ def flatten_variable(self, var): attributes = self.attrs(var) - copy_data = self.__copy_data - if copy_data: - fill_value = 
attributes.pop("_FillValue", None) - else: + omit_data = self._omit_data + if omit_data: fill_value = False + else: + fill_value = attributes.pop("_FillValue", None) - new_var = self.__output_file.createVariable( + new_var = self._output_ds.createVariable( new_name, self.dtype(var), new_dims, @@ -794,34 +779,19 @@ def flatten_variable(self, var): fill_value=fill_value, ) - if copy_data: - # Find out slice method for variable and copy data - copy_slices = self.__copy_slices - if copy_slices is None or fullname not in copy_slices: - # Copy data as a whole - new_var[...] = var[...] - elif copy_slices[fullname] is None: - # Copy with default slice size - copy_slice = tuple( - default_copy_slice_size // len(var.shape) - for _ in range(len(var.shape)) - ) - self.copy_var_by_slices(new_var, var, copy_slice) - else: - # Copy in slices - copy_slice = copy_slices[fullname] - self.copy_var_by_slices(new_var, var, copy_slice) + if not omit_data: + self.write_data_in_chunks(var, new_var) # Copy attributes new_var.setncatts(attributes) # Store new name in dict for resolving references later - self.__var_map[ + self._var_map[ self.pathname(self.group(var), self.name(var)) ] = new_name # Add to name mapping attribute - self.__var_map_value.append( + self._var_map_value.append( self.generate_mapping_str( self.group(var), self.name(var), new_name ) @@ -887,44 +857,44 @@ def increment_pos(self, pos, dim, copy_slice_shape, var_shape): # increment. Finish. return False - def copy_var_by_slices(self, new_var, old_var, copy_slice_shape): + def write_data_in_chunks(self, old_var, new_var): """Copy the data of a variable to a new one by slice. .. versionadded:: (cfdm) 1.11.1.0 :Parameters: - new_var: `netCDF4.Variable` - The new variable where to copy dataf. - old_var: `netCDF4.Variable` or `h5netcdf.Variable` The variable where data should be copied from. - copy_slice_shape: `tuple` - The shape of the slice + new_var: `netCDF4.Variable` + The new variable where to copy data. :Returns: `None` """ + ndim = old_var.ndim + shape = old_var.shape + chunk_shape = ( + (self.write_chunksize // (old_var.dtype.itemsize * ndim)), + ) * ndim + logging.info( - f" Copying data of {self.name(old_var)} in " - f"{copy_slice_shape} slices" + f" Copying {self.name(old_var)!r} data in chunks of " + f"{chunk_shape}" ) - # Initial position vector - pos = [0 for _ in range(len(copy_slice_shape))] + pos = [0] * ndim # Copy in slices until end reached var_end_reached = False while not var_end_reached: # Create current slice current_slice = tuple( - slice( - pos[dim_i], min(old_var.shape[dim_i], pos[dim_i] + dim_l) - ) - for dim_i, dim_l in enumerate(copy_slice_shape) + slice(pos[dim_i], min(shape[dim_i], pos[dim_i] + dim_l)) + for dim_i, dim_l in enumerate(chunk_shape) ) # Copy data in slice @@ -932,7 +902,7 @@ def copy_var_by_slices(self, new_var, old_var, copy_slice_shape): # Get next position var_end_reached = not self.increment_pos( - pos, 0, copy_slice_shape, old_var.shape + pos, 0, chunk_shape, shape ) def resolve_reference(self, orig_ref, orig_var, rules): @@ -964,11 +934,14 @@ def resolve_reference(self, orig_ref, orig_var, rules): absolute_ref = None ref_type = "" + ref_to_dim = rules.ref_to_dim + ref_to_var = rules.ref_to_var + # Resolve first as dim (True), or var (False) - resolve_dim_or_var = rules.ref_to_dim > rules.ref_to_var + resolve_dim_or_var = ref_to_dim > ref_to_var # Resolve var (resp. dim) if resolving as dim (resp. 
var) failed - resolve_alt = rules.ref_to_dim and rules.ref_to_var + resolve_alt = ref_to_dim and ref_to_var # Reference is already given by absolute path if ref.startswith(group_separator): @@ -1061,12 +1034,14 @@ def resolve_reference_proximity( else: ref_type = "variable" + stop_at_local_apex = rules.stop_at_local_apex + resolved_var = self.search_by_proximity( ref, self.group(orig_var), resolve_dim_or_var, False, - rules.stop_at_local_apex, + stop_at_local_apex, ) # If failed and alternative possible, second tentative @@ -1081,7 +1056,7 @@ def resolve_reference_proximity( self.group(orig_var), not resolve_dim_or_var, False, - rules.stop_at_local_apex, + stop_at_local_apex, ) # If found, create ref string @@ -1160,7 +1135,7 @@ def resolve_reference_post_processing( "coordinates" not in self.ncattrs(orig_var) or orig_ref not in self.getncattr(orig_var, "coordinates") ) - or self._Flattener__input_file[absolute_ref].ndim > 0 + or self._input_ds[absolute_ref].ndim > 0 ) ): logging.info( @@ -1344,27 +1319,28 @@ def resolve_references(self, var, old_var): var_attrs = self.attrs(var) for name in special_attributes.intersection(var_attrs): # Parse attribute value - parsed_attr = parse_var_attr(var_attrs[name]) + parsed_attribute = parse_attribute(name, var_attrs[name]) # Resolved references in parsed as required by attribute # properties resolved_parsed_attr = {} - rules = flattening_rules.get(name) - for k, v in parsed_attr.items(): - if rules.resolve_key: + rules = flattening_rules[name] + resolve_key = rules.resolve_key + resolve_value = rules.resolve_value + + for k, v in parsed_attribute.items(): + if resolve_key: k = self.resolve_reference(k, old_var, rules) - if rules.resolve_value and v is not None: + if resolve_value and v is not None: v = [self.resolve_reference(x, old_var, rules) for x in v] resolved_parsed_attr[k] = v # Re-generate attribute value string with resolved # references - var.setncattr( - rules.name, generate_var_attr_str(resolved_parsed_attr) - ) + var.setncattr(name, generate_var_attr_str(resolved_parsed_attr)) def adapt_references(self, var): """Adapt references. 
@@ -1390,27 +1366,30 @@ def adapt_references(self, var): var_attrs = self.attrs(var) for name in special_attributes.intersection(var_attrs): # Parse attribute value - attr_value = var_attrs[name] - parsed_attr = parse_var_attr(attr_value) + value = var_attrs[name] + parsed_attribute = parse_attribute(name, value) adapted_parsed_attr = {} - rules = flattening_rules.get(name) - for k, v in parsed_attr.items(): - if rules.resolve_key: + rules = flattening_rules[name] + resolve_key = rules.resolve_key + resolve_value = rules.resolve_value + + for k, v in parsed_attribute.items(): + if resolve_key: k = self.adapt_name(k, rules) - if rules.resolve_value and v is not None: + if resolve_value and v is not None: v = [self.adapt_name(x, rules) for x in v] adapted_parsed_attr[k] = v new_attr_value = generate_var_attr_str(adapted_parsed_attr) - var.setncattr(rules.name, new_attr_value) + var.setncattr(name, new_attr_value) logging.info( - f" Value of {self.name(var)}.{rules.name} changed " - f"from {attr_value!r} to {new_attr_value!r}" + f" Value of {self.name(var)}.{name} changed " + f"from {value!r} to {new_attr_value!r}" ) def adapt_name(self, resolved_ref, rules): @@ -1437,12 +1416,15 @@ def adapt_name(self, resolved_ref, rules): if ref_not_found_error in resolved_ref: return resolved_ref + ref_to_dim = rules.ref_to_dim + ref_to_var = rules.ref_to_var + # Select highest priority map - if rules.ref_to_dim > rules.ref_to_var: - name_mapping = self.__dim_map + if ref_to_dim > ref_to_var: + name_mapping = self._dim_map - if rules.ref_to_dim < rules.ref_to_var: - name_mapping = self.__var_map + if ref_to_dim < ref_to_var: + name_mapping = self._var_map # Try to find mapping try: @@ -1450,11 +1432,11 @@ def adapt_name(self, resolved_ref, rules): # If not found, look in other map if allowed except KeyError: - if rules.ref_to_dim and rules.ref_to_var: - if rules.ref_to_dim < rules.ref_to_var: - name_mapping = self.__dim_map + if ref_to_dim and ref_to_var: + if ref_to_dim < ref_to_var: + name_mapping = self._dim_map else: - name_mapping = self.__var_map + name_mapping = self._var_map try: return name_mapping[resolved_ref] @@ -1622,22 +1604,32 @@ def handle_reference_error(self, ref, context=None): :Returns: `str` - The error message, or if *lax_mode* is True then a - `ReferenceException` is raised. + The error message, or if *lax_mode* is True then an + `UnresolvedReferenceException` is raised. """ message = f"Reference {ref!r} could not be resolved" if context is not None: message = f"{message} from {context}" - if self.__lax_mode: + if self._lax_mode: warnings.warn(message) return f"{ref_not_found_error}_{ref}" else: - raise ReferenceException(message) + raise UnresolvedReferenceException(message) + + +class AttributeParsingException(Exception): + """Exception for unparsable attribute. + + .. versionadded:: (cfdm) 1.11.1.0 + + """ + + pass -class ReferenceException(Exception): +class UnresolvedReferenceException(Exception): """Exception for unresolvable references in attributes. .. versionadded:: (cfdm) 1.11.1.0 diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index df0d7606b..b8980d399 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -592,7 +592,7 @@ def file_open(self, filename, flatten=True, verbose=None): flat_nc.set_fill_off() # Flatten the file - netcdf_flatten(nc, flat_nc, lax_mode=True, copy_data=False) + netcdf_flatten(nc, flat_nc, lax_mode=True, omit_data=True) # Store the original grouped file. 
This is primarily
             # because the unlimited dimensions in the flattened
@@ -10203,7 +10203,7 @@ def _file_variable(self, nc, var_name):
         return self._file_variables(nc)[var_name]
 
     def _file_variable_attributes(self, var):
-        """Return the variable attribute names.
+        """Return the variable attributes.
 
         .. versionadded:: (cfdm) 1.11.1.0
 
@@ -10214,14 +10214,14 @@ def _file_variable_attributes(self, var):
 
         :Returns:
 
-            `dict`-like
+            `dict`
                 A dictionary of the attribute values keyed by their
                 names.
 
         """
         try:
             # h5netcdf
-            return var.attrs
+            return dict(var.attrs)
         except AttributeError:
             # netCDF4
             return {attr: var.getncattr(attr) for attr in var.ncattrs()}
diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py
index 23ed36c41..ff5d1eea3 100644
--- a/cfdm/read_write/write.py
+++ b/cfdm/read_write/write.py
@@ -502,7 +502,7 @@ def write(
         variables' attributes, but does not create data on disk for
         the requested variables. The resulting file will be smaller
         than it otherwise would have been, and when the
-        new file is read then the data of these variables will be
+        new file is read the data of these variables will be
         represented by an array of all missing data.
 
         The *omit_data* parameter may be one, or a sequence, of:
diff --git a/docs/source/functions.rst b/docs/source/functions.rst
index 5919fe972..fc7b5a901 100644
--- a/docs/source/functions.rst
+++ b/docs/source/functions.rst
@@ -20,6 +20,7 @@ Reading and writing
 
    cfdm.read
    cfdm.write
+   cfdm.netcdf_flatten
 
 Constants
 ---------
diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst
index 4fc6b1e14..8ddbe417e 100644
--- a/docs/source/introduction.rst
+++ b/docs/source/introduction.rst
@@ -79,6 +79,8 @@ The cfdm package can
 * read field and domain constructs from netCDF and CDL datasets with
   a choice of netCDF backends,
 
+* read files from OPeNDAP servers and S3 object stores,
+
 * create new field and domain constructs in memory,
 
 * write field and domain constructs to netCDF datasets on disk,
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
index c0bfe963e..49e4dba2b 100644
--- a/docs/source/tutorial.rst
+++ b/docs/source/tutorial.rst
@@ -186,7 +186,8 @@ The `cfdm.read` function has optional parameters to
 
 * display information and issue warnings about the mapping of the
   netCDF file contents to CF data model constructs;
 
-* choose either `netCDF4` or `h5netcdf` backends for accessing netCDF files.
+* choose either `netCDF4` or `h5netcdf` backends for accessing netCDF
+  files.
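 
As a quick illustration of the backend choice described in the tutorial
change above, the following minimal sketch assumes only a local file
``file.nc`` and the ``netCDF_backend`` keyword of `cfdm.read` that is
documented elsewhere in this patch series:
 
    >>> import cfdm
    >>> # Let cfdm choose between the netCDF4 and h5netcdf backends
    >>> fields = cfdm.read("file.nc")
    >>> # Force the use of the h5netcdf backend
    >>> fields = cfdm.read("file.nc", netCDF_backend="h5netcdf")
 
Either call returns the same field constructs; only the library used to
access the file on disk differs.
 
 .. 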
_CF-compliance: From 6366990b25f125eb80a80d66d63349e26635c821 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 22 Feb 2024 12:55:09 +0000 Subject: [PATCH 36/88] client_kwargs endpoint_url --- cfdm/data/mixin/filearraymixin.py | 23 +++++++++++++++-------- cfdm/read_write/netcdf/netcdfread.py | 10 ++++++++-- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index 67ee7a305..3ea9c83e8 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -289,13 +289,18 @@ def get_storage_options( 'client_kwargs': {'region_name': 'fr-par'}} """ - out = self._get_component("storage_options", None) - if not out: - out = {} + storage_options = self._get_component("storage_options", None) + if not storage_options: + storage_options = {} else: - out = deepcopy(out) - - if create_endpoint_url and "endpoint_url" not in out: + storage_options = deepcopy(storage_options) + + client_kwargs = storage_options.get("client_kwargs", {}) + if ( + create_endpoint_url + and "endpoint_url" not in storage_options + and "endpoint_url" not in client_kwargs + ): if parsed_filename is None: if filename is None: try: @@ -309,9 +314,11 @@ def get_storage_options( if parsed_filename is not None and parsed_filename.scheme == "s3": # Derive endpoint_url from filename - out["endpoint_url"] = f"https://{parsed_filename.netloc}" + storage_options[ + "endpoint_url" + ] = f"https://{parsed_filename.netloc}" - return out + return storage_options def open(self, func, *args, **kwargs): """Return a dataset file object and address. diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index b8980d399..9d6b8e17f 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -17,6 +17,7 @@ import h5netcdf import netCDF4 import numpy as np +from dask.base import tokenize from packaging.version import Version from s3fs import S3FileSystem @@ -514,12 +515,17 @@ def file_open(self, filename, flatten=True, verbose=None): # Create an openable S3 file object storage_options = g["storage_options"] g["file_system_storage_options"][filename] = storage_options - if "endpoint_url" not in storage_options: + + client_kwargs = storage_options.get("client_kwargs", {}) + if ( + "endpoint_url" not in storage_options + and "endpoint_url" not in client_kwargs + ): # Derive endpoint_url from filename storage_options = storage_options.copy() storage_options["endpoint_url"] = f"https://{u.netloc}" - key = tuple(sorted(storage_options.items())) + key = tokenize(storage_options) file_systems = g["file_systems"] fs = file_systems.get(key) if fs is None: From 0e7ef66ccd7e71c780f5cb9311b11dba3970ad9a Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 27 Feb 2024 16:53:00 +0000 Subject: [PATCH 37/88] netcdf_flatten tidy and docs --- cfdm/read_write/netcdf/flatten/config.py | 6 +- cfdm/read_write/netcdf/flatten/flatten.py | 73 ++++++++++++++++------- cfdm/read_write/netcdf/netcdfread.py | 2 +- 3 files changed, 55 insertions(+), 26 deletions(-) diff --git a/cfdm/read_write/netcdf/flatten/config.py b/cfdm/read_write/netcdf/flatten/config.py index 1c2451106..72329b2c8 100644 --- a/cfdm/read_write/netcdf/flatten/config.py +++ b/cfdm/read_write/netcdf/flatten/config.py @@ -36,7 +36,9 @@ class FlatteningRules: """Define the flattening rules for a netCDF attribute. For a named netCDF attribute, the rules a define how the contents - of the attribute are flattened. 
+ of the attribute are flattened. For instance, the + ``ancillary_variables`` attribute contains the names of other + netCDF variables, separated by spaces. .. versionadded:: (cfdm) 1.11.1.0 @@ -58,7 +60,7 @@ class FlatteningRules: # value1 key2: value2 value3' resolve_value: bool = False # stop_at_local_apex: True if upward research in the hierarchy has - # to stop at local apex + # to stop at local apex. stop_at_local_apex: bool = False # accept_standard_names: True if any standard name is valid in # place of references (in which case no diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 251112776..5668cbc1d 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -14,7 +14,7 @@ ref_not_found_error, ) -# Mapping from numpy dtype endian format to what we expect +# Mapping from numpy dtype endian format that expected by netCDF4 _dtype_endian_lookup = { "=": "native", ">": "big", @@ -23,20 +23,46 @@ None: "native", } -# Set of netCDF attributes that may contain references to dimensions -# or variables -special_attributes = set(flattening_rules) +# Set of netCDF attributes that contain references to dimensions or +# variables +referencing_attributes = set(flattening_rules) def netcdf_flatten( input_ds, output_ds, - lax_mode=False, + strict=True, omit_data=False, write_chunksize=134217728, ): """Create a flattened version of a netCDF dataset. + **CF-netCDF coordinate variables** + + When a CF-netCDF coordinate variable in the input dataset is in a + different group to its corresponding dimension, the same variable + in the output flattened dataset will no longer be a CF-netCDF + coordinate variable, as its name will be prefixed with a different + group identifier than its dimension. + + In such cases, it is up to the user to apply the proximal and + lateral search alogrithms, in conjunction with the mappings + defined in the ``flattener_name_mapping_variables`` and + ``flattener_name_mapping_dimensions`` global attributes, to find + which netCDF variables are acting as CF coordinate variables in + the flattened dataset. See + https://cfconventions.org/cf-conventions/cf-conventions.html#groups + for details. + + For example, if an input dataset has dimension ``lat`` in the root + group and coordinate variable ``lat(lat)`` in group ``/group1``, + then the flattened dataset will contain dimension ``lat`` and + variable ``group1__lat(lat)``, both in its root group. In this + case, the ``flattener_name_mapping_variables`` global attribute of + the flattened dataset will contain the mapping ``'group1__lat: + /group1/lat'`` and the flattener_name_mapping_dimensions global + attribute will contain the mapping ``'lat: /lat'``. + .. versionadded:: (cfdm) 1.11.1.0 :Parameters: @@ -47,10 +73,10 @@ def netcdf_flatten( output_ds: `netCDF4.Dataset` A container for the flattened dataset. - lax_mode: `bool`, optional - If False, the default, the not resolving a reference - halts the execution. If True, then continue with a - warning. + strict: `bool`, optional + If True, the default, then failing to resolve a reference + raises an exception. If False, a warning is issued and + flattening is continued. 
omit_data: `bool`, optional If True then do not copy the data of any variables from @@ -75,7 +101,7 @@ def netcdf_flatten( _Flattener( input_ds, output_ds, - lax_mode, + strict, omit_data=omit_data, write_chunksize=write_chunksize, ).flatten() @@ -207,7 +233,7 @@ def __init__( self, input_ds, output_ds, - lax_mode, + strict, omit_data=False, write_chunksize=134217728, ): @@ -221,7 +247,7 @@ def __init__( output_ds: `netCDF4.Dataset` See `netcdf_flatten`. - lax_mode: `bool`, optional + strict: `bool`, optional See `netcdf_flatten`. omit_data: `bool`, optional @@ -240,8 +266,8 @@ def __init__( self._input_ds = input_ds self._output_ds = output_ds - self._lax_mode = lax_mode - self._omit_data = omit_data + self._strict = bool(strict) + self._omit_data = bool(omit_data) self._write_chunksize = write_chunksize if ( @@ -1317,7 +1343,7 @@ def resolve_references(self, var, old_var): """ var_attrs = self.attrs(var) - for name in special_attributes.intersection(var_attrs): + for name in referencing_attributes.intersection(var_attrs): # Parse attribute value parsed_attribute = parse_attribute(name, var_attrs[name]) @@ -1364,7 +1390,7 @@ def adapt_references(self, var): """ var_attrs = self.attrs(var) - for name in special_attributes.intersection(var_attrs): + for name in referencing_attributes.intersection(var_attrs): # Parse attribute value value = var_attrs[name] parsed_attribute = parse_attribute(name, value) @@ -1588,8 +1614,9 @@ def generate_flattened_name(self, input_group, orig_name): def handle_reference_error(self, ref, context=None): """Handle reference error. - Depending on lax/strict mode, either raise exception or log - warning. If lax, return reference placeholder. + Depending on the `_strict` mode, either raise an exception or + log a warning. If not strict then a reference placeholder is + returned. .. versionadded:: (cfdm) 1.11.1.0 @@ -1604,7 +1631,7 @@ def handle_reference_error(self, ref, context=None): :Returns: `str` - The error message, or if *lax_mode* is True then an + The error message, or if `_strict` is `True` then an `UnresolvedReferenceException` is raised. """ @@ -1612,12 +1639,12 @@ def handle_reference_error(self, ref, context=None): if context is not None: message = f"{message} from {context}" - if self._lax_mode: - warnings.warn(message) - return f"{ref_not_found_error}_{ref}" - else: + if self._strict: raise UnresolvedReferenceException(message) + warnings.warn(message) + return f"{ref_not_found_error}_{ref}" + class AttributeParsingException(Exception): """Exception for unparsable attribute. diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 9d6b8e17f..789f05382 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -598,7 +598,7 @@ def file_open(self, filename, flatten=True, verbose=None): flat_nc.set_fill_off() # Flatten the file - netcdf_flatten(nc, flat_nc, lax_mode=True, omit_data=True) + netcdf_flatten(nc, flat_nc, strict=False, omit_data=True) # Store the original grouped file. 
This is primarily # because the unlimited dimensions in the flattened From b00f4f52b440015b63f1c3dffe4d30ceec87bcb8 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 27 Feb 2024 16:57:17 +0000 Subject: [PATCH 38/88] netcdf_flatten tidy and docs --- cfdm/read_write/netcdf/flatten/flatten.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 5668cbc1d..5296fe62e 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -233,7 +233,7 @@ def __init__( self, input_ds, output_ds, - strict, + strict=True, omit_data=False, write_chunksize=134217728, ): From 0ae5bf673e46c5771bba7a4e2583a0a4d5a08343 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 28 Feb 2024 16:50:48 +0000 Subject: [PATCH 39/88] dev --- cfdm/read_write/netcdf/netcdfread.py | 48 +++++++++++++++++++++------- cfdm/read_write/read.py | 5 +-- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 789f05382..b4ecc7705 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -513,17 +513,7 @@ def file_open(self, filename, flatten=True, verbose=None): u = urlparse(filename) if u.scheme == "s3": # Create an openable S3 file object - storage_options = g["storage_options"] - g["file_system_storage_options"][filename] = storage_options - - client_kwargs = storage_options.get("client_kwargs", {}) - if ( - "endpoint_url" not in storage_options - and "endpoint_url" not in client_kwargs - ): - # Derive endpoint_url from filename - storage_options = storage_options.copy() - storage_options["endpoint_url"] = f"https://{u.netloc}" + storage_options = _get_storage_options(filename, u) key = tokenize(storage_options) file_systems = g["file_systems"] @@ -1100,7 +1090,12 @@ def read( g["version"][version] = Version(version) if storage_options is None: - g["storage_options"] = {"anon": True} + # g["storage_options"] = {"anon": True} + g["storage_options"] = { + "anon": True, + "default_fill_cache": False, + "default_cache_type": "first", + } if _file_systems is not None: # Update S3 file systems with those passed in as keyword @@ -10274,3 +10269,32 @@ def _file_variable_size(self, var): except AttributeError: # h5netcdf return prod(var.shape) + + def _get_storage_options(self, filename, parsed_filename): + """TODO. + + .. versionadded:: (cfdm) 1.11.1.0 + + """ + g = self.read_vars + storage_options = g["storage_options"] + g["file_system_storage_options"][filename] = storage_options + + storage_options = storage_options.copy() + + client_kwargs = storage_options.get("client_kwargs", {}) + if ( + "endpoint_url" not in storage_options + and "endpoint_url" not in client_kwargs + ): + storage_options[ + "endpoint_url" + ] = f"https://{parsed_filename.netloc}" + + if "default_fill_cache" not in storage_options: + storage_options["default_fill_cache"] = False + + if "default_cache_type" not in storage_options: + storage_options["default_cache_type"] = "first" + + return storage_options diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 54694821f..a4ec1aa5f 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -292,8 +292,9 @@ def read( object store, i.e. those whose names do not start with ``s3:``. - By default, or if `None`, then a value of ``{'anon': - True}`` is used. 
+ By default, or if `None`, then a value of ``{'anon': True, + 'default_fill_cache': False, 'default_cache_type': + 'first'}`` is used. If an ``'endpoint_url'`` key is not in *storage_options* then one will be automatically derived for accessing each From 23852a7f1676bbe3bccf79ec535bef2c9be29f12 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 4 Mar 2024 12:23:36 +0000 Subject: [PATCH 40/88] dev --- cfdm/read_write/netcdf/netcdfread.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index b4ecc7705..876fc8546 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -513,7 +513,7 @@ def file_open(self, filename, flatten=True, verbose=None): u = urlparse(filename) if u.scheme == "s3": # Create an openable S3 file object - storage_options = _get_storage_options(filename, u) + storage_options = self._get_storage_options(filename, u) key = tokenize(storage_options) file_systems = g["file_systems"] @@ -10291,10 +10291,4 @@ def _get_storage_options(self, filename, parsed_filename): "endpoint_url" ] = f"https://{parsed_filename.netloc}" - if "default_fill_cache" not in storage_options: - storage_options["default_fill_cache"] = False - - if "default_cache_type" not in storage_options: - storage_options["default_cache_type"] = "first" - return storage_options From d0bb0ce7d760bb25271b3d7bb1fd55034d2e56bc Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 4 Mar 2024 12:50:47 +0000 Subject: [PATCH 41/88] fix upstream merge conflicts --- Changelog.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Changelog.rst b/Changelog.rst index 74dd2e1c7..d1b5b7bb7 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -25,7 +25,6 @@ Version 1.11.1.0 ``constructs`` (https://github.com/NCAS-CMS/cfdm/issues/287) * New example field `11`: discrete sampling geometry trajectory features (https://github.com/NCAS-CMS/cfdm/issues/289) ->>>>>>> 2cfcf7f43f1a0940cd2da91439bc55599a8a02dc ---- From 123bd3701bb5bcceccd2aa8ce84c99ea22513b54 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 4 Mar 2024 15:20:26 +0000 Subject: [PATCH 42/88] dev --- cfdm/cfdmimplementation.py | 4 +- cfdm/core/functions.py | 2 +- cfdm/data/data.py | 5 +- cfdm/data/h5netcdfarray.py | 14 ++-- cfdm/data/mixin/filearraymixin.py | 8 +-- cfdm/data/mixin/netcdffilemixin.py | 12 ++-- cfdm/data/netcdf4array.py | 8 +-- cfdm/data/netcdfindexer.py | 43 +++++++------ cfdm/functions.py | 8 +-- cfdm/read_write/netcdf/flatten/config.py | 4 +- cfdm/read_write/netcdf/flatten/flatten.py | 78 +++++++++++------------ cfdm/read_write/netcdf/netcdfread.py | 56 +++++++--------- cfdm/read_write/read.py | 33 ++++++---- 13 files changed, 139 insertions(+), 136 deletions(-) diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index e60bbf0b4..2e3b45ad2 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -2300,7 +2300,7 @@ def initialise_NetCDF4Array(self, **kwargs): kwargs: optional Initialisation parameters to pass to the new instance. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Returns: @@ -2313,7 +2313,7 @@ def initialise_NetCDF4Array(self, **kwargs): def initialise_H5netcdfArray(self, **kwargs): """Return a `H5netcdfArray` instance. - .. versionadded:: (cfdm) 1.11.1.0 + .. 
versionadded:: (cfdm) NEXTVERSION :Parameters: diff --git a/cfdm/core/functions.py b/cfdm/core/functions.py index 4f5c29c46..48bdb9f8d 100644 --- a/cfdm/core/functions.py +++ b/cfdm/core/functions.py @@ -39,7 +39,7 @@ def environment(display=True, paths=True): Python: 3.11.4 packaging: 23.0 numpy: 1.25.2 - cfdm.core: 1.11.1.0 + cfdm.core: NEXTVERSION """ dependency_version_paths_mapping = { diff --git a/cfdm/data/data.py b/cfdm/data/data.py index 479e97d3c..411175c1a 100644 --- a/cfdm/data/data.py +++ b/cfdm/data/data.py @@ -2217,7 +2217,7 @@ def maximum(self, axes=None, squeeze=False): size one. With this option, the result will broadcast correctly against the original data. - .. versionaded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Returns: @@ -2443,7 +2443,7 @@ def sum(self, axes=None, squeeze=False): size one. With this option, the result will broadcast correctly against the original data. - .. versionaded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Returns: @@ -2487,6 +2487,7 @@ def sum(self, axes=None, squeeze=False): axes = self._parse_axes(axes) except ValueError as error: raise ValueError(f"Can't sum data: {error}") + array = self.array array = np.sum(array, axis=axes, keepdims=not squeeze) diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index 8d3645def..0ba5f5da7 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -17,7 +17,7 @@ class H5netcdfArray(NetCDFFileMixin, FileArrayMixin, abstract.Array): """A netCDF array accessed with `h5netcdf`. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION """ @@ -186,7 +186,7 @@ def __getitem__(self, indices): x.__getitem__(indices) <==> x[indices] - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION """ dataset, address = self.open() @@ -227,7 +227,7 @@ def _set_attributes(self, var): they have not already been defined, either during {{class}} instantiation or by a previous call to `_set_attributes`. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -250,7 +250,7 @@ def _set_attributes(self, var): def close(self, dataset): """Close the dataset containing the data. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -268,7 +268,7 @@ def close(self, dataset): def get_groups(self, address): """The netCDF4 group structure of a netCDF variable. - .. versionadded:: (cfdm) 1.8.6.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -276,8 +276,6 @@ def get_groups(self, address): The netCDF variable name, or integer varid, from which to get the groups. - .. versionadded:: (cfdm) 1.10.1.0 - :Returns: (`list`, `str`) or (`list`, `int`) @@ -316,6 +314,8 @@ def open(self, **kwargs): open each one, in the order stored, and a file object is returned from the first file that exists. + .. versionadded:: (cfdm) NEXTVERSION + :Returns: (`h5netcdf.File`, `str`) diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index 3ea9c83e8..a237a4d23 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -114,7 +114,7 @@ def get_addresses(self): def get_attributes(self, default=ValueError()): """The attributes of the array. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -228,12 +228,12 @@ def get_formats(self): def get_missing_values(self): """The missing values of the data. - Deprecated at version 1.11.1.0. Use `get_attributes` instead. 
+ Deprecated at version NEXTVERSION. Use `get_attributes` instead. """ raise DeprecationError( f"{self.__class__.__name__}.get_missing_values was deprecated " - "at version 1.11.1.0 and is no longer available. " + "at version NEXTVERSION and is no longer available. " "Use {self.__class__.__name__}.get_attributes instead." ) # pragma: no cover @@ -242,7 +242,7 @@ def get_storage_options( ): """Return `s3fs.S3FileSystem` options for accessing S3 files. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py index 376acf809..7f4c6c25d 100644 --- a/cfdm/data/mixin/netcdffilemixin.py +++ b/cfdm/data/mixin/netcdffilemixin.py @@ -10,14 +10,14 @@ class DeprecationError(Exception): class NetCDFFileMixin: """Mixin class for netCDF file arrays. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION """ def _group(self, dataset, groups): """Retrun the group object containing a variable. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -49,7 +49,7 @@ def _set_attributes(self, var): they have not already been defined, either during {{class}} instantiation or by a previous call to `_set_attributes`. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -179,7 +179,7 @@ def get_mask(self): def get_missing_values(self, default=ValueError()): """The missing value indicators from the netCDF variable. - Deprecated at version 1.11.1.0. Use `get_attributes` instead. + Deprecated at version NEXTVERSION. Use `get_attributes` instead. .. versionadded:: (cfdm) 1.10.0.3 @@ -220,14 +220,14 @@ def get_missing_values(self, default=ValueError()): """ raise DeprecationError( f"{self.__class__.__name__}.get_missing_values was deprecated " - "at version 1.11.1.0 and is no longer available. " + "at version NEXTVERSION and is no longer available. " "Use {self.__class__.__name__}.get_attributes instead." ) def get_unpack(self): """Whether or not to automatically unpack the data. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION **Examples** diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index 72dffbf79..09131480c 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -61,7 +61,7 @@ def __init__( {{init unpack: `bool`, optional}} - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION units: `str` or `None`, optional The units of the netCDF variable. Set to `None` to @@ -80,7 +80,7 @@ def __init__( {{init storage_options: `dict` or `None`, optional}} - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION {{init source: optional}} @@ -90,7 +90,7 @@ def __init__( .. versionadded:: (cfdm) 1.10.0.0 - missing_values: Deprecated at version 1.11.1.0 + missing_values: Deprecated at version NEXTVERSION The missing value indicators defined by the netCDF variable attributes. The may now be recorded via the *attributes* parameter @@ -275,7 +275,7 @@ def _set_attributes(self, var): they have not already been defined, either during {{class}} instantiation or by a previous call to `_set_attributes`. - .. versionadded:: (cfdm) 1.11.1.0 + .. 
versionadded:: (cfdm) NEXTVERSION
 
         :Parameters:
 
diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py
index fced5e0b2..76d4f44b0 100644
--- a/cfdm/data/netcdfindexer.py
+++ b/cfdm/data/netcdfindexer.py
@@ -41,18 +41,19 @@ class NetCDFIndexer:
     to unicode arrays, the latter with the last dimension
     concatenated.
 
-    Masking and unpacking operations are defined by netCDF attributes,
-    which are either provided as part of the input *data* object, or
-    given with the input *attributes* parameter.
+    Masking and unpacking operations are defined by the conventions
+    for netCDF attributes, which are either provided as part of the
+    input *data* object, or given with the input *attributes*
+    parameter.
 
-    The relevant netCDF attributes that may be used are:
+    The relevant netCDF attributes that are considered are:
 
     * For masking: ``missing_value``, ``valid_max``, ``valid_min``,
       ``valid_range``, ``_FillValue``, ``_Unsigned``
 
     * For unpacking: ``add_offset``, ``scale_factor``, ``_Unsigned``
 
-    .. versionadded:: (cfdm) 1.11.1.0
+    .. versionadded:: (cfdm) NEXTVERSION
 
     **Examples**
 
@@ -109,21 +110,23 @@ def __init__(
         variable: `netCDF4.Variable` or `h5netcdf.Variable` or `numpy.ndarray`
             The variable to be indexed. Any masking and unpacking
             that could be applied by the *variable*
-            itself is disabled, i.e. Any masking and unpacking is
+            itself is disabled, i.e. any masking and unpacking is
             always done by the `NetCDFIndexer` instance.
 
         mask: `bool`
             If True, the default, then an array returned by
-            indexing is automatically masked. Masking is governed
-            by the ``missing_value``, ``valid_max``,
-            ``valid_min``, ``valid_range``, ``_FillValue``, and
-            ``_Unsigned`` attributes.
+            indexing is automatically masked. Masking is
+            determined by the netCDF conventions for the following
+            attributes: ``_FillValue``, ``missing_value``,
+            ``_Unsigned``, ``valid_max``, ``valid_min``, and
+            ``valid_range``.
 
         unpack: `bool`
             If True, the default, then an array returned by
             indexing is automatically unpacked. Unpacking is
-            governed by the ``_Unsigned``, ``add_offset``, and
-            ``scale_factor`` attributes.
+            determined by the netCDF conventions for the following
+            attributes: ``add_offset``, ``scale_factor``, and
+            ``_Unsigned``.
 
         always_mask: `bool`
             If False, the default, then an array returned by
@@ -137,7 +140,7 @@ def __init__(
             dictionary key/value pairs. If *attributes* is set then
             any netCDF attributes stored by *variable* itself
             are ignored. Only the attributes relevant to masking
-            and unpacking are considers, and all other attributes
+            and unpacking are considered, and all other attributes
             are ignored.
 
         """
@@ -154,7 +157,7 @@ def __getitem__(self, index):
 
         Indexing follows the rules defined by the variable.
 
-        .. versionadded:: (cfdm) 1.11.1.0
+        .. versionadded:: (cfdm) NEXTVERSION
 
         """
         variable = self.variable
@@ -224,7 +227,7 @@ def __getitem__(self, index):
     def shape(self):
         """Tuple of the data dimension sizes.
 
-        .. versionadded:: (cfdm) 1.11.1.0
+        .. versionadded:: (cfdm) NEXTVERSION
 
         """
         return self.variable.shape
@@ -235,7 +238,7 @@ def _check_safecast(self, attname, dtype, attributes):
 
         Checks to see that variable attribute exists and can be safely
         cast to variable data type.
 
-        .. versionadded:: (cfdm) 1.11.1.0
+        .. versionadded:: (cfdm) NEXTVERSION
 
         :Parameter:
 
@@ -280,7 +283,7 @@ def _check_safecast(self, attname, dtype, attributes):
     def _default_FillValue(self, dtype):
         """Return the default ``_FillValue`` for the given data type.
 
-        .. versionadded:: (cfdm) 1.11.1.0
+        .. versionadded:: (cfdm) NEXTVERSION
 
         .. 
seealso:: `netCDF4.default_fillvals` @@ -302,7 +305,7 @@ def _default_FillValue(self, dtype): def _mask(self, data, dtype, attributes, dtype_unsigned_int): """Mask the data. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameter: @@ -480,7 +483,7 @@ def _mask(self, data, dtype, attributes, dtype_unsigned_int): def _unpack(self, data, attributes): """Unpack the data.. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameter: @@ -544,7 +547,7 @@ def _unpack(self, data, attributes): def attributes(self): """Return the netCDF attributes of the variable. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Returns: diff --git a/cfdm/functions.py b/cfdm/functions.py index 4d2880f3b..76767defd 100644 --- a/cfdm/functions.py +++ b/cfdm/functions.py @@ -325,7 +325,7 @@ def environment(display=True, paths=True): Python: 3.11.4 packaging: 23.0 numpy: 1.25.2 - cfdm.core: 1.11.1.0 + cfdm.core: NEXTVERSION HDF5 library: 1.14.2 netcdf library: 4.9.2 netCDF4: 1.6.4 @@ -334,14 +334,14 @@ def environment(display=True, paths=True): s3fs: 2023.12.2 scipy: 1.11.3 cftime: 1.6.2 - cfdm: 1.11.1.0 + cfdm: NEXTVERSION >>> cfdm.environment() Platform: Linux-5.15.0-92-generic-x86_64-with-glibc2.35 Python: 3.11.4 /home/miniconda3/bin/python packaging: 23.0 /home/miniconda3/lib/python3.11/site-packages/packaging/__init__.py numpy: 1.25.2 /home/miniconda3/lib/python3.11/site-packages/numpy/__init__.py - cfdm.core: 1.11.1.0 /home/cfdm/cfdm/core/__init__.py + cfdm.core: NEXTVERSION /home/cfdm/cfdm/core/__init__.py HDF5 library: 1.14.2 netcdf library: 4.9.2 netCDF4: 1.6.4 /home/miniconda3/lib/python3.11/site-packages/netCDF4/__init__.py @@ -350,7 +350,7 @@ def environment(display=True, paths=True): s3fs: 2023.12.2 /home/miniconda3/lib/python3.11/site-packages/s3fs/__init__.py scipy: 1.11.3 /home/miniconda3/lib/python3.11/site-packages/scipy/__init__.py cftime: 1.6.2 /home/miniconda3/lib/python3.11/site-packages/cftime/__init__.py - cfdm: 1.11.1.0 /home/miniconda3/lib/python3.11/site-packages/cfdm/__init__.py + cfdm: NEXTVERSION /home/miniconda3/lib/python3.11/site-packages/cfdm/__init__.py """ out = core.environment(display=False, paths=paths) # get all core env diff --git a/cfdm/read_write/netcdf/flatten/config.py b/cfdm/read_write/netcdf/flatten/config.py index 72329b2c8..05895ce04 100644 --- a/cfdm/read_write/netcdf/flatten/config.py +++ b/cfdm/read_write/netcdf/flatten/config.py @@ -1,6 +1,6 @@ """Configuration for netCDF group flattening. -.. versionadded:: (cfdm) 1.11.1.0 +.. versionadded:: (cfdm) NEXTVERSION """ from dataclasses import dataclass @@ -40,7 +40,7 @@ class FlatteningRules: ``ancillary_variables`` attribute contains the names of other netCDF variables, separated by spaces. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION """ diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 5296fe62e..68cebb8c4 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -63,7 +63,7 @@ def netcdf_flatten( /group1/lat'`` and the flattener_name_mapping_dimensions global attribute will contain the mapping ``'lat: /lat'``. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -115,7 +115,7 @@ def parse_attribute(name, attribute): * 'area: time volume: lat lon' -> {'area': ['time'], 'volume': ['lat', 'lon']} - .. versionadded:: (cfdm) 1.11.1.0 + .. 
versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -194,7 +194,7 @@ def subst(s): def generate_var_attr_str(d): """Re-generate the attribute string from a dictionary. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -225,7 +225,7 @@ class _Flattener: Contains the input file, the output file being flattened, and all the logic of the flattening process. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION """ @@ -283,7 +283,7 @@ def __init__( def attrs(self, variable): """Return the variable attributes. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -309,7 +309,7 @@ def attrs(self, variable): def chunksizes(self, variable): """Return the variable chunk sizes. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -345,7 +345,7 @@ def chunksizes(self, variable): def contiguous(self, variable): """Whether or not the variable data is contiguous on disk. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -374,7 +374,7 @@ def contiguous(self, variable): def dtype(self, variable): """Return the data type of a variable. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -404,7 +404,7 @@ def dtype(self, variable): def endian(self, variable): """Return the endian-ness of a variable. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -434,7 +434,7 @@ def endian(self, variable): def filepath(self, dataset): """Return the file path for the dataset. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -463,7 +463,7 @@ def filepath(self, dataset): def get_dims(self, variable): """Return. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Returns: @@ -497,7 +497,7 @@ def get_dims(self, variable): def getncattr(self, x, attr): """Retrieve a netCDF attribute. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -518,7 +518,7 @@ def getncattr(self, x, attr): def group(self, x): """Return a. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Returns: @@ -535,7 +535,7 @@ def group(self, x): def name(self, x): """Return the netCDF name, without its groups. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Returns: @@ -552,7 +552,7 @@ def name(self, x): def ncattrs(self, x): """Return netCDF attribute names. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -573,7 +573,7 @@ def ncattrs(self, x): def parent(self, group): """Return a simulated unix directory path to a group. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Returns: @@ -588,7 +588,7 @@ def parent(self, group): def path(self, group): """Return a simulated unix directory path to a group. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Returns: @@ -608,7 +608,7 @@ def path(self, group): def flatten(self): """Flattens and writes to output file. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Return: @@ -639,7 +639,7 @@ def flatten(self): def process_group(self, input_group): """Flattens a given group to the output file. - .. versionadded:: (cfdm) 1.11.1.0 + .. 
versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -668,7 +668,7 @@ def process_group(self, input_group): def flatten_attribute(self, input_group, attr_name): """Flattens a given attribute from a group to the output file. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -704,7 +704,7 @@ def flatten_attribute(self, input_group, attr_name): def flatten_dimension(self, dim): """Flattens a given dimension to the output file. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -746,7 +746,7 @@ def flatten_dimension(self, dim): def flatten_variable(self, var): """Flattens a given variable to the output file. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -835,7 +835,7 @@ def increment_pos(self, pos, dim, copy_slice_shape, var_shape): dimension is reached, recursively increment the next dimensions until a valid position is found. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -886,7 +886,7 @@ def increment_pos(self, pos, dim, copy_slice_shape, var_shape): def write_data_in_chunks(self, old_var, new_var): """Copy the data of a variable to a new one by slice. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -937,7 +937,7 @@ def resolve_reference(self, orig_ref, orig_var, rules): Resolves the absolute path to a coordinate variable within the group structure. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -1025,7 +1025,7 @@ def resolve_reference_proximity( ): """Resolve reference: search by proximity. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -1101,7 +1101,7 @@ def resolve_reference_post_processing( ): """Post-processing operations after resolving reference. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -1179,7 +1179,7 @@ def search_by_relative_path(self, ref, current_group, search_dim): Resolves the absolute path to a reference within the group structure, using search by relative path. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -1241,7 +1241,7 @@ def search_by_proximity( group is reached. If coordinate variable, search until local apex is reached, Then search down in siblings. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -1326,7 +1326,7 @@ def resolve_references(self, var, old_var): In a given variable, replace all references to other variables in its attributes by absolute references. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -1376,7 +1376,7 @@ def adapt_references(self, var): netCDF. All references have to be already resolved as absolute references. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -1424,7 +1424,7 @@ def adapt_name(self, resolved_ref, rules): Return name of flattened reference. If not found, raise exception or continue warning. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION resolved_ref: `str` The resolved reference. @@ -1480,7 +1480,7 @@ def adapt_name(self, resolved_ref, rules): def pathname(self, group, name): """Compose full path name to an element in a group structure: - .. versionadded:: (cfdm) 1.11.1.0 + .. 
versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -1507,7 +1507,7 @@ def generate_mapping_str(self, input_group, name, new_name): Generates a string representing the name mapping of an element before and after flattening. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -1534,7 +1534,7 @@ def generate_mapping_str(self, input_group, name, new_name): def convert_path_to_valid_name(self, pathname): """Generate valid name from path. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -1566,7 +1566,7 @@ def generate_flattened_name(self, input_group, orig_name): * if name is still too long, replace complete name by hash. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -1618,7 +1618,7 @@ def handle_reference_error(self, ref, context=None): log a warning. If not strict then a reference placeholder is returned. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -1649,7 +1649,7 @@ def handle_reference_error(self, ref, context=None): class AttributeParsingException(Exception): """Exception for unparsable attribute. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION """ @@ -1659,7 +1659,7 @@ class AttributeParsingException(Exception): class UnresolvedReferenceException(Exception): """Exception for unresolvable references in attributes. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION """ diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 876fc8546..e0404bb8b 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -511,10 +511,10 @@ def file_open(self, filename, flatten=True, verbose=None): # Deal with an file in an S3 object store u = urlparse(filename) + storage_options = self._get_storage_options(filename, u) + if u.scheme == "s3": # Create an openable S3 file object - storage_options = self._get_storage_options(filename, u) - key = tokenize(storage_options) file_systems = g["file_systems"] fs = file_systems.get(key) @@ -612,7 +612,7 @@ def file_open(self, filename, flatten=True, verbose=None): def _open_netCDF4(self, filename): """Return an open `netCDF4.Dataset`. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -629,7 +629,7 @@ def _open_netCDF4(self, filename): def _open_h5netcdf(self, filename): """Return an open `h5netcdf.File`. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -937,7 +937,7 @@ def read( unpack: `bool`, optional See `cfdm.read` for details - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION warn_valid: `bool`, optional See `cfdm.read` for details @@ -952,17 +952,17 @@ def read( storage_options: `bool`, optional See `cfdm.read` for details - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION netCDF_backend: `None` or `str`, optional See `cfdm.read` for details - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION _file_systems: `dict`, optional Provide any already-open S3 file systems. - .. versionadded:: (cfdm) 1.11.1.0 + .. 
versionadded:: (cfdm) NEXTVERSION :Returns: @@ -1063,12 +1063,10 @@ def read( # -------------------------------------------------------- # CFA # -------------------------------------------------------- - # "cfa": False, # -------------------------------------------------------- # NetCDF backend # -------------------------------------------------------- - # "netCDF_backend": netCDF_backend, # -------------------------------------------------------- # S3 @@ -1090,12 +1088,7 @@ def read( g["version"][version] = Version(version) if storage_options is None: - # g["storage_options"] = {"anon": True} - g["storage_options"] = { - "anon": True, - "default_fill_cache": False, - "default_cache_type": "first", - } + g["storage_options"] = {} if _file_systems is not None: # Update S3 file systems with those passed in as keyword @@ -10042,7 +10035,7 @@ def _ugrid_check_connectivity_variable( def _file_global_attribute(self, nc, attr): """Return a global attribute from a dataset. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -10067,7 +10060,7 @@ def _file_global_attribute(self, nc, attr): def _file_global_attributes(self, nc): """Return the global attributes from a dataset. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -10091,7 +10084,7 @@ def _file_global_attributes(self, nc): def _file_dimensions(self, nc): """Return all dimensions in the root group. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Returns: @@ -10104,7 +10097,7 @@ def _file_dimensions(self, nc): def _file_dimension(self, nc, dim_name): """Return a dimension from the root group of a dataset. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -10125,7 +10118,7 @@ def _file_dimension(self, nc, dim_name): def _file_dimension_isunlimited(self, nc, dim_name): """Return a whether a dimension is unlimited. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -10146,7 +10139,7 @@ def _file_dimension_isunlimited(self, nc, dim_name): def _file_dimension_size(self, nc, dim_name): """Return a dimension is size. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -10167,7 +10160,7 @@ def _file_dimension_size(self, nc, dim_name): def _file_variables(self, nc): """Return all variables in the root group. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -10185,7 +10178,7 @@ def _file_variables(self, nc): def _file_variable(self, nc, var_name): """Return a variable. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -10206,7 +10199,7 @@ def _file_variable(self, nc, var_name): def _file_variable_attributes(self, var): """Return the variable attributes. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -10230,7 +10223,7 @@ def _file_variable_attributes(self, var): def _file_variable_dimensions(self, var): """Return the variable dimension names. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -10248,7 +10241,7 @@ def _file_variable_dimensions(self, var): def _file_variable_size(self, var): """Return the size of a variable's array. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -10273,14 +10266,11 @@ def _file_variable_size(self, var): def _get_storage_options(self, filename, parsed_filename): """TODO. - .. 
versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION """ g = self.read_vars - storage_options = g["storage_options"] - g["file_system_storage_options"][filename] = storage_options - - storage_options = storage_options.copy() + storage_options = g["storage_options"].copy() client_kwargs = storage_options.get("client_kwargs", {}) if ( @@ -10291,4 +10281,6 @@ def _get_storage_options(self, filename, parsed_filename): "endpoint_url" ] = f"https://{parsed_filename.netloc}" + g["file_system_storage_options"].setdefault(filename, storage_options) + return storage_options diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index a4ec1aa5f..57be3db78 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -251,7 +251,7 @@ def read( A netCDF array is unpacked depending on the values of the netCDF attributes ``add_offset`` and ``scale_factor``. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION domain: `bool`, optional If True then return only the domain constructs that are @@ -283,7 +283,7 @@ def read( or ``'h5netcdf'`` will force the use of the `netCDF4` or `h5netcdf` libraries respectively. - .. versionadded:: (cfdm) 1.11.1.0 + .. versionadded:: (cfdm) NEXTVERSION storage_options: `dict` or `None`, optional Key/value pairs to be passed on to the creation of @@ -292,20 +292,22 @@ def read( object store, i.e. those whose names do not start with ``s3:``. - By default, or if `None`, then a value of ``{'anon': True, - 'default_fill_cache': False, 'default_cache_type': - 'first'}`` is used. + By default, or if `None` or ``{}``, then no options are + passed. - If an ``'endpoint_url'`` key is not in *storage_options* - then one will be automatically derived for accessing each - S3 file. For example, for a file name of - ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key - with value ``'https://store'`` would be created. + If the ``'endpoint_url'`` key is not in *storage_options* + or is not in a dictionary defined by the + ``'client_kwargs`` key (which is always the case when + *storage_options* is `None`), then one will be + automatically inserted for accessing each S3 file. For + example, for a file name of ``'s3://store/data/file.nc'``, + an ``'endpoint_url'`` key with value ``'https://store'`` + would be created. *Parameter example:* For a file name of ``'s3://store/data/file.nc'``, the - following are equivalent: ``None``, ``{'anon': True}``, - and ``{'anon': True, 'endpoint_url': 'https://store'}``. + following are equivalent: ``None``, ``{}``, and + ``{'endpoint_url': 'https://store'}``. *Parameter example:* ``{'key: 'scaleway-api-key...', 'secret': @@ -313,7 +315,12 @@ def read( 'https://s3.fr-par.scw.cloud', 'client_kwargs': {'region_name': 'fr-par'}}`` - .. versionadded:: (cfdm) 1.11.1.0 + *Parameter example:* + The following are equivalent: ``{'endpoint_url': + 'https://store'}`` ``{'client_kwargs': {'endpoint_url': + 'https://store'}}`` + + .. 
versionadded:: (cfdm) NEXTVERSION _implementation: (subclass of) `CFDMImplementation`, optional Define the CF data model implementation that provides the From 1c0eb30e34c8b0b7c82181b499e7083f09b824d9 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 4 Mar 2024 15:22:13 +0000 Subject: [PATCH 43/88] dev --- cfdm/read_write/read.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 57be3db78..896cc0723 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -304,6 +304,8 @@ def read( an ``'endpoint_url'`` key with value ``'https://store'`` would be created. + + *Parameter example:* For a file name of ``'s3://store/data/file.nc'``, the following are equivalent: ``None``, ``{}``, and From 0255859bd5a2abb35dcf454195c79a797ac1d03d Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 4 Mar 2024 15:22:23 +0000 Subject: [PATCH 44/88] dev --- cfdm/read_write/read.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 896cc0723..57be3db78 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -304,8 +304,6 @@ def read( an ``'endpoint_url'`` key with value ``'https://store'`` would be created. - - *Parameter example:* For a file name of ``'s3://store/data/file.nc'``, the following are equivalent: ``None``, ``{}``, and From dc00a051eab3748cb15ecbf226550471e55a75df Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 4 Mar 2024 22:24:01 +0000 Subject: [PATCH 45/88] dev --- cfdm/data/netcdfarray.py | 6 +++--- cfdm/read_write/read.py | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cfdm/data/netcdfarray.py b/cfdm/data/netcdfarray.py index 5ab97ba8e..ec5b9d4b0 100644 --- a/cfdm/data/netcdfarray.py +++ b/cfdm/data/netcdfarray.py @@ -7,7 +7,7 @@ class DeprecationError(Exception): class NetCDFArray: """A netCDF array accessed with `netCDF4`. - Deprecated at version 1.11.1.0 and is no longer available. Use + Deprecated at version NEXTVERSION and is no longer available. Use `cfdm.NetCDF4Array` instead. .. versionadded:: (cfdm) 1.7.0 @@ -84,7 +84,7 @@ def __init__( .. versionadded:: (cfdm) 1.10.0.0 - missing_values: Deprecated at version 1.11.1.0 + missing_values: Deprecated at version NEXTVERSION The missing value indicators defined by the netCDF variable attributes. The may now be recorded via the *attributes* parameter @@ -100,6 +100,6 @@ def __init__( """ raise DeprecationError( - f"{self.__class__.__name__} was deprecated at version 1.11.1.0 " + f"{self.__class__.__name__} was deprecated at version NEXTVERSION " "and is no longer available. Use cfdm.NetCDF4Array instead." ) diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 57be3db78..7eea07d03 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -245,11 +245,12 @@ def read( .. versionadded:: (cfdm) 1.8.2 unpack: `bool` - If True (the default) then unpack by convention when - reading data from disk. + If True (the default) then unpack arrays by convention + when the data is read from disk. - A netCDF array is unpacked depending on the values of the - netCDF attributes ``add_offset`` and ``scale_factor``. + Unpacking is determined netCDF conventions for the + following attributes: ``add_offset``, ``scale_factor``, + and ``_Unsigned``. .. 
versionadded:: (cfdm) NEXTVERSION From 00a15e93660bca363b34c9bf36b0cb8b2e1f58a8 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 7 Mar 2024 08:42:41 +0000 Subject: [PATCH 46/88] dev --- cfdm/core/__init__.py | 2 +- cfdm/docstring/docstring.py | 35 +++++++++++++++++++++-------------- cfdm/read_write/read.py | 14 +++++--------- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/cfdm/core/__init__.py b/cfdm/core/__init__.py index 6ed22a3a0..5e01c0136 100644 --- a/cfdm/core/__init__.py +++ b/cfdm/core/__init__.py @@ -13,7 +13,7 @@ __date__ = "2024-??-??" __cf_version__ = "1.11" -__version__ = "1.11.1.0" +__version__ = "1.11.2.0" from packaging import __version__ as _packaging_ver from packaging import __file__ as _packaging_file diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index 46728c3e8..37c130c4d 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -421,23 +421,30 @@ ``scale_factor``.""", # init storage_options "{{init storage_options: `dict` or `None`, optional}}": """storage_options: `dict` or `None`, optional - Key/value pairs to be passed on to the creation of an - `s3fs.S3FileSystem` file system to control the opening - of the file in an S3 object store. Ignored for a file - not in an S3 object store, i.e. one whose name does - not start with ``s3:``. - - If an ``'endpoint_url'`` key is not in - *storage_options* then one will be automatically - derived for accessing an S3 file. For example, for a - file name of ``'s3://store/data/file.nc'``, an - ``'endpoint_url'`` key with value ``'https://store'`` - would be created. + Key/value pairs to be passed on to the creation of + `s3fs.S3FileSystem` file systems to control the + opening of files in S3 object stores. Ignored for + files not in an S3 object store, i.e. those whose + names do not start with ``s3:``. + + By default, or if `None`, then *storage_options* is + taken as ``{}``. + + If the ``'endpoint_url'`` key is not in + *storage_options* or is not in a dictionary defined by + the ``'client_kwargs`` key (which is always the case + when *storage_options* is `None`), then one will be + automatically inserted for accessing an S3 file. For + example, for a file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` + key with value ``'https://store'`` would be created. *Parameter example:* For a file name of ``'s3://store/data/file.nc'``, - the following are equivalent: ``None``, ``{}`` and - ``{'endpoint_url': 'https://store'}``. + the following are equivalent: ``None``, ``{}``, and + ``{'endpoint_url': 'https://store'}``, + ``{'client_kwargs': {'endpoint_url': + 'https://store'}}`` *Parameter example:* ``{'key: 'scaleway-api-key...', 'secret': diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 7eea07d03..30f0cd4ef 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -293,14 +293,14 @@ def read( object store, i.e. those whose names do not start with ``s3:``. - By default, or if `None` or ``{}``, then no options are - passed. + By default, or if `None`, then *storage_options* is taken + as ``{}``. If the ``'endpoint_url'`` key is not in *storage_options* or is not in a dictionary defined by the ``'client_kwargs`` key (which is always the case when *storage_options* is `None`), then one will be - automatically inserted for accessing each S3 file. For + automatically inserted for accessing an S3 file. 
For example, for a file name of ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key with value ``'https://store'`` would be created. @@ -308,7 +308,8 @@ def read( *Parameter example:* For a file name of ``'s3://store/data/file.nc'``, the following are equivalent: ``None``, ``{}``, and - ``{'endpoint_url': 'https://store'}``. + ``{'endpoint_url': 'https://store'}``, + ``{'client_kwargs': {'endpoint_url': 'https://store'}}`` *Parameter example:* ``{'key: 'scaleway-api-key...', 'secret': @@ -316,11 +317,6 @@ def read( 'https://s3.fr-par.scw.cloud', 'client_kwargs': {'region_name': 'fr-par'}}`` - *Parameter example:* - The following are equivalent: ``{'endpoint_url': - 'https://store'}`` ``{'client_kwargs': {'endpoint_url': - 'https://store'}}`` - .. versionadded:: (cfdm) NEXTVERSION _implementation: (subclass of) `CFDMImplementation`, optional From 17e67c3fd95a25a714924225d1e13e8c530b8a9c Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 7 Mar 2024 09:59:18 +0000 Subject: [PATCH 47/88] dev --- Changelog.rst | 1 + cfdm/data/netcdfindexer.py | 110 ++++++++++++++++++--------- cfdm/read_write/netcdf/netcdfread.py | 83 +++++++++++++------- docs/source/installation.rst | 2 + requirements.txt | 1 + 5 files changed, 134 insertions(+), 63 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index d1b5b7bb7..1b898dce8 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -12,6 +12,7 @@ Version NEXT * New dependency: ``h5netcdf>=1.3.0`` * New dependency: ``h5py>=3.10.0`` * New dependency: ``s3fs>=2024.2.0`` +* New dependency: ``dask>=2024.2.1`` * Removed dependency: ``netcdf_flattener`` ---- diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 76d4f44b0..8e70e7cf0 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -49,7 +49,10 @@ class NetCDFIndexer: The relevant netCDF attributes that are considered: * For masking: ``missing_value``, ``valid_max``, ``valid_min``, - ``valid_range``, ``_FillValue``, ``_Unsigned`` + ``valid_range``, ``_FillValue``, + ``_Unsigned``. Note that if ``_FillValue`` is not + present then the netCDF default value for the + appropriate data type will be assumed. * For unpacking: ``add_offset``, ``scale_factor``, ``_Unsigned`` @@ -107,11 +110,15 @@ def __init__( :Parameters: - variable: `netCDF4.Variable` or `h5netcdf.Variable` or `numpy.ndarray` - The variable to be indexed. Any masking and unpacking - that could be applied by applied by the *variable* - itself is disabled, i.e. any masking and unpacking is - always done by the `NetCDFIndexer` instance. + variable: + The variable to be indexed. May be any variable that + has the same API as one of `numpy.ndarray`, + `netCDF4.Variable` or `h5py.Variable` (which includes + `h5netcdf.Variable`). Any masking and unpacking that + could be applied by applied by *variable* itself + (e.g. by a `netCDF4.Variable` instance) is disabled, + ensuring that any masking and unpacking is always done + by the `NetCDFIndexer` instance. mask: `bool` If True, the default, then an array returned by @@ -124,7 +131,7 @@ def __init__( unpack: `bool` If True, the default, then an array returned by indexing is automatically unpacked. Unpacking is - determined netCDF conventions for the following + determined by the netCDF conventions for the following attributes: ``add_offset``, ``scale_factor``, and ``_Unsigned``. @@ -136,12 +143,12 @@ def __init__( no missing values. attributes: `dict`, optional - Provide the netCDF attributes of the *variable* as - dictionary key/value pairs. 
If *attributes* is set - then any netCDF attributes stored by *variable* itself - are ignored. Only the attributes relevant to masking - and unpacking are considered, and all other attributes - are ignored. + Provide the netCDF attributes for *variable* as + dictionary key/value pairs. If *attributes* is not + `None, then any netCDF attributes stored by *variable* + itself are ignored. Only the attributes relevant to + masking and unpacking are considered, with all other + attributes being ignored. """ self.variable = variable @@ -165,20 +172,29 @@ def __getitem__(self, index): attributes = self.attributes() dtype = variable.dtype + # Prevent a netCDF4 variable from doing its own masking and + # unpacking netCDF4_scale = False netCDF4_mask = False try: netCDF4_scale = variable.scale except AttributeError: + # Not a netCDF4 variable pass else: netCDF4_mask = variable.mask - # Prevent netCDF4 from doing any masking and unpacking variable.set_auto_maskandscale(False) # Index the variable data = variable[index] + # Reset a netCDF4 variable's scale and mask behaviour + if netCDF4_scale: + variable.set_auto_scale(True) + + if netCDF4_mask: + variable.set_auto_mask(True) + # Convert str, char, and object data to byte strings if isinstance(data, str): data = np.array(data, dtype="S") @@ -206,21 +222,17 @@ def __getitem__(self, index): data = data.view(dtype_unsigned_int) if self.mask: + # Mask the data data = self._mask(data, dtype, attributes, dtype_unsigned_int) if unpack: + # Unpack the data data = self._unpack(data, attributes) + # Make sure all strings are unicode if data.dtype.kind == "S": data = data.astype("U", copy=False) - # Reset a netCDF4 variables's scale and mask behaviour - if netCDF4_scale: - variable.set_auto_scale(True) - - if netCDF4_mask: - variable.set_auto_mask(True) - return data @property @@ -310,8 +322,7 @@ def _mask(self, data, dtype, attributes, dtype_unsigned_int): :Parameter: data: `numpy.ndarray` - The unmasked and unpacked data indexed from the - variable. + The unmasked and (possibly) packed data. dtype: `numpy.dtype` The data type of the variable (which may be different @@ -382,10 +393,15 @@ def _mask(self, data, dtype, attributes, dtype_unsigned_int): # -------------------------------------------------------- # Create mask from _FillValue # -------------------------------------------------------- + fval = np.array(_FillValue, dtype) if dtype_unsigned_int is not None: fval = fval.view(dtype_unsigned_int) + if fval.ndim == 1: + # _FillValue must be a scalar + fval = fval[0] + try: fvalisnan = np.isnan(fval) except Exception: @@ -450,6 +466,10 @@ def _mask(self, data, dtype, attributes, dtype_unsigned_int): # attribute best practices suggesting clients should do # this). if validmin is not None: + if validmin.ndim == 1: + # valid min must be a scalar + validmin = validmin[0] + mask = data < validmin if totalmask is None: totalmask = mask @@ -457,6 +477,10 @@ def _mask(self, data, dtype, attributes, dtype_unsigned_int): totalmask += mask if validmax is not None: + if validmax.ndim == 1: + # valid max must be a scalar + validmax = validmax[0] + mask = data > validmax if totalmask is None: totalmask = mask @@ -481,29 +505,38 @@ def _mask(self, data, dtype, attributes, dtype_unsigned_int): return data def _unpack(self, data, attributes): - """Unpack the data.. + """Unpack the data. + + If both the ``add_offset`` and ``scale_factor`` attributes + have not been set then no unpacking is done and the data is + returned unchanged. .. 
versionadded:: (cfdm) NEXTVERSION :Parameter: data: `numpy.ndarray` - The unmasked and unpacked data indexed from the - variable. + The masked and (possibly) packed data. attributes: `dict` The variable attributes. :Returns: - `nump.ndarray` + `numpy.ndarray` The unpacked data. """ scale_factor = attributes.get("scale_factor") add_offset = attributes.get("add_offset") + try: if scale_factor is not None: + scale_factor = np.array(scale_factor) + if scale_factor.ndim == 1: + # scale_factor must be a scalar + scale_factor = scale_factor[0] + float(scale_factor) except ValueError: logging.warn( @@ -514,6 +547,11 @@ def _unpack(self, data, attributes): try: if add_offset is not None: + add_offset = np.array(add_offset) + if add_offset.ndim == 1: + # add_offset must be a scalar + add_offset = add_offset[0] + float(add_offset) except ValueError: logging.warn( @@ -545,7 +583,7 @@ def _unpack(self, data, attributes): return data def attributes(self): - """Return the netCDF attributes of the variable. + """Return the netCDF attributes for the data. .. versionadded:: (cfdm) NEXTVERSION @@ -561,20 +599,24 @@ def attributes(self): 'missing_value': -999.0} """ - if self._attributes is not None: - return self._attributes.copy() + _attributes = self._attributes + if _attributes is not None: + return _attributes.copy() variable = self.variable try: - # h5netcdf - return dict(variable.attrs) + # h5py + attrs = dict(variable.attrs) except AttributeError: try: # netCDF4 - return { + attrs = { attr: variable.getncattr(attr) for attr in variable.ncattrs() } except AttributeError: # numpy - return {} + attrs = {} + + self._attributes = attrs + return attrs diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index e0404bb8b..65c203569 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -470,14 +470,14 @@ def file_close(self): g["nc_grouped"].close() # Close s3fs.File objects - for f in g["s3fs.File_objects"]: + for f in g["s3fs_File_objects"]: f.close() def file_open(self, filename, flatten=True, verbose=None): - """Open the netCDf file for reading. + """Open the netCDF file for reading. - If the file has hierarchical groups then a flattened version of it - is returned, and the original grouped file remains open. + If the file has hierarchical groups then a flattened version + of it is returned, and the original grouped file remains open. .. versionadded:: (cfdm) 1.7.0 @@ -515,20 +515,22 @@ def file_open(self, filename, flatten=True, verbose=None): if u.scheme == "s3": # Create an openable S3 file object - key = tokenize(storage_options) + fs_key = tokenize(("s3", storage_options)) file_systems = g["file_systems"] - fs = file_systems.get(key) - if fs is None: + file_system = file_systems.get(fs_key) + if file_system is None: # An S3 file system with these options does not exist, # so create one. 
- fs = S3FileSystem(**storage_options) - file_systems[key] = fs + file_system = S3FileSystem(**storage_options) + file_systems[fs_key] = file_system - filename = fs.open(u.path[1:], "rb") - g["s3fs.File_objects"].append(filename) + # Reset 'filename' to an s3fs.File object that can be + # passed to the netCDF backend + filename = file_system.open(u.path[1:], "rb") + g["s3fs_File_objects"].append(filename) if is_log_level_detail(logger): - logger.debug( + logger.detail( f" S3: s3fs.S3FileSystem options: {storage_options}\n" ) # pragma: no cover @@ -561,9 +563,7 @@ def file_open(self, filename, flatten=True, verbose=None): raise error else: - raise ValueError( - "Unknown netCDF backend: netCDF_backend={netCDF_backend!r}" - ) + raise ValueError("Unknown netCDF backend: {netCDF_backend!r}") g["original_h5netcdf"] = HDF g["original_netCDF4"] = netCDF @@ -974,6 +974,11 @@ def read( # Initialise netCDF read parameters # ------------------------------------------------------------ self.read_vars = { + # -------------------------------------------------------- + # Verbosity + # -------------------------------------------------------- + "debug": is_log_level_debug(logger), + # "new_dimension_sizes": {}, "formula_terms": {}, "compression": {}, @@ -1078,11 +1083,13 @@ def read( # "file_system_storage_options": {}, # - "s3fs.File_objects": [], + "s3fs_File_objects": [], } g = self.read_vars + debug = g["debug"] + # Set versions for version in ("1.6", "1.7", "1.8", "1.9", "1.10", "1.11"): g["version"][version] = Version(version) @@ -1149,7 +1156,7 @@ def read( # ------------------------------------------------------------ nc = self.file_open(filename, flatten=True, verbose=None) logger.info(f"Reading netCDF file: {filename}\n") # pragma: no cover - if is_log_level_debug(logger): + if debug: logger.debug( f" Input netCDF dataset:\n {nc}\n" ) # pragma: no cover @@ -1181,7 +1188,7 @@ def read( # pass g["global_attributes"] = global_attributes - if is_log_level_debug(logger): + if debug: logger.debug( f" Global attributes:\n {g['global_attributes']}" ) # pragma: no cover @@ -1448,7 +1455,7 @@ def read( for name, value in variable_dimensions.items() } - if is_log_level_debug(logger): + if debug: logger.debug( " General read variables:\n" " read_vars['variable_dimensions'] =\n" @@ -1564,7 +1571,7 @@ def read( # '/forecasts/model/t': 't'} g["dimension_basename"] = dimension_basename - if is_log_level_debug(logger): + if debug: logger.debug( " read_vars['dimension_isunlimited'] =\n" f" {g['dimension_isunlimited']}\n" @@ -1720,7 +1727,7 @@ def read( # node coordinate variable g["do_not_create_field"].add(geometry_ncvar) - if is_log_level_debug(logger): + if debug: logger.debug( " Compression read vars:\n" " read_vars['compression'] =\n" @@ -1770,7 +1777,7 @@ def read( # location_index_set self._ugrid_parse_location_index_set(attributes) - if is_log_level_debug(logger): + if debug: logger.debug(f" UGRID meshes:\n {g['mesh']}") if _scan_only: @@ -1854,7 +1861,7 @@ def read( }, ) - if is_log_level_debug(logger): + if debug: logger.debug( " Reference read vars:\n" " read_vars['references'] =\n" @@ -2458,12 +2465,13 @@ def _parse_indexed_contiguous_compression( """ g = self.read_vars + debug = g["debug"] profile_dimension = g["compression"][sample_dimension][ "ragged_contiguous" ]["profile_dimension"] - if is_log_level_debug(logger): + if debug: logger.debug( " Pre-processing indexed and contiguous compression " f"for instance dimension: {instance_dimension}\n" @@ -2506,7 +2514,7 @@ def 
_parse_indexed_contiguous_compression(

         del g["compression"][sample_dimension]["ragged_contiguous"]

-        if is_log_level_debug(logger):
+        if debug:
             logger.debug(
                 f"    Created read_vars['compression'][{sample_dimension!r}]"
                 "['ragged_indexed_contiguous']\n"
@@ -2958,7 +2966,7 @@ def _set_ragged_indexed_parameters(
             "element_dimension_size": element_dimension_size,
         }
 
-        if is_log_level_debug(logger):
+        if g["debug"]:
             logger.debug(
                 "    Created "
                 f"read_vars['compression'][{indexed_sample_dimension!r}]['ragged_indexed']"
@@ -3426,7 +3434,7 @@ def _create_field_or_domain(
 
         field_properties.update(g["variable_attributes"][field_ncvar])
 
-        if is_log_level_debug(logger):
+        if g["debug"]:
             logger.debug(
                 "    netCDF attributes:\n"
                 f"        {field_properties}"
@@ -7898,7 +7906,7 @@ def _check_ancillary_variables(self, field_ncvar, string, parsed_string):
 
             # Though an error of sorts, set as debug level message;
             # read not terminated
-            if is_log_level_debug(logger):
+            if g["debug"]:
                 logger.debug(
                     f"    Error processing netCDF variable {field_ncvar}: "
                     f"{d['reason']}"
@@ -10264,10 +10272,27 @@ def _file_variable_size(self, var):
         return prod(var.shape)
 
     def _get_storage_options(self, filename, parsed_filename):
-        """TODO.
+        """Get the storage options for accessing a file.
+
+        The returned storage options will always include an
+        ``'endpoint_url'`` key.
 
         .. versionadded:: (cfdm) NEXTVERSION
 
+        :Parameters:
+
+            filename: `str`
+                The name of the file.
+
+            parsed_filename: `urllib.parse.ParseResult`
+                The parsed file name, as returned by
+                ``urllib.parse.urlparse(filename)``.
+
+        :Returns:
+
+            `dict`
+                The storage options for accessing the file.
+
         """
         g = self.read_vars
         storage_options = g["storage_options"].copy()
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 0ea81b745..d0d8ec6f3 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -169,6 +169,8 @@ The cfdm package requires:
 * `s3fs `_, version 2024.2.0 or newer.
 
+* `dask `_, version 2024.2.1 or newer.
+
 * `packaging `_, version 20.0 or newer.
 
diff --git a/requirements.txt b/requirements.txt
index 7cfb76173..f4b2a5aa2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,5 @@ scipy>=1.10.0
 h5netcdf>=1.3.0
 h5py>=3.10.0
 s3fs>=2024.2.0
+dask>=2024.2.1
 
From 2899d96cbcad98eace85b51b0e0f141d0b3a853a Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Fri, 8 Mar 2024 10:18:57 +0000
Subject: [PATCH 48/88] dev

---
 cfdm/read_write/netcdf/flatten/flatten.py | 111 +++++++++++++---------
 cfdm/read_write/read.py                   |   4 +-
 2 files changed, 69 insertions(+), 46 deletions(-)

diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py
index 68cebb8c4..67a950024 100644
--- a/cfdm/read_write/netcdf/flatten/flatten.py
+++ b/cfdm/read_write/netcdf/flatten/flatten.py
@@ -45,12 +45,12 @@ def netcdf_flatten(
     coordinate variable, as its name will be prefixed with a different
     group identifier than its dimension.
 
-    In such cases, it is up to the user to apply the proximal and
-    lateral search alogrithms, in conjunction with the mappings
-    defined in the ``flattener_name_mapping_variables`` and
-    ``flattener_name_mapping_dimensions`` global attributes, to find
-    which netCDF variables are acting as CF coordinate variables in
-    the flattened dataset. 
See + In such cases it is up to the user to apply the proximal and + lateral search alogrithms to the flattened dataset returned by + `netcdf_flatten`, in conjunction with the mappings defined in the + newly created global attributes ``__flattener_variable_map`` and + ``__flattener_dimension_map``, to find which netCDF variables are + acting as CF coordinate variables in the flattened dataset. See https://cfconventions.org/cf-conventions/cf-conventions.html#groups for details. @@ -58,17 +58,18 @@ def netcdf_flatten( group and coordinate variable ``lat(lat)`` in group ``/group1``, then the flattened dataset will contain dimension ``lat`` and variable ``group1__lat(lat)``, both in its root group. In this - case, the ``flattener_name_mapping_variables`` global attribute of - the flattened dataset will contain the mapping ``'group1__lat: - /group1/lat'`` and the flattener_name_mapping_dimensions global + case, the ``__flattener_variable_map`` global attribute of the + flattened dataset will contain the mapping ``'group1__lat: + /group1/lat'``, and the ``__flattener_dimension_map`` global attribute will contain the mapping ``'lat: /lat'``. .. versionadded:: (cfdm) NEXTVERSION :Parameters: - input_ds: `netCDF4.Dataset` or `h5netcdf.File` - The dataset to be flattened. + input_ds: + The dataset to be flattened, that has the same API as + `netCDF4.Dataset` or `h5netcdf.File`. output_ds: `netCDF4.Dataset` A container for the flattened dataset. @@ -241,11 +242,12 @@ def __init__( :Parameters: - input_ds: `netCDF4.Dataset` or `h5netcdf.File` - See `netcdf_flatten`. + input_ds: + The dataset to be flattened, that has the same API as + `netCDF4.Dataset` or `h5netcdf.File`. output_ds: `netCDF4.Dataset` - See `netcdf_flatten`. + A container for the flattened dataset. strict: `bool`, optional See `netcdf_flatten`. @@ -287,8 +289,9 @@ def attrs(self, variable): :Parameters: - var: - The dataset variable. + variable: + The variable, that has the same API as + `netCDF4.Variable` or `h5netcdf.Variable`. :Returns: @@ -314,7 +317,8 @@ def chunksizes(self, variable): :Parameters: variable: - The dataset variable. + The variable, that has the same API as + `netCDF4.Variable` or `h5netcdf.Variable`. :Returns: @@ -349,8 +353,9 @@ def contiguous(self, variable): :Parameters: - variable: `netCDF4.Variable` or `h5netcdf.Variable` - The variable. + variable: + The variable, that has the same API as + `netCDF4.Variable` or `h5netcdf.Variable`. :Returns: @@ -379,7 +384,8 @@ def dtype(self, variable): :Parameters: variable: - The dataset variable. + The variable, that has the same API as + `netCDF4.Variable` or `h5netcdf.Variable`. :Returns: @@ -408,8 +414,9 @@ def endian(self, variable): :Parameters: - variable: `netCDF4.Variable` or `h5netcdf.Variable` - The variable. + variable: + The variable, that has the same API as + `netCDF4.Variable` or `h5netcdf.Variable`. :Returns: @@ -438,8 +445,9 @@ def filepath(self, dataset): :Parameters: - dataset: `netCDF4.Dataset` or `h5netcdf.File` - The dataset. + dataset: + The dataset, that has the same API as + `netCDF4.Dataset` or `h5netcdf.File`. :Returns: @@ -708,8 +716,9 @@ def flatten_dimension(self, dim): :Parameters: - dim: `netCDF4.Dimension` or `h5netcdf.Dimension` - The dimension to flatten. + dim: + The dimension to flatten, that has the same API as + `netCDF4.Dimension` or `h5netcdf.Dimension`. :Returns: @@ -750,8 +759,9 @@ def flatten_variable(self, var): :Parameters: - var: `netCDF4.Variable` or `h5netcdf.Variable` - The variable to flatten. 
+ var: + The variable, that has the same API as + `netCDF4.Variable` or `h5netcdf.Variable`. :Returns: @@ -890,11 +900,14 @@ def write_data_in_chunks(self, old_var, new_var): :Parameters: - old_var: `netCDF4.Variable` or `h5netcdf.Variable` - The variable where data should be copied from. + old_var: + The variable where the data should be copied from, + that has the same API as `netCDF4.Variable` or + `h5netcdf.Variable`. - new_var: `netCDF4.Variable` - The new variable where to copy data. + new_var: + The new variable in which copy the data, that has the + same API as `netCDF4.Variable` or `h5netcdf.Variable`. :Returns: @@ -944,8 +957,10 @@ def resolve_reference(self, orig_ref, orig_var, rules): orig_ref: `str` The reference to resolve. - orig_var: `netCDF4.Variable` or `h5netcdf.Variable` - The original variable containing the reference. + orig_var: + The original variable containing the reference, that + has the same API as `netCDF4.Variable` or + `h5netcdf.Variable`. rules: `FlatteningRules` The flattening rules that apply to the reference. @@ -1040,8 +1055,10 @@ def resolve_reference_proximity( Resolve as variable if resolving as dimension failed, and vice versa. - orig_var: `netCDF4.Variable` or `h5netcdf.Variable` - The original variable containing the reference. + orig_var: + The original variable containing the reference, that + has the same API as `netCDF4.Variable` or + `h5netcdf.Variable`. rules: `FlatteningRules` The flattening rules that apply to the reference. @@ -1111,8 +1128,10 @@ def resolve_reference_post_processing( orig_ref: `str` The original reference. - orig_var: `netCDF4.Variable` or `h5netcdf.Variable` - The original variable containing the reference. + orig_var: + The original variable containing the reference, that + has the same API as `netCDF4.Variable` or + `h5netcdf.Variable`. rules: `FlatteningRules` The flattening rules that apply to the reference. @@ -1330,12 +1349,15 @@ def resolve_references(self, var, old_var): :Parameters: - var: `netCDF4.Variable` or `h5netcdf.Variable` + var: The flattened variable in which references should be - renamed with absolute references. + renamed with absolute references, that has the same + API as `netCDF4.Variable` or `h5netcdf.Variable`. - old_var: `netCDF4.Variable` or `h5netcdf.Variable` - The original variable (in group structure). + old_var: + The original variable (in group structure), that has + the same API as `netCDF4.Variable` or + `h5netcdf.Variable`. :Returns: @@ -1380,9 +1402,10 @@ def adapt_references(self, var): :Parameters: - var: `netCDF4.Variable` or `h5netcdf.Variable` + var: The flattened variable in which references should be - renamed with new names. + renamed with new names, that has the same API as + `netCDF4.Variable` or `h5netcdf.Variable`. :Returns: diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 30f0cd4ef..8353ef8ff 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -26,8 +26,8 @@ def read( The following file formats are supported: netCDF and CDL. - NetCDF files may be on disk, on an OPeNDAP server, or in an S3 - object store. + NetCDF files may be on local disk, on an OPeNDAP server, or in an + S3 object store. The returned constructs are sorted by the netCDF variable names of their corresponding data or domain variables. 
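The mapping attributes described above make it possible to recover, from a
flattened dataset, the original group path of each flattened variable. The
following is an illustrative sketch only, not part of the patch series: the
file name "flat.nc" is invented, and it assumes that the
``__flattener_variable_map`` global attribute is stored as a list of
``'new_name: /original/path'`` strings, as the docstring example above
suggests (a single string attribute would need to be split first).

    import netCDF4

    nc = netCDF4.Dataset("flat.nc", "r")

    # Build a mapping from each flattened variable name back to its
    # original group path, e.g. {'group1__lat': '/group1/lat'}
    variable_map = {}
    for entry in nc.getncattr("__flattener_variable_map"):
        new_name, original_path = [s.strip() for s in entry.split(":", 1)]
        variable_map[new_name] = original_path

    print(variable_map)
    nc.close()

The same pattern would apply to ``__flattener_dimension_map`` for recovering
the original dimension paths.
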
From 7312b5306085b06e3606ee0e4972e07e92368019 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 15 Mar 2024 08:31:49 +0000 Subject: [PATCH 49/88] dev --- cfdm/__init__.py | 2 +- cfdm/data/__init__.py | 2 +- cfdm/data/h5netcdfarray.py | 4 +-- cfdm/data/netcdf4array.py | 4 +-- cfdm/data/netcdfindexer.py | 14 +++++----- ...etCDFIndexer.py => test_netcdf_indexer.py} | 26 +++++++++---------- 6 files changed, 26 insertions(+), 26 deletions(-) rename cfdm/test/{test_NetCDFIndexer.py => test_netcdf_indexer.py} (86%) diff --git a/cfdm/__init__.py b/cfdm/__init__.py index 84d863b25..e8859e239 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -186,7 +186,7 @@ H5netcdfArray, NetCDFArray, NetCDF4Array, - NetCDFIndexer, + netcdf_indexer, NumpyArray, PointTopologyArray, RaggedArray, diff --git a/cfdm/data/__init__.py b/cfdm/data/__init__.py index bcbece15c..22a783539 100644 --- a/cfdm/data/__init__.py +++ b/cfdm/data/__init__.py @@ -21,7 +21,7 @@ from .h5netcdfarray import H5netcdfArray from .netcdfarray import NetCDFArray from .netcdf4array import NetCDF4Array -from .netcdfindexer import NetCDFIndexer +from .netcdfindexer import netcdf_indexer from .numpyarray import NumpyArray from .pointtopologyarray import PointTopologyArray from .raggedcontiguousarray import RaggedContiguousArray diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index 0ba5f5da7..3f1831ea7 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -5,7 +5,7 @@ from . import abstract from .mixin import FileArrayMixin, NetCDFFileMixin -from .netcdfindexer import NetCDFIndexer +from .netcdfindexer import netcdf_indexer _safecast = netCDF4.utils._safecast default_fillvals = netCDF4.default_fillvals.copy() @@ -200,7 +200,7 @@ def __getitem__(self, indices): variable = dataset.variables[address] # Get the data, applying masking and scaling as required. - array = NetCDFIndexer( + array = netcdf_indexer( variable, mask=self.get_mask(), unpack=self.get_unpack(), diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index 09131480c..a1e403b67 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -2,7 +2,7 @@ from . import abstract from .mixin import FileArrayMixin, NetCDFFileMixin -from .netcdfindexer import NetCDFIndexer +from .netcdfindexer import netcdf_indexer class NetCDF4Array(NetCDFFileMixin, FileArrayMixin, abstract.Array): @@ -229,7 +229,7 @@ def __getitem__(self, indices): break # Get the data, applying masking and scaling as required. - array = NetCDFIndexer( + array = netcdf_indexer( variable, mask=self.get_mask(), unpack=self.get_unpack(), diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 8e70e7cf0..f6ea95b0a 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -30,7 +30,7 @@ logger = logging.getLogger(__name__) -class NetCDFIndexer: +class netcdf_indexer: """A data indexer that applies netCDF masking and unpacking. 
During indexing, masking and unpacking is applied according to the @@ -62,7 +62,7 @@ class NetCDFIndexer: >>> import netCDF4 >>> nc = netCDF4.Dataset('file.nc', 'r') - >>> x = cfdm.NetCDFIndexer(nc.variables['x']) + >>> x = cfdm.netcdf_indexer(nc.variables['x']) >>> x.shape (12, 64, 128) >>> print(x[0, 0:4, 0:3]) @@ -73,7 +73,7 @@ class NetCDFIndexer: >>> import h5netcdf >>> h5 = h5netcdf.File('file.nc', 'r') - >>> x = cfdm.NetCDFIndexer(h5.variables['x']) + >>> x = cfdm.netcdf_indexer(h5.variables['x']) >>> x.shape (12, 64, 128) >>> print(x[0, 0:4, 0:3]) @@ -84,15 +84,15 @@ class NetCDFIndexer: >>> import numpy as np >>> n = np.arange(7) - >>> x = cfdm.NetCDFIndexer(n) + >>> x = cfdm.netcdf_indexer(n) >>> x.shape (9,) >>> print(x[...]) [0 1 2 3 4 5 6] - >>> x = cfdm.NetCDFIndexer(n, attributes={'_FillValue': 4}) + >>> x = cfdm.netcdf_indexer(n, attributes={'_FillValue': 4}) >>> print(x[...]) [0 1 2 3 -- 5 6] - >>> x = cfdm.NetCDFIndexer(n, mask=False, attributes={'_FillValue': 4}) + >>> x = cfdm.netcdf_indexer(n, mask=False, attributes={'_FillValue': 4}) >>> print(x[...]) [0 1 2 3 4 5 6] @@ -118,7 +118,7 @@ def __init__( could be applied by applied by *variable* itself (e.g. by a `netCDF4.Variable` instance) is disabled, ensuring that any masking and unpacking is always done - by the `NetCDFIndexer` instance. + by the `netcdf_indexer` instance. mask: `bool` If True, the default, then an array returned by diff --git a/cfdm/test/test_NetCDFIndexer.py b/cfdm/test/test_netcdf_indexer.py similarity index 86% rename from cfdm/test/test_NetCDFIndexer.py rename to cfdm/test/test_netcdf_indexer.py index ea44ecfbc..afc113cea 100644 --- a/cfdm/test/test_NetCDFIndexer.py +++ b/cfdm/test/test_netcdf_indexer.py @@ -14,7 +14,7 @@ n_tmpfiles = 1 tmpfiles = [ - tempfile.mkstemp("_test_NetCDFIndexer.nc", dir=os.getcwd())[1] + tempfile.mkstemp("_test_netcdf_indexer.nc", dir=os.getcwd())[1] for i in range(n_tmpfiles) ] (tmpfile,) = tmpfiles @@ -34,17 +34,17 @@ def _remove_tmpfiles(): netCDF_backends = ("netCDF4", "h5netcdf") -class NetCDFIndexerTest(unittest.TestCase): +class netcdf_indexerTest(unittest.TestCase): """Test the masking and scaling of netCDF data.""" - def test_NetCDFIndexer_shape(self): - """Test NetCDFIndexer shape.""" + def test_netcdf_indexer_shape(self): + """Test netcdf_indexer shape.""" n = np.ma.arange(9) - x = cfdm.NetCDFIndexer(n) + x = cfdm.netcdf_indexer(n) self.assertEqual(x.shape, n.shape) - def test_NetCDFIndexer_mask(self): - """Test NetCDFIndexer for CF masking.""" + def test_netcdf_indexer_mask(self): + """Test netcdf_indexer for CF masking.""" f0 = cfdm.example_field(0) f0.del_property("missing_value", None) f0.del_property("_FillValue", None) @@ -92,8 +92,8 @@ def test_NetCDFIndexer_mask(self): nc.close() - def test_NetCDFIndexer_scale(self): - """Test NetCDFIndexer for CF scaling.""" + def test_netcdf_indexer_scale(self): + """Test netcdf_indexer for CF scaling.""" f = cfdm.example_field(0) array = np.ma.arange(40, dtype="int32").reshape(f.shape) @@ -124,14 +124,14 @@ def test_NetCDFIndexer_scale(self): nc.close() - def test_NetCDFIndexer_numpy(self): - """Test NetCDFIndexer for numpy.""" + def test_netcdf_indexer_numpy(self): + """Test netcdf_indexer for numpy.""" array = np.ma.arange(9) - x = cfdm.NetCDFIndexer(array) + x = cfdm.netcdf_indexer(array) x = x[...] self.assertTrue((x == array).all()) - x = cfdm.NetCDFIndexer( + x = cfdm.netcdf_indexer( array.copy(), attributes={"_FillValue": 4, "missing_value": (0, 8)} ) x = x[...] 
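The docstring examples above exercise masking and unpacking separately, so a
short sketch of the two conventions acting together on a plain numpy array
may be useful. This is illustrative only: the attribute values are invented,
and it relies on the behaviour documented above, in which masking is applied
first and the remaining values are then unpacked as
``value*scale_factor + add_offset``.

    import numpy as np
    import cfdm

    # Packed integer data in which 255 marks missing values
    packed = np.array([0, 1, 2, 255], dtype="uint8")
    x = cfdm.netcdf_indexer(
        packed,
        attributes={"_FillValue": 255, "scale_factor": 0.5, "add_offset": 10.0},
    )

    # 255 is masked; the remaining values unpack to 10.0, 10.5 and 11.0
    print(x[...])
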
From fdab1c9450dc725cb71ed29b4459261a3f5ea1de Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 21 Mar 2024 00:48:29 +0000 Subject: [PATCH 50/88] orthogonal indexing --- cfdm/data/netcdfindexer.py | 57 ++++++++++++++++++++++++++++++-- cfdm/test/test_netcdf_indexer.py | 15 +++++++++ cfdm/test/test_read_write.py | 2 ++ 3 files changed, 71 insertions(+), 3 deletions(-) diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index f6ea95b0a..8ee792797 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -20,9 +20,11 @@ """ import logging +from numbers import Integral import netCDF4 import numpy as np +from dask.array.slicing import normalize_index _safecast = netCDF4.utils._safecast _default_fillvals = netCDF4.default_fillvals @@ -185,8 +187,53 @@ def __getitem__(self, index): netCDF4_mask = variable.mask variable.set_auto_maskandscale(False) + # ------------------------------------------------------------ # Index the variable - data = variable[index] + # ------------------------------------------------------------ + index = normalize_index(index, variable.shape) + + # Find the positions of any list/1-d array indices + axes_with_list_indices = [ + n + for n, i in enumerate(index) + if isinstance(i, list) or getattr(i, "shape", False) + ] + + # Convert any integer indices to size 1 slices + index0 = [ + slice(i, i + 1) if isinstance(i, Integral) else i for i in index + ] + + data = variable + if len(axes_with_list_indices) <= 1 or getattr( + variable, "__orthogonal_indexing__", False + ): + # There is at most one list/1-d array index, and/or the + # variable natively supports orthogonal indexing. + data = data[tuple(index0)] + else: + # Emulate orthogonal indexing + # + # Apply the slice indices and the first list/1-d array + # index + index1 = [ + i if isinstance(i, slice) else slice(None) for i in index0 + ] + n = axes_with_list_indices[0] + index1[n] = index[n] + data = data[tuple(index1)] + + # Apply the rest of the list/1-d array indices one at a time + ndim = variable.ndim + for n in axes_with_list_indices[1:]: + index2 = [slice(None)] * ndim + index2[n] = index[n] + data = data[tuple(index2)] + + # Apply any integer indices + index3 = [0 if isinstance(i, Integral) else slice(None) for i in index] + if index3: + data = data[tuple(index3)] # Reset a netCDF4 variable's scale and mask behaviour if netCDF4_scale: @@ -221,12 +268,16 @@ def __getitem__(self, index): ) data = data.view(dtype_unsigned_int) + # ------------------------------------------------------------ + # Mask the data + # ------------------------------------------------------------ if self.mask: - # Mask the data data = self._mask(data, dtype, attributes, dtype_unsigned_int) + # ------------------------------------------------------------ + # Unpack the data + # ------------------------------------------------------------ if unpack: - # Unpack the data data = self._unpack(data, attributes) # Make sure all strings are unicode diff --git a/cfdm/test/test_netcdf_indexer.py b/cfdm/test/test_netcdf_indexer.py index afc113cea..1c3866cb7 100644 --- a/cfdm/test/test_netcdf_indexer.py +++ b/cfdm/test/test_netcdf_indexer.py @@ -139,6 +139,21 @@ def test_netcdf_indexer_numpy(self): self.assertTrue((x.mask == array.mask).all()) self.assertTrue((x == array).all()) + def test_netcdf_indexer_orthogonal_indexing(self): + """Test netcdf_indexer for numpy.""" + array = np.ma.arange(120).reshape(2, 3, 4, 5) + x = cfdm.netcdf_indexer(array, mask=False, unpack=False) + + y = x[..., [0, 2], :] + a = 
array[..., [0, 2], :] + self.assertTrue((y == a).all()) + + y = x[1, ..., [0, 2], [0, 2, 3]] + a = array[:, :, [0, 2], :] + a = a[..., [0, 2, 3]] + a = a[1, ...] + self.assertTrue((y == a).all()) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cfdm/test/test_read_write.py b/cfdm/test/test_read_write.py index 076506c15..2415b544e 100644 --- a/cfdm/test/test_read_write.py +++ b/cfdm/test/test_read_write.py @@ -996,6 +996,8 @@ def test_read_write_domain_ancillary(self): def test_read_url(self): """Test reading urls.""" + print("SKIPPING URL TEST") + return for scheme in ("http", "https"): remote = f"{scheme}://psl.noaa.gov/thredds/dodsC/Datasets/cru/crutem5/Monthlies/air.mon.anom.nobs.nc" # Check that cfdm can access it From a715c2374bb59d74e10de63820629d4260eb3684 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 21 Mar 2024 10:02:31 +0000 Subject: [PATCH 51/88] dev --- cfdm/data/netcdfindexer.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 8ee792797..2165b4607 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -33,7 +33,12 @@ class netcdf_indexer: - """A data indexer that applies netCDF masking and unpacking. + """A data indexer that also applies netCDF masking and unpacking. + + Indexing is orthogonal, meaning that the index for each dimension + is applied independently, regardless of how that index was + defined. For instance, the indices ``[[0, 1], [1, 3], 0]`` and + ``[:2, 1::2, 0]`` will give identical results. During indexing, masking and unpacking is applied according to the netCDF conventions, either or both of which may be disabled via @@ -164,7 +169,10 @@ def __getitem__(self, index): v.__getitem__(index) <==> v[index] - Indexing follows the rules defined by the variable. + Indexing is orthogonal, meaning that the index for each + dimension is applied independently, regardless of how that + index was defined. For instance, the indices ``[[0, 1], [1, + 3], 0]`` and ``[:2, 1::2, 0]`` will give identical results. .. 
versionadded:: (cfdm) NEXTVERSION

@@ -230,7 +238,7 @@ def __getitem__(self, index):
             index2[n] = index[n]
             data = data[tuple(index2)]
 
-        # Apply any integer indices
+        # Apply any integer indices orthogonally
         index3 = [0 if isinstance(i, Integral) else slice(None) for i in index]
         if index3:
             data = data[tuple(index3)]
 
From 6b2049a3b0118af9fbb2d3da2ef084b3f9873529 Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Fri, 22 Mar 2024 17:23:33 +0000
Subject: [PATCH 52/88] dev

---
 cfdm/read_write/netcdf/netcdfread.py | 20 +++++++++---------
 cfdm/read_write/read.py              | 31 ++++++++++++++++------------
 cfdm/test/test_groups.py             | 14 ++++++-------
 cfdm/test/test_netcdf_indexer.py     | 16 +++++++-------
 cfdm/test/test_read_write.py         |  4 ++--
 5 files changed, 45 insertions(+), 40 deletions(-)

diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py
index 65c203569..dce24b641 100644
--- a/cfdm/read_write/netcdf/netcdfread.py
+++ b/cfdm/read_write/netcdf/netcdfread.py
@@ -507,7 +507,7 @@ def file_open(self, filename, flatten=True, verbose=None):
 
         netCDF = False
         HDF = False
-        netCDF_backend = g["netCDF_backend"]
+        netcdf_engine = g["netcdf_engine"]
 
         # Deal with an file in an S3 object store
         u = urlparse(filename)
@@ -525,7 +525,7 @@ def file_open(self, filename, flatten=True, verbose=None):
                 file_systems[fs_key] = file_system
 
             # Reset 'filename' to an s3fs.File object that can be
-            # passed to the netCDF backend
+            # passed to the netCDF engine
             filename = file_system.open(u.path[1:], "rb")
             g["s3fs_File_objects"].append(filename)
 
@@ -534,7 +534,7 @@ def file_open(self, filename, flatten=True, verbose=None):
                 f"    S3: s3fs.S3FileSystem options: {storage_options}\n"
             )  # pragma: no cover
 
-        if netCDF_backend is None:
+        if netcdf_engine is None:
             try:
                 # Try opening the file with netCDF4
                 nc = self._open_netCDF4(filename)
@@ -548,14 +548,14 @@ def file_open(self, filename, flatten=True, verbose=None):
             except Exception as error:
                 raise error
 
-        elif netCDF_backend == "netCDF4":
+        elif netcdf_engine == "netCDF4":
             try:
                 nc = self._open_netCDF4(filename)
                 netCDF = True
             except Exception as error:
                 raise error
 
-        elif netCDF_backend == "h5netcdf":
+        elif netcdf_engine == "h5netcdf":
             try:
                 nc = self._open_h5netcdf(filename)
                 HDF = True
@@ -563,7 +563,7 @@ def file_open(self, filename, flatten=True, verbose=None):
                 raise error
 
         else:
-            raise ValueError("Unknown netCDF backend: {netCDF_backend!r}")
+            raise ValueError(f"Unknown netCDF engine: {netcdf_engine!r}")
 
         g["original_h5netcdf"] = HDF
         g["original_netCDF4"] = netCDF
@@ -899,7 +899,7 @@ def read(
         domain=False,
         storage_options=None,
         _file_systems=None,
-        netCDF_backend=None,
+        netcdf_engine=None,
     ):
         """Reads a netCDF dataset from file or OPenDAP URL.
@@ -954,7 +954,7 @@ def read(
 
                 .. versionadded:: (cfdm) NEXTVERSION
 
-            netCDF_backend: `None` or `str`, optional
+            netcdf_engine: `None` or `str`, optional
                 See `cfdm.read` for details
 
                 .. 
versionadded:: (cfdm) NEXTVERSION

@@ -1070,9 +1070,9 @@ def read(
             # --------------------------------------------------------
             # CFA
             # --------------------------------------------------------
             "cfa": False,
             # --------------------------------------------------------
-            # NetCDF backend
+            # NetCDF engine
             # --------------------------------------------------------
-            "netCDF_backend": netCDF_backend,
+            "netcdf_engine": netcdf_engine,
             # --------------------------------------------------------
             # S3
             # --------------------------------------------------------
diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py
index 8353ef8ff..2e45a9da9 100644
--- a/cfdm/read_write/read.py
+++ b/cfdm/read_write/read.py
@@ -18,7 +18,7 @@ def read(
     mask=True,
     unpack=True,
     domain=False,
-    netCDF_backend=None,
+    netcdf_engine=None,
     storage_options=None,
     _implementation=_implementation,
 ):
@@ -276,15 +276,19 @@ def read(
 
         .. versionadded:: (cfdm) 1.9.0.0
 
-    netCDF_backend: `None` or `str`, optional
-        Specify which library to use for opening netCDF files. By
-        default, or if `None`, then `netCDF4` will used unless it
-        fails to open a given file, in which case `h5netcdf` will
-        be used instead. Setting *netCDF_backend* to ``'netCDF4'``
-        or ``'h5netcdf'`` will force the use of the `netCDF4` or
-        `h5netcdf` libraries respectively.
+    netcdf_engine: `None` or `str`, optional
+        Specify which library to use for opening and reading
+        netCDF files. By default, or if `None`, then the first one
+        of `netCDF4` and `h5netcdf` to successfully open the
+        netCDF file is used. Setting *netcdf_engine* to
+        ``'netCDF4'`` or ``'h5netcdf'`` will force the use of
+        that library.
 
-        .. versionadded:: (cfdm) NEXTVERSION
+        .. note:: `h5netcdf` restricts the types of indices that
+                  define subspaces of its data. See
+                  https://docs.h5py.org for details.
+
+        .. versionadded:: (cfdm) NEXTVERSION
 
     storage_options: `dict` or `None`, optional
         Key/value pairs to be passed on to the creation of
@@ -378,9 +382,10 @@ def read(
         filename = netcdf.cdl_to_netcdf(filename)
 
     if netcdf.is_netcdf_file(filename):
-        # See https://github.com/NCAS-CMS/cfdm/issues/128 for context on the
-        # try/except here, which acts as a temporary fix pending decisions on
-        # the best way to handle CDL with only header or coordinate info.
+        # See https://github.com/NCAS-CMS/cfdm/issues/128 for context
+        # on the try/except here, which acts as a temporary fix
+        # pending decisions on the best way to handle CDL with only
+        # header or coordinate info.
try: fields = netcdf.read( filename, @@ -393,7 +398,7 @@ def read( unpack=unpack, domain=domain, storage_options=storage_options, - netCDF_backend=netCDF_backend, + netcdf_engine=netcdf_engine, extra_read_vars=None, ) except MaskError: diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py index e8369a590..fec52a6f8 100644 --- a/cfdm/test/test_groups.py +++ b/cfdm/test/test_groups.py @@ -181,14 +181,14 @@ def test_groups(self): nc.close() h = cfdm.read( - grouped_file, netCDF_backend="netCDF4", verbose="WARNING" + grouped_file, netcdf_engine="netCDF4", verbose="WARNING" ) self.assertEqual(len(h), 1) h = h[0] self.assertTrue(f.equals(h, verbose=2)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, netCDF_backend="h5netcdf") + h5 = cfdm.read(grouped_file, netcdf_engine="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) @@ -321,7 +321,7 @@ def test_groups_geometry(self): self.assertTrue(f.equals(h, verbose=2)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, netCDF_backend="h5netcdf") + h5 = cfdm.read(grouped_file, netcdf_engine="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) @@ -394,7 +394,7 @@ def test_groups_compression(self): self.assertTrue(f.equals(h, verbose=2)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, netCDF_backend="h5netcdf") + h5 = cfdm.read(grouped_file, netcdf_engine="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) @@ -467,7 +467,7 @@ def test_groups_dimension(self): self.assertTrue(f.equals(h, verbose=3)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, netCDF_backend="h5netcdf") + h5 = cfdm.read(grouped_file, netcdf_engine="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) @@ -504,13 +504,13 @@ def test_groups_unlimited_dimension(self): cfdm.write(f, grouped_file5, verbose=1) - h = cfdm.read(grouped_file, netCDF_backend="netCDF4") + h = cfdm.read(grouped_file, netcdf_engine="netCDF4") self.assertEqual(len(h), 1) h = h[0] self.assertTrue(f.equals(h)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, netCDF_backend="h5netcdf") + h5 = cfdm.read(grouped_file, netcdf_engine="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) diff --git a/cfdm/test/test_netcdf_indexer.py b/cfdm/test/test_netcdf_indexer.py index 1c3866cb7..974a835ef 100644 --- a/cfdm/test/test_netcdf_indexer.py +++ b/cfdm/test/test_netcdf_indexer.py @@ -31,7 +31,7 @@ def _remove_tmpfiles(): atexit.register(_remove_tmpfiles) -netCDF_backends = ("netCDF4", "h5netcdf") +netcdf_engines = ("netCDF4", "h5netcdf") class netcdf_indexerTest(unittest.TestCase): @@ -44,7 +44,7 @@ def test_netcdf_indexer_shape(self): self.assertEqual(x.shape, n.shape) def test_netcdf_indexer_mask(self): - """Test netcdf_indexer for CF masking.""" + """Test netcdf_indexer for masking.""" f0 = cfdm.example_field(0) f0.del_property("missing_value", None) f0.del_property("_FillValue", None) @@ -81,8 +81,8 @@ def test_netcdf_indexer_mask(self): nc = netCDF4.Dataset(tmpfile, "r") nc.set_auto_maskandscale(True) nc.set_always_mask(True) - for backend in netCDF_backends: - f = cfdm.read(tmpfile, netCDF_backend=backend) + for engine in netcdf_engines: + f = cfdm.read(tmpfile, netcdf_engine=engine) for g in f: ncvar = g.nc_get_variable() n = nc.variables[ncvar] @@ -92,8 +92,8 @@ def test_netcdf_indexer_mask(self): nc.close() - def 
test_netcdf_indexer_scale(self): - """Test netcdf_indexer for CF scaling.""" + def test_netcdf_indexer_unpack(self): + """Test netcdf_indexer for unpacking.""" f = cfdm.example_field(0) array = np.ma.arange(40, dtype="int32").reshape(f.shape) @@ -113,8 +113,8 @@ def test_netcdf_indexer_scale(self): nc = netCDF4.Dataset(tmpfile, "r") nc.set_auto_maskandscale(True) nc.set_always_mask(True) - for backend in netCDF_backends: - f = cfdm.read(tmpfile, netCDF_backend=backend) + for engine in netcdf_engines: + f = cfdm.read(tmpfile, netcdf_engine=engine) for g in f: ncvar = g.nc_get_variable() n = nc.variables[ncvar] diff --git a/cfdm/test/test_read_write.py b/cfdm/test/test_read_write.py index 2415b544e..71f218509 100644 --- a/cfdm/test/test_read_write.py +++ b/cfdm/test/test_read_write.py @@ -671,8 +671,8 @@ def test_read_CDL(self): def test_read_write_string(self): """Test the `string` keyword argument to `read` and `write`.""" - fn = cfdm.read(self.string_filename, netCDF_backend="netCDF4") - fh = cfdm.read(self.string_filename, netCDF_backend="h5netcdf") + fn = cfdm.read(self.string_filename, netcdf_engine="netCDF4") + fh = cfdm.read(self.string_filename, netcdf_engine="h5netcdf") n = int(len(fn) / 2) From 3d90befda9abe2c59600f9a8c5eb8534ac17be38 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 26 Mar 2024 18:46:04 +0000 Subject: [PATCH 53/88] dev --- cfdm/data/h5netcdfarray.py | 2 +- cfdm/data/netcdf4array.py | 2 +- cfdm/data/netcdfindexer.py | 54 ++++++++++++++++++-------------- cfdm/test/test_groups.py | 4 +-- cfdm/test/test_netcdf_indexer.py | 8 +++++ 5 files changed, 42 insertions(+), 28 deletions(-) diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index 3f1831ea7..b7bd9bd03 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -204,7 +204,7 @@ def __getitem__(self, indices): variable, mask=self.get_mask(), unpack=self.get_unpack(), - always_mask=False, + always_masked_array=False, ) array = array[indices] diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index a1e403b67..8bba10eea 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -233,7 +233,7 @@ def __getitem__(self, indices): variable, mask=self.get_mask(), unpack=self.get_unpack(), - always_mask=False, + always_masked_array=False, ) array = array[indices] diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 2165b4607..8188e8982 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -53,15 +53,17 @@ class netcdf_indexer: input *data* object, or given with the input *attributes* parameter. - The relevant netCDF attributes that are considered: + The relevant netCDF attributes that are considered are: * For masking: ``missing_value``, ``valid_max``, ``valid_min``, - ``valid_range``, ``_FillValue``, + ``valid_range``, ``_FillValue``, and ``_Unsigned``. Note that if ``_FillValue`` is not present then the netCDF default value for the - appropriate data type will be assumed. + appropriate data type will be assumed, as defined + by `netCDF4.default_fillvals`. - * For unpacking: ``add_offset``, ``scale_factor``, ``_Unsigned`` + * For unpacking: ``add_offset``, ``scale_factor``, and + ``_Unsigned`` .. 
versionadded:: (cfdm) NEXTVERSION

@@ -110,7 +115,7 @@ def __init__(
         variable,
         mask=True,
         unpack=True,
-        always_mask=False,
+        always_masked_array=False,
         attributes=None,
     ):
         """**Initialisation**
@@ -122,10 +127,10 @@
            has the same API as one of `numpy.ndarray`,
            `netCDF4.Variable` or `h5py.Variable` (which includes
            `h5netcdf.Variable`). Any masking and unpacking that
-           could be applied by applied by *variable* itself
-           (e.g. by a `netCDF4.Variable` instance) is disabled,
-           ensuring that any masking and unpacking is always done
-           by the `netcdf_indexer` instance.
+           could be applied by *variable* itself (e.g. by a
+           `netCDF4.Variable` instance) is disabled, ensuring
+           that any masking and unpacking is always done by the
+           `netcdf_indexer` instance.

         mask: `bool`
            If True, the default, then an array returned by
@@ -142,7 +147,7 @@
            attributes: ``add_offset``, ``scale_factor``, and
            ``_Unsigned``.

-        always_mask: `bool`
+        always_masked_array: `bool`
            If False, the default, then an array returned by
            indexing which has no missing values is created as a
            regular numpy array. If True then an array returned by
@@ -159,9 +164,9 @@
        """
        self.variable = variable
-       self.mask = mask
-       self.unpack = unpack
-       self.always_mask = always_mask
+       self.mask = bool(mask)
+       self.unpack = bool(unpack)
+       self.always_masked_array = bool(always_masked_array)
        self._attributes = attributes

    def __getitem__(self, index):
@@ -370,8 +375,8 @@
        """
        if dtype.kind in "OS":
            return _default_fillvals["S1"]
-       else:
-           return _default_fillvals[dtype.str[1:]]
+
+       return _default_fillvals[dtype.str[1:]]

    def _mask(self, data, dtype, attributes, dtype_unsigned_int):
        """Mask the data.
@@ -452,7 +457,6 @@
        # --------------------------------------------------------
        # Create mask from _FillValue
        # --------------------------------------------------------
-
        fval = np.array(_FillValue, dtype)
        if dtype_unsigned_int is not None:
            fval = fval.view(dtype_unsigned_int)
@@ -551,15 +555,19 @@
        # ------------------------------------------------------------
        if totalmask is not None and totalmask.any():
            data = np.ma.masked_array(
-               data, mask=totalmask, fill_value=fill_value
+               data, mask=totalmask, fill_value=fill_value, copy=False
            )
            if not data.ndim:
                # Return a scalar numpy masked constant not a 0-d
                # masked array, so that data == np.ma.masked.
data = data[()] - elif self.always_mask and not np.ma.isMA(data): - # Return a masked array when there are no masked elements - data = np.ma.masked_array(data) + elif np.ma.isMA(data): + if not (self.always_masked_array or np.ma.is_masked(data)): + # Return a non-masked array + data = np.array(data, copy=False) + elif self.always_masked_array: + # Return a masked array + data = np.ma.masked_array(data, copy=False) return data @@ -664,17 +672,17 @@ def attributes(self): variable = self.variable try: - # h5py + # h5py API attrs = dict(variable.attrs) except AttributeError: try: - # netCDF4 + # netCDF4 API attrs = { attr: variable.getncattr(attr) for attr in variable.ncattrs() } except AttributeError: - # numpy + # numpy API attrs = {} self._attributes = attrs diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py index fec52a6f8..cbe51ab85 100644 --- a/cfdm/test/test_groups.py +++ b/cfdm/test/test_groups.py @@ -180,9 +180,7 @@ def test_groups(self): ) nc.close() - h = cfdm.read( - grouped_file, netcdf_engine="netCDF4", verbose="WARNING" - ) + h = cfdm.read(grouped_file, netcdf_engine="netCDF4", verbose="WARNING") self.assertEqual(len(h), 1) h = h[0] self.assertTrue(f.equals(h, verbose=2)) diff --git a/cfdm/test/test_netcdf_indexer.py b/cfdm/test/test_netcdf_indexer.py index 974a835ef..95fb4a32d 100644 --- a/cfdm/test/test_netcdf_indexer.py +++ b/cfdm/test/test_netcdf_indexer.py @@ -154,6 +154,14 @@ def test_netcdf_indexer_orthogonal_indexing(self): a = a[1, ...] self.assertTrue((y == a).all()) + def test_netcdf_always_masked_array(self): + """Test netcdf_indexer for numpy masked output.""" + array = np.ma.arange(9) + x = cfdm.netcdf_indexer(array) + self.assertFalse(np.ma.isMA(x[...])) + x = cfdm.netcdf_indexer(array, always_masked_array=True) + self.assertTrue(np.ma.isMA(x[...])) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) From 01687fc391a0da1e1fe46895c26adc318ece6da8 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 28 Mar 2024 10:50:41 +0000 Subject: [PATCH 54/88] dev --- cfdm/data/netcdfindexer.py | 250 +++++++++++++++++---- cfdm/read_write/netcdf/flatten/__init__.py | 2 +- cfdm/test/test_netcdf_indexer.py | 3 + 3 files changed, 207 insertions(+), 48 deletions(-) diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 8188e8982..36c568b32 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -1,7 +1,7 @@ """A data indexer that applies netCDF masking and unpacking. Portions of this code were adapted from the `netCDF4` library, which -carries the MIT License: +carries the following MIT License: Copyright 2008 Jeffrey Whitaker @@ -20,14 +20,13 @@ """ import logging +from math import prod from numbers import Integral -import netCDF4 import numpy as np from dask.array.slicing import normalize_index - -_safecast = netCDF4.utils._safecast -_default_fillvals = netCDF4.default_fillvals +from netCDF4 import chartostring, default_fillvals +from netCDF4.utils import _safecast logger = logging.getLogger(__name__) @@ -150,17 +149,19 @@ def __init__( always_masked_array: `bool` If False, the default, then an array returned by indexing which has no missing values is created as a - regular numpy array. If True then an array returned by - indexing is always a masked array, even if there are - no missing values. + regular `numpy` array. If True then an array returned + by indexing is always a masked `numpy` array, even if + there are no missing values. 
attributes: `dict`, optional - Provide the netCDF attributes for *variable* as - dictionary key/value pairs. If *attributes* is not - `None, then any netCDF attributes stored by *variable* - itself are ignored. Only the attributes relevant to - masking and unpacking are considered, with all other - attributes being ignored. + Provide the netCDF attributes for the *variable* as + dictionary key/value pairs. Only the attributes + relevant to masking and unpacking are considered, with + all other attributes being ignored. If *attributes* is + `None`, the default, then the netCDF attributes stored + by *variable* itself (if any) are used. If + *attributes* is not `None`, then any netCDF attributes + stored by *variable* itself are ignored. """ self.variable = variable @@ -176,8 +177,10 @@ def __getitem__(self, index): Indexing is orthogonal, meaning that the index for each dimension is applied independently, regardless of how that - index was defined. For instance, the indices ``[[0, 1], [1, - 3], 0]`` and ``[:2, 1::2, 0]`` will give identical results. + index was defined. For instance, the indices ``[[0, 1], [3, + 6], 0]`` and ``[:2, 3:7:3, 0]`` will give identical + results. Note that this behaviour is different to that of + `numpy`. .. versionadded:: (cfdm) NEXTVERSION @@ -201,18 +204,20 @@ def __getitem__(self, index): variable.set_auto_maskandscale(False) # ------------------------------------------------------------ - # Index the variable + # Index the variable with orthogonal indexing # ------------------------------------------------------------ index = normalize_index(index, variable.shape) - # Find the positions of any list/1-d array indices + # Find the positions of any list/1-d array indices (which by + # now will contain only integers) axes_with_list_indices = [ n for n, i in enumerate(index) if isinstance(i, list) or getattr(i, "shape", False) ] - # Convert any integer indices to size 1 slices + # Convert any integer indices to size 1 slices, so that their + # axes are not dropped yet (they will be dealt with later). index0 = [ slice(i, i + 1) if isinstance(i, Integral) else i for i in index ] @@ -225,25 +230,52 @@ def __getitem__(self, index): # variable natively supports orthogonal indexing. data = data[tuple(index0)] else: - # Emulate orthogonal indexing - # - # Apply the slice indices and the first list/1-d array - # index + # Emulate orthogonal indexing with a sequence of + # subspaces, one for each list/1-d array index. + + # 1) Apply the slice indices at the time as the list/1-d + # array index that gives the smallest result. + + # Create an index that replaces each list/1-d arrays with + # slice(None) index1 = [ i if isinstance(i, slice) else slice(None) for i in index0 ] - n = axes_with_list_indices[0] + + # Find the position of the list/1-d array index that gives + # the smallest result + shape1 = self.index_shape(index1, data.shape) + size1 = prod(shape1) + sizes = [ + len(index[i]) * size1 // shape1[i] + for i in axes_with_list_indices + ] + n = axes_with_list_indices.pop(np.argmin(sizes)) + + # Apply the subspace of slices and the chosen list/1-d + # array index index1[n] = index[n] data = data[tuple(index1)] - # Apply the rest of the list/1-d array indices one at a time + # 2) Apply the rest of the list/1-d array indices, in the + # order that gives the smallest result after each step. 
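A hedged illustration of the orthogonal indexing documented in the `__getitem__` docstring above (the array values are invented):

    >>> import numpy as np
    >>> import cfdm
    >>> a = np.arange(20).reshape(4, 5)
    >>> x = cfdm.netcdf_indexer(a)
    >>> x[[0, 1], [1, 3]].shape  # orthogonal: rows crossed with columns
    (2, 2)
    >>> a[[0, 1], [1, 3]].shape  # numpy: element-wise pairing of indices
    (2,)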
ndim = variable.ndim - for n in axes_with_list_indices[1:]: + while axes_with_list_indices: + shape1 = data.shape + size1 = data.size + sizes = [ + len(index[i]) * size1 // shape1[i] + for i in axes_with_list_indices + ] + n = axes_with_list_indices.pop(np.argmin(sizes)) + + # Apply the subspace of for the chosen list/1-d array + # index index2 = [slice(None)] * ndim index2[n] = index[n] data = data[tuple(index2)] - # Apply any integer indices orthogonally + # Apply any integer indices that will drop axes index3 = [0 if isinstance(i, Integral) else slice(None) for i in index] if index3: data = data[tuple(index3)] @@ -261,7 +293,7 @@ def __getitem__(self, index): elif data.dtype.kind in "OSU": kind = data.dtype.kind if kind == "S": - data = netCDF4.chartostring(data) + data = chartostring(data) # Assume that object arrays are arrays of strings data = data.astype("S", copy=False) @@ -299,27 +331,26 @@ def __getitem__(self, index): return data - @property - def shape(self): - """Tuple of the data dimension sizes. + def __orthogonal_indexing__(self): + """Flag to indicate that orthogonal indexing is supported. .. versionadded:: (cfdm) NEXTVERSION """ - return self.variable.shape + return True - def _check_safecast(self, attname, dtype, attributes): + def _check_safecast(self, attr, dtype, attributes): """Check an attribute's data type. Checks to see that variable attribute exists and can be safely - cast to variable data type. + cast to variable's data type. .. versionadded:: (cfdm) NEXTVERSION :Parameter: - attname: `str` - The attribute name. + attr: `str` + The name of the attribute. dtype: `numpy.dtype` The variable data type. @@ -334,27 +365,26 @@ def _check_safecast(self, attname, dtype, attributes): with the variable data type, and the attribute value. """ - if attname in attributes: - attvalue = attributes[attname] + if attr in attributes: + attvalue = attributes[attr] att = np.array(attvalue) else: return False, None - is_safe = True try: atta = np.array(att, dtype) except ValueError: - is_safe = False + safe = False else: - is_safe = _safecast(att, atta) + safe = _safecast(att, atta) - if not is_safe: + if not safe: logger.info( - f"Mask attribute {attname!r} not used since it can't " + f"Mask attribute {attr!r} not used since it can't " f"be safely cast to variable data type {dtype!r}" ) # pragma: no cover - return is_safe, attvalue + return safe, attvalue def _default_FillValue(self, dtype): """Return the default ``_FillValue`` for the given data type. @@ -366,7 +396,7 @@ def _default_FillValue(self, dtype): :Parameter: dtype: `numpy.dtype` - The variable's data type + The data type. :Returns: @@ -374,9 +404,9 @@ def _default_FillValue(self, dtype): """ if dtype.kind in "OS": - return _default_fillvals["S1"] + return default_fillvals["S1"] - return _default_fillvals[dtype.str[1:]] + return default_fillvals[dtype.str[1:]] def _mask(self, data, dtype, attributes, dtype_unsigned_int): """Mask the data. @@ -649,6 +679,42 @@ def _unpack(self, data, attributes): return data + @property + def dtype(self): + """The data type of the array elements. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + return self.variable.dtype + + @property + def ndim(self): + """Number of dimensions in the data array. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + return self.variable.ndim + + @property + def shape(self): + """Tuple of the data dimension sizes. + + .. 
versionadded:: (cfdm) NEXTVERSION
+
+        """
+        return self.variable.shape
+
+    @property
+    def size(self):
+        """Number of elements in the data array.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        """
+        return self.variable.size
+
     def attributes(self):
         """Return the netCDF attributes for the data.

         self._attributes = attrs

         return attrs
+
+    @classmethod
+    def index_shape(cls, index, shape):
+        """Return the shape of the array subspace implied by indices.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            index: `tuple`
+                The indices to be applied to an array with shape
+                *shape*.
+
+            shape: sequence of `ints`
+                The shape of the array to be subspaced.
+
+        :Returns:
+
+            `list`
+                The shape of the subspace defined by the *index*.
+
+        **Examples**
+
+        >>> import numpy as np
+        >>> n.index_shape((slice(2, 5), 4), (10, 20))
+        [3, 1]
+        >>> n.index_shape(([2, 3, 4], np.arange(1, 6)), (10, 20))
+        [3, 5]
+
+        >>> n.index_shape((slice(None), [True, False, True]), (10, 3))
+        [10, 2]
+
+        >>> index0 = np.arange(5)
+        >>> index0 = index0[index0 < 3]
+        >>> n.index_shape((index0, []), (10, 20))
+        [3, 0]
+
+        >>> n.index_shape((slice(1, 5, 3), 3), (10, 20))
+        [2, 1]
+        >>> n.index_shape((slice(5, 1, -2), 3), (10, 20))
+        [2, 1]
+        >>> n.index_shape((slice(5, 1, 3), 3), (10, 20))
+        [0, 1]
+        >>> n.index_shape((slice(1, 5, -3), 3), (10, 20))
+        [0, 1]
+
+        """
+        implied_shape = []
+        for ind, full_size in zip(index, shape):
+            if isinstance(ind, slice):
+                start, stop, step = ind.indices(full_size)
+                if (stop - start) * step < 0:
+                    # E.g. 5:1:3 or 1:5:-3
+                    size = 0
+                else:
+                    size = abs((stop - start) / step)
+                    int_size = round(size)
+                    if size > int_size:
+                        size = int_size + 1
+                    else:
+                        size = int_size
+            elif isinstance(ind, np.ndarray):
+                if ind.dtype == bool:
+                    # Size is the number of True values in the array
+                    size = int(ind.sum())
+                else:
+                    size = ind.size
+
+                if not ind.ndim:
+                    # Scalar array
+                    continue
+            elif isinstance(ind, list):
+                if not ind:
+                    size = 0
+                else:
+                    i = ind[0]
+                    if isinstance(i, bool):
+                        # List of bool: Size is the number of True
+                        # values in the list
+                        size = sum(ind)
+                    else:
+                        # List of int
+                        size = len(ind)
+            else:
+                # Index is Integral
+                continue
+
+            implied_shape.append(size)
+
+        return implied_shape
diff --git a/cfdm/read_write/netcdf/flatten/__init__.py b/cfdm/read_write/netcdf/flatten/__init__.py
index 5f73659eb..4a9887947 100644
--- a/cfdm/read_write/netcdf/flatten/__init__.py
+++ b/cfdm/read_write/netcdf/flatten/__init__.py
@@ -1,7 +1,7 @@
 """Flatten NetCDF groups.
Portions of this package were adapted from the `netcdf_flattener` -library, which carries the Apache 2.0 License: +library, which carries the following Apache 2.0 License: Copyright (c) 2020 EUMETSAT diff --git a/cfdm/test/test_netcdf_indexer.py b/cfdm/test/test_netcdf_indexer.py index 95fb4a32d..5df635efc 100644 --- a/cfdm/test/test_netcdf_indexer.py +++ b/cfdm/test/test_netcdf_indexer.py @@ -42,6 +42,9 @@ def test_netcdf_indexer_shape(self): n = np.ma.arange(9) x = cfdm.netcdf_indexer(n) self.assertEqual(x.shape, n.shape) + self.assertEqual(x.size, n.size) + self.assertEqual(x.ndim, n.ndim) + self.assertEqual(x.dtype, n.dtype) def test_netcdf_indexer_mask(self): """Test netcdf_indexer for masking.""" From c68b7a061084b493ad95e415d1ff7fca3415fa95 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 5 Apr 2024 11:43:15 +0100 Subject: [PATCH 55/88] dev --- cfdm/data/h5netcdfarray.py | 42 +++--- cfdm/data/mixin/arraymixin.py | 97 +++++++++++--- cfdm/data/mixin/filearraymixin.py | 56 ++++---- cfdm/data/mixin/netcdffilemixin.py | 78 +++++------ cfdm/data/netcdf4array.py | 40 +++--- cfdm/data/netcdfindexer.py | 192 ++++++++++++++++----------- cfdm/docstring/docstring.py | 4 + cfdm/read_write/netcdf/netcdfread.py | 51 +++++-- cfdm/read_write/read.py | 45 ++++--- 9 files changed, 376 insertions(+), 229 deletions(-) diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index b7bd9bd03..5bceaa884 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -29,8 +29,8 @@ def __init__( shape=None, mask=True, unpack=True, - units=False, - calendar=False, + # units=False, + # calendar=False, attributes=None, storage_options=None, source=None, @@ -89,6 +89,14 @@ def __init__( applicable. If unset then the calendar will be set during the first `__getitem__` call. + {{attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set from *variable* during the + first `__getitem__` call. + + .. versionadded:: (cfdm) NEXTRELEASE + {{init storage_options: `dict` or `None`, optional}} {{init source: optional}} @@ -129,15 +137,15 @@ def __init__( except AttributeError: unpack = True - try: - units = source._get_component("units", False) - except AttributeError: - units = False - - try: - calendar = source._get_component("calendar", False) - except AttributeError: - calendar = False + # try: + # units = source._get_component("units", False) + # except AttributeError: + # units = False + # + # try: + # calendar = source._get_component("calendar", False) + # except AttributeError: + # calendar = False try: attributes = source._get_component("attributes", None) @@ -173,8 +181,8 @@ def __init__( self._set_component("dtype", dtype, copy=False) self._set_component("mask", bool(mask), copy=False) self._set_component("unpack", bool(unpack), copy=False) - self._set_component("units", units, copy=False) - self._set_component("calendar", calendar, copy=False) + # self._set_component("units", units, copy=False) + # self._set_component("calendar", calendar, copy=False) self._set_component("storage_options", storage_options, copy=False) self._set_component("attributes", attributes, copy=False) @@ -211,9 +219,9 @@ def __getitem__(self, indices): # Set the attributes, if they haven't been set already. self._set_attributes(variable) - # Set the units, if they haven't been set already (do this - # after setting the attributes). 
- self._set_units(variable) + # # Set the units, if they haven't been set already (do this + # # after setting the attributes). + # self._set_units(variable) self.close(dataset0) del dataset, dataset0 @@ -224,7 +232,7 @@ def _set_attributes(self, var): """Set the netCDF variable attributes. These are set from the netCDF variable attributes, but only if - they have not already been defined, either during {{class}} + they have not already been defined, either during `{{class}}` instantiation or by a previous call to `_set_attributes`. .. versionadded:: (cfdm) NEXTVERSION diff --git a/cfdm/data/mixin/arraymixin.py b/cfdm/data/mixin/arraymixin.py index 901087744..7276b635c 100644 --- a/cfdm/data/mixin/arraymixin.py +++ b/cfdm/data/mixin/arraymixin.py @@ -1,3 +1,5 @@ +from copy import deepcopy + import numpy as np @@ -76,30 +78,60 @@ def __docstring_package_depth__(self): """ return 0 - def _set_units(self): - """The units and calendar properties. + # def _set_units(self): + # """The units and calendar properties. + # + # These are the values set during initialisation, defaulting to + # `None` if either was not set at that time. + # + # .. versionadded:: (cfdm) 1.10.1.0 + # + # :Returns: + # + # `tuple` + # The units and calendar values, either of which may be + # `None`. + # + # """ + # units = self.get_units(False) + # if units is False: + # self._set_component("units", None, copy=False) + # + # calendar = self.get_calendar(False) + # if calendar is False: + # self._set_component("calendar", None, copy=False) + # + # return units, calendar + + def get_attributes(self, default=ValueError()): + """The attributes of the array. + + .. versionadded:: (cfdm) NEXTVERSION - These are the values set during initialisation, defaulting to - `None` if either was not set at that time. + :Parameters: - .. versionadded:: (cfdm) 1.10.1.0 + default: optional + Return the value of the *default* parameter if the + attributes have not been set. If set to an `Exception` + instance then it will be raised instead. :Returns: - `tuple` - The units and calendar values, either of which may be - `None`. + `dict` + The attributes. """ - units = self.get_units(False) - if units is False: - self._set_component("units", None, copy=False) + attributes = self._get_component("attributes", None) + if attributes is None: + if default is None: + return - calendar = self.get_calendar(False) - if calendar is False: - self._set_component("calendar", None, copy=False) + return self._default( + default, + f"{self.__class__.__name__} attributes have not yet been set", + ) - return units, calendar + return deepcopy(attributes) def get_calendar(self, default=ValueError()): """The calendar of the array. @@ -122,8 +154,9 @@ def get_calendar(self, default=ValueError()): The calendar value. """ - calendar = self._get_component("calendar", False) - if calendar is False: + attributes = self.get_attributes({}) + calendar = attributes.get("calendar") + if calendar is None: if default is None: return @@ -134,6 +167,18 @@ def get_calendar(self, default=ValueError()): return calendar + # calendar = self._get_component("calendar", False) + # if calendar is False: + # if default is None: + # return + # + # return self._default( + # default, + # f"{self.__class__.__name__} 'calendar' has not been set", + # ) + # + # return calendar + def get_compression_type(self): """Returns the array's compression type. @@ -290,8 +335,9 @@ def get_units(self, default=ValueError()): The units value. 
""" - units = self._get_component("units", False) - if units is False: + attributes = self.get_attributes({}) + units = attributes.get("units") + if units is None: if default is None: return @@ -301,3 +347,16 @@ def get_units(self, default=ValueError()): ) return units + + +# units = self._get_component("units", False) +# if units is False: +# if default is None: +# return +# +# return self._default( +# default, +# f"{self.__class__.__name__} 'units' have not been set", +# ) +# +# return units diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index a237a4d23..939fab4db 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -111,34 +111,34 @@ def get_addresses(self): """ return self._get_component("address", ()) - def get_attributes(self, default=ValueError()): - """The attributes of the array. - - .. versionadded:: (cfdm) NEXTVERSION - - :Parameters: - - default: optional - Return the value of the *default* parameter if the - attributes have not been set. If set to an `Exception` - instance then it will be raised instead. - - :Returns: - - The attributes. - - """ - attributes = self._get_component("attributes", None) - if attributes is None: - if default is None: - return - - return self._default( - default, - f"{self.__class__.__name__} attributes have not yet been set", - ) - - return deepcopy(attributes) + # def get_attributes(self, default=ValueError()): + # """The attributes of the array. + # + # .. versionadded:: (cfdm) NEXTVERSION + # + # :Parameters: + # + # default: optional + # Return the value of the *default* parameter if the + # attributes have not been set. If set to an `Exception` + # instance then it will be raised instead. + # + # :Returns: + # + # The attributes. + # + # """ + # attributes = self._get_component("attributes", None) + # if attributes is None: + # if default is None: + # return + # + # return self._default( + # default, + # f"{self.__class__.__name__} attributes have not yet been set", + # ) + # + # return deepcopy(attributes) def get_filename(self, default=AttributeError()): """The name of the file containing the array. diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py index 7f4c6c25d..0cd75886a 100644 --- a/cfdm/data/mixin/netcdffilemixin.py +++ b/cfdm/data/mixin/netcdffilemixin.py @@ -66,45 +66,45 @@ def _set_attributes(self, var): f"Must implement {self.__class__.__name__}._set_attributes" ) # pragma: no cover - def _set_units(self, var): - """The units and calendar properties. - - These are set from the netCDF variable attributes, but only if - they have already not been defined, either during {{class}} - instantiation or by a previous call to `_set_units`. - - .. versionadded:: (cfdm) 1.10.0.1 - - :Parameters: - - var: `netCDF4.Variable` or `h5netcdf.Variable` - The variable containing the units and calendar - definitions. - - :Returns: - - `tuple` - The units and calendar values, either of which may be - `None`. - - """ - # We assume that an attributes dictionary exists - attributes = self._get_component("attributes") - - # Note: Can't use None as the default since it is a valid - # `units` or 'calendar' value that indicates that the - # attribute has not been set in the dataset. 
- units = self._get_component("units", False) - if units is False: - self._set_component("units", attributes.get("units"), copy=False) - - calendar = self._get_component("calendar", False) - if calendar is False: - self._set_component( - "calendar", attributes.get("calendar"), copy=False - ) - - return units, calendar + # def _set_units(self, var): + # """The units and calendar properties. + # + # These are set from the netCDF variable attributes, but only if + # they have already not been defined, either during {{class}} + # instantiation or by a previous call to `_set_units`. + # + # .. versionadded:: (cfdm) 1.10.0.1 + # + # :Parameters: + # + # var: `netCDF4.Variable` or `h5netcdf.Variable` + # The variable containing the units and calendar + # definitions. + # + # :Returns: + # + # `tuple` + # The units and calendar values, either of which may be + # `None`. + # + # """ + # # We assume that an attributes dictionary exists + # attributes = self._get_component("attributes") + # + # # Note: Can't use None as the default since it is a valid + # # `units` or 'calendar' value that indicates that the + # # attribute has not been set in the dataset. + # units = self._get_component("units", False) + # if units is False: + # self._set_component("units", attributes.get("units"), copy=False) + # + # calendar = self._get_component("calendar", False) + # if calendar is False: + # self._set_component( + # "calendar", attributes.get("calendar"), copy=False + # ) + # + # return units, calendar @property def array(self): diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index 8bba10eea..cf1dc3a62 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -20,8 +20,8 @@ def __init__( shape=None, mask=True, unpack=True, - units=False, - calendar=False, + # units=False, + # calendar=False, attributes=None, storage_options=None, source=None, @@ -78,6 +78,14 @@ def __init__( .. versionadded:: (cfdm) 1.10.0.1 + {{attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set from *variable* during the + first `__getitem__` call. + + .. versionadded:: (cfdm) NEXTRELEASE + {{init storage_options: `dict` or `None`, optional}} .. versionadded:: (cfdm) NEXTVERSION @@ -138,15 +146,15 @@ def __init__( except AttributeError: unpack = True - try: - units = source._get_component("units", False) - except AttributeError: - units = False - - try: - calendar = source._get_component("calendar", False) - except AttributeError: - calendar = False + # try: + # units = source._get_component("units", False) + # except AttributeError: + # units = False + # + # try: + # calendar = source._get_component("calendar", False) + # except AttributeError: + # calendar = False try: attributes = source._get_component("attributes", None) @@ -182,8 +190,8 @@ def __init__( self._set_component("dtype", dtype, copy=False) self._set_component("mask", bool(mask), copy=False) self._set_component("unpack", bool(unpack), copy=False) - self._set_component("units", units, copy=False) - self._set_component("calendar", calendar, copy=False) + # self._set_component("units", units, copy=False) + # self._set_component("calendar", calendar, copy=False) self._set_component("storage_options", storage_options, copy=False) self._set_component("attributes", attributes, copy=False) @@ -240,8 +248,8 @@ def __getitem__(self, indices): # Set the units, if they haven't been set already. self._set_attributes(variable) - # Set the units, if they haven't been set already. 
- self._set_units(variable) + # # Set the units, if they haven't been set already. + # self._set_units(variable) self.close(dataset) del netcdf, dataset @@ -272,7 +280,7 @@ def _set_attributes(self, var): """Set the netCDF variable attributes. These are set from the netCDF variable attributes, but only if - they have not already been defined, either during {{class}} + they have not already been defined, either during `{{class}}` instantiation or by a previous call to `_set_attributes`. .. versionadded:: (cfdm) NEXTVERSION diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 36c568b32..e14c56b90 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -37,7 +37,8 @@ class netcdf_indexer: Indexing is orthogonal, meaning that the index for each dimension is applied independently, regardless of how that index was defined. For instance, the indices ``[[0, 1], [1, 3], 0]`` and - ``[:2, 1::2, 0]`` will give identical results. + ``[:2, 1:4:2, 0]`` will give identical results. Note that this + behaviour is different to that of `numpy`. During indexing, masking and unpacking is applied according to the netCDF conventions, either or both of which may be disabled via @@ -154,8 +155,8 @@ def __init__( there are no missing values. attributes: `dict`, optional - Provide the netCDF attributes for the *variable* as - dictionary key/value pairs. Only the attributes + Provide netCDF attributes for the *variable* as a + dictionary key/value pairs. Only the attributes relevant to masking and unpacking are considered, with all other attributes being ignored. If *attributes* is `None`, the default, then the netCDF attributes stored @@ -177,8 +178,8 @@ def __getitem__(self, index): Indexing is orthogonal, meaning that the index for each dimension is applied independently, regardless of how that - index was defined. For instance, the indices ``[[0, 1], [3, - 6], 0]`` and ``[:2, 3:7:3, 0]`` will give identical + index was defined. For instance, the indices ``[[0, 1], [1, + 3], 0]`` and ``[:2, 1:4:2, 0]`` will give identical results. Note that this behaviour is different to that of `numpy`. @@ -191,7 +192,7 @@ def __getitem__(self, index): dtype = variable.dtype # Prevent a netCDF4 variable from doing its own masking and - # unpacking + # unpacking during the indexing netCDF4_scale = False netCDF4_mask = False try: @@ -204,81 +205,9 @@ def __getitem__(self, index): variable.set_auto_maskandscale(False) # ------------------------------------------------------------ - # Index the variable with orthogonal indexing + # Index the variable # ------------------------------------------------------------ - index = normalize_index(index, variable.shape) - - # Find the positions of any list/1-d array indices (which by - # now will contain only integers) - axes_with_list_indices = [ - n - for n, i in enumerate(index) - if isinstance(i, list) or getattr(i, "shape", False) - ] - - # Convert any integer indices to size 1 slices, so that their - # axes are not dropped yet (they will be dealt with later). - index0 = [ - slice(i, i + 1) if isinstance(i, Integral) else i for i in index - ] - - data = variable - if len(axes_with_list_indices) <= 1 or getattr( - variable, "__orthogonal_indexing__", False - ): - # There is at most one list/1-d array index, and/or the - # variable natively supports orthogonal indexing. - data = data[tuple(index0)] - else: - # Emulate orthogonal indexing with a sequence of - # subspaces, one for each list/1-d array index. 
- - # 1) Apply the slice indices at the time as the list/1-d - # array index that gives the smallest result. - - # Create an index that replaces each list/1-d arrays with - # slice(None) - index1 = [ - i if isinstance(i, slice) else slice(None) for i in index0 - ] - - # Find the position of the list/1-d array index that gives - # the smallest result - shape1 = self.index_shape(index1, data.shape) - size1 = prod(shape1) - sizes = [ - len(index[i]) * size1 // shape1[i] - for i in axes_with_list_indices - ] - n = axes_with_list_indices.pop(np.argmin(sizes)) - - # Apply the subspace of slices and the chosen list/1-d - # array index - index1[n] = index[n] - data = data[tuple(index1)] - - # 2) Apply the rest of the list/1-d array indices, in the - # order that gives the smallest result after each step. - ndim = variable.ndim - while axes_with_list_indices: - shape1 = data.shape - size1 = data.size - sizes = [ - len(index[i]) * size1 // shape1[i] - for i in axes_with_list_indices - ] - n = axes_with_list_indices.pop(np.argmin(sizes)) - - # Apply the subspace of for the chosen list/1-d array - # index - index2 = [slice(None)] * ndim - index2[n] = index[n] - data = data[tuple(index2)] - - # Apply any integer indices that will drop axes - index3 = [0 if isinstance(i, Integral) else slice(None) for i in index] - if index3: - data = data[tuple(index3)] + data = self._index(index) # Reset a netCDF4 variable's scale and mask behaviour if netCDF4_scale: @@ -408,6 +337,109 @@ def _default_FillValue(self, dtype): return default_fillvals[dtype.str[1:]] + def _index(self, index): + """Get a subspace of the variable with orthogonal indexing. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `__getitem__` + + :Parameter: + + index: + The indices that define the subspace. + + :Returns: + + `numpy.ndarray` + The subspace of the variable. + + """ + variable = self.variable + index = normalize_index(index, variable.shape) + + # Find the positions of any list/1-d array indices (which by + # now will contain only integers) + axes_with_list_indices = [ + n + for n, i in enumerate(index) + if isinstance(i, list) or getattr(i, "shape", False) + ] + + # Create an index that replaces integer indices with size 1 + # slices, so that their axes are not dropped yet (they will be + # dealt with later). + index0 = [ + slice(i, i + 1) if isinstance(i, Integral) else i for i in index + ] + + data = variable + if len(axes_with_list_indices) <= 1 or getattr( + variable, "__orthogonal_indexing__", False + ): + # There is at most one list/1-d array index, and/or the + # variable natively supports orthogonal indexing. + # + # Note: `netCDF4.Variable` supports orthogonal indexing; + # but `numpy.ndarray`, `h5netcdf.File` and + # `h5py.File` do not. + data = data[tuple(index0)] + else: + # There are two or more list/1-d array index, and the + # variable does not natively support orthogonal indexing. + # + # Emulate orthogonal indexing with a sequence of + # subspaces, one for each list/1-d array index. + + # 1) Apply the slice indices at the time as the list/1-d + # array index that gives the smallest result. + + # Create an index that replaces each list/1-d array with + # slice(None) + index1 = [ + i if isinstance(i, slice) else slice(None) for i in index0 + ] + + # Find the position of the list/1-d array index that gives + # the smallest result. 
+            shape1 = self.index_shape(index1, data.shape)
+            size1 = prod(shape1)
+            sizes = [
+                len(index[i]) * size1 // shape1[i]
+                for i in axes_with_list_indices
+            ]
+            n = axes_with_list_indices.pop(np.argmin(sizes))
+
+            # Apply the subspace of slices and the chosen list/1-d
+            # array index
+            index1[n] = index[n]
+            data = data[tuple(index1)]
+
+            # 2) Apply the rest of the list/1-d array indices, in the
+            # order that gives the smallest result after each step.
+            ndim = variable.ndim
+            while axes_with_list_indices:
+                shape1 = data.shape
+                size1 = data.size
+                sizes = [
+                    len(index[i]) * size1 // shape1[i]
+                    for i in axes_with_list_indices
+                ]
+                n = axes_with_list_indices.pop(np.argmin(sizes))
+
+                # Apply the subspace for the chosen list/1-d array
+                # index
+                index2 = [slice(None)] * ndim
+                index2[n] = index[n]
+                data = data[tuple(index2)]
+
+        # Apply any integer indices that will drop axes
+        index3 = [0 if isinstance(i, Integral) else slice(None) for i in index]
+        if index3:
+            data = data[tuple(index3)]
+
+        return data
+
     def _mask(self, data, dtype, attributes, dtype_unsigned_int):
         """Mask the data.
diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py
index 37c130c4d..1a2584675 100644
--- a/cfdm/docstring/docstring.py
+++ b/cfdm/docstring/docstring.py
@@ -451,6 +451,10 @@
              'scaleway-secretkey...',
              'endpoint_url': 'https://s3.fr-par.scw.cloud',
              'client_kwargs': {'region_name': 'fr-par'}}``""",
+    # attributes
+    "{{attributes: `dict` or `None`, optional}}": """attributes: `dict` or `None`, optional
+        Provide netCDF attributes for the data as a dictionary
+        of key/value pairs.""",
     # ----------------------------------------------------------------
     # Method description susbstitutions (4 levels of indentataion)
     # ----------------------------------------------------------------
diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py
index dce24b641..b18488425 100644
--- a/cfdm/read_write/netcdf/netcdfread.py
+++ b/cfdm/read_write/netcdf/netcdfread.py
@@ -6170,20 +6170,32 @@ def _create_netcdfarray(
         filename = g["variable_filename"][ncvar]

+        attributes = g["variable_attributes"][ncvar].copy()
+
         # Get the units and calendar (before we overwrite ncvar)
-        units = g["variable_attributes"][ncvar].get("units")
-        calendar = g["variable_attributes"][ncvar].get("calendar")
+        # units = g["variable_attributes"][ncvar].get("units")
+        # calendar = g["variable_attributes"][ncvar].get("calendar")
         if coord_ncvar is not None:
             # Get the Units from the parent coordinate variable, if
             # they've not already been set.
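            # A hedged illustration of this inheritance rule (the
            # variable names "time" and "time_bnds" are invented): a
            # "time_bnds" bounds variable with no units or calendar of
            # its own takes them from its parent "time" coordinate:
            #
            #   attributes = {}                     # from "time_bnds"
            #   parent = {"units": "days since 2000-01-01",
            #             "calendar": "360_day"}    # from "time"
            #   for attr in ("units", "calendar"):
            #       if attr not in attributes and attr in parent:
            #           attributes[attr] = parent[attr]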
- if units is None: + if "units" not in attributes: + # if units is None: + # units = g["vaariable_attributes"][coord_ncvar].get("units") units = g["variable_attributes"][coord_ncvar].get("units") - - if calendar is None: + if units is not None: + attributes["units"] = units + + if "calendar" not in attributes: + # if calendar is None: + # calendar = g["variable_attributes"][coord_ncvar].get( + # "calendar" + # ) calendar = g["variable_attributes"][coord_ncvar].get( "calendar" ) + if calendar is not None: + attributes["calendar"] = calendar kwargs = { "filename": filename, @@ -6192,9 +6204,10 @@ def _create_netcdfarray( "dtype": dtype, "mask": g["mask"], "unpack": g["unpack"], - "units": units, - "calendar": calendar, - "attributes": g["variable_attributes"][ncvar], + # "units": units, + # "calendar": calendar, + # "attributes": g["variable_attributes"][ncvar], + "attributes": attributes, "storage_options": g["file_system_storage_options"].get(filename), } @@ -6260,8 +6273,12 @@ def _create_data( return None filename = kwargs["filename"] - units = kwargs["units"] - calendar = kwargs["calendar"] + + attributes = kwargs["attributes"] + units = attributes.get("units") + calendar = attributes.get("calendar") + # units = kwargs["units"] + # calendar = kwargs["calendar"] compression = g["compression"] @@ -9277,10 +9294,13 @@ def _ugrid_create_domain_topology(self, parent_ncvar, f, mesh, location): copy=False, **{connectivity_attr: indices}, ) + attributes = kwargs["attributes"] data = self._create_Data( array, - units=kwargs["units"], - calendar=kwargs["calendar"], + # units=kwargs["units"], + # calendar=kwargs["calendar"], + units=attributes.get("units"), + calendar=attributes.get("calendar"), ncvar=connectivity_ncvar, ) else: @@ -9382,10 +9402,13 @@ def _ugrid_create_cell_connectivities( cell_dimension=cell_dimension, copy=False, ) + attributes = kwargs["attributes"] data = self._create_Data( array, - units=kwargs["units"], - calendar=kwargs["calendar"], + # units=kwargs["units"], + # calendar=kwargs["calendar"], + units=attributes.get("units"), + calendar=attributes.get("calendar"), ncvar=connectivity_ncvar, ) diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 2e45a9da9..971438593 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -248,9 +248,9 @@ def read( If True (the default) then unpack arrays by convention when the data is read from disk. - Unpacking is determined netCDF conventions for the - following attributes: ``add_offset``, ``scale_factor``, - and ``_Unsigned``. + Unpacking is determined by netCDF conventions for the + following variable attributes: ``add_offset``, + ``scale_factor``, and ``_Unsigned``. .. versionadded:: (cfdm) NEXTVERSION @@ -277,7 +277,8 @@ def read( .. versionadded:: (cfdm) 1.9.0.0 netcdf_eninge: `None` or `str`, optional - Specify which library to use for opening and reading + + Specify which library to use for the opening and reading netCDF files. By default, or if `None`, then the first one of `netCDF4` and `h5netcdf` to successfully open the file netCDF file is used. Setting *netcdf_engine* to one of @@ -288,21 +289,33 @@ def read( define subspaces of its data. See https://docs.h5py.org for details. - .. versionadded:: NEXTVERSION + .. versionadded:: (cfdm) NEXTVERSION storage_options: `dict` or `None`, optional - Key/value pairs to be passed on to the creation of - `s3fs.S3FileSystem` file systems to control the opening of - files in S3 object stores. Ignored for files not in an S3 - object store, i.e. 
those whose names do not start with - ``s3:``. + Pass parameters to the backend file system driver, such as + username, password, server, port, etc. How the storage + options are interpreted depends on the location of the + file: + + **Local File System** + + Storage options are ignored for local files. + + **HTTP(S)** + + Storage options are ignored for files available across the + network via OPeNDAP. + + **S3-compatible services** - By default, or if `None`, then *storage_options* is taken - as ``{}``. + The backend used is `s3fs`, and the storage options are + used to initialise an `s3fs.S3FileSystem` file system + object. By default, or if `None`, then *storage_options* + is taken as ``{}``. - If the ``'endpoint_url'`` key is not in *storage_options* + If the ``'endpoint_url'`` key is not in *storage_options*, or is not in a dictionary defined by the - ``'client_kwargs`` key (which is always the case when + ``'client_kwargs'`` key (both of which are the case when *storage_options* is `None`), then one will be automatically inserted for accessing an S3 file. For example, for a file name of ``'s3://store/data/file.nc'``, @@ -311,8 +324,8 @@ def read( *Parameter example:* For a file name of ``'s3://store/data/file.nc'``, the - following are equivalent: ``None``, ``{}``, and - ``{'endpoint_url': 'https://store'}``, + following are equivalent: ``None``, ``{}``, + ``{'endpoint_url': 'https://store'}``, and ``{'client_kwargs': {'endpoint_url': 'https://store'}}`` *Parameter example:* From 4335e02ae01a3040c6193ecf7a05524af737d45b Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 5 Apr 2024 15:53:27 +0100 Subject: [PATCH 56/88] dev --- cfdm/data/abstract/compressedarray.py | 4 +- cfdm/data/abstract/mesharray.py | 4 +- cfdm/data/h5netcdfarray.py | 52 +---- cfdm/data/mixin/arraymixin.py | 272 +++++++++++--------------- cfdm/data/netcdf4array.py | 31 ++- cfdm/data/netcdfindexer.py | 11 ++ cfdm/data/numpyarray.py | 6 +- cfdm/data/subsampledarray.py | 4 +- cfdm/docstring/docstring.py | 8 +- cfdm/read_write/netcdf/netcdfread.py | 24 +-- 10 files changed, 155 insertions(+), 261 deletions(-) diff --git a/cfdm/data/abstract/compressedarray.py b/cfdm/data/abstract/compressedarray.py index da5b1e67e..197f900e1 100644 --- a/cfdm/data/abstract/compressedarray.py +++ b/cfdm/data/abstract/compressedarray.py @@ -1,5 +1,6 @@ import numpy as np +from ..netcdfindexer import netcdf_indexer from .array import Array @@ -192,7 +193,8 @@ def __getitem__(self, indices): if indices is Ellipsis: return u - return self.get_subspace(u, indices, copy=True) + u = netcdf_indexer(u, mask=False, unpack=False) + return u[indices] def _first_or_last_element(self, indices): """Return the first or last element of the compressed array. 
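The same wrapper is useful purely as an orthogonal indexer over an in-memory array, which is how the `__getitem__` change above uses it, with masking and unpacking switched off. A short sketch with invented array contents:

    >>> import numpy as np
    >>> import cfdm
    >>> u = np.arange(24).reshape(2, 3, 4)
    >>> u = cfdm.netcdf_indexer(u, mask=False, unpack=False)
    >>> u[[0, 1], [0, 2], 0].shape  # two list indices, applied orthogonally
    (2, 2)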
diff --git a/cfdm/data/abstract/mesharray.py b/cfdm/data/abstract/mesharray.py index 0599540f8..fc76bb35a 100644 --- a/cfdm/data/abstract/mesharray.py +++ b/cfdm/data/abstract/mesharray.py @@ -3,6 +3,7 @@ import numpy as np +from ..netcdfindexer import netcdf_indexer from .compressedarray import CompressedArray @@ -151,7 +152,8 @@ def __getitem__(self, indices): if indices is Ellipsis: return u - return self.get_subspace(u, indices, copy=False) + u = netcdf_indexer(u, mask=False, unpack=False) + return u[indices] @property def dtype(self): diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index 5bceaa884..00a52d864 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -29,8 +29,6 @@ def __init__( shape=None, mask=True, unpack=True, - # units=False, - # calendar=False, attributes=None, storage_options=None, source=None, @@ -61,39 +59,15 @@ def __init__( ndim: `int` The number of array dimensions in the file. - mask: `bool` - If True (the default) then mask by convention when - reading data from disk. + {{init mask: `bool`, optional}} - A netCDF array is masked depending on the values of - any of the netCDF attributes ``_FillValue``, - ``missing_value``, ``_Unsigned``, ``valid_min``, - ``valid_max``, and ``valid_range``. + {{init unpack: `bool`, optional}} - unpack: `bool` - If True (the default) then unpack by convention when - reading data from disk. - - A netCDF array is unpacked depending on the values of - the netCDF attributes ``add_offset`` and - ``scale_factor``. - - units: `str` or `None`, optional - The units of the variable. Set to `None` to indicate - that there are no units. If unset then the units will - be set during the first `__getitem__` call. - - calendar: `str` or `None`, optional - The calendar of the variable. By default, or if set to - `None`, then the CF default calendar is assumed, if - applicable. If unset then the calendar will be set - during the first `__getitem__` call. - - {{attributes: `dict` or `None`, optional}} + {{init attributes: `dict` or `None`, optional}} If *attributes* is `None`, the default, then the - attributes will be set from *variable* during the - first `__getitem__` call. + netCDF attributes will be set from the netCDF variable + during the first `__getitem__` call. .. versionadded:: (cfdm) NEXTRELEASE @@ -137,16 +111,6 @@ def __init__( except AttributeError: unpack = True - # try: - # units = source._get_component("units", False) - # except AttributeError: - # units = False - # - # try: - # calendar = source._get_component("calendar", False) - # except AttributeError: - # calendar = False - try: attributes = source._get_component("attributes", None) except AttributeError: @@ -181,8 +145,6 @@ def __init__( self._set_component("dtype", dtype, copy=False) self._set_component("mask", bool(mask), copy=False) self._set_component("unpack", bool(unpack), copy=False) - # self._set_component("units", units, copy=False) - # self._set_component("calendar", calendar, copy=False) self._set_component("storage_options", storage_options, copy=False) self._set_component("attributes", attributes, copy=False) @@ -219,10 +181,6 @@ def __getitem__(self, indices): # Set the attributes, if they haven't been set already. self._set_attributes(variable) - # # Set the units, if they haven't been set already (do this - # # after setting the attributes). 
- # self._set_units(variable) - self.close(dataset0) del dataset, dataset0 diff --git a/cfdm/data/mixin/arraymixin.py b/cfdm/data/mixin/arraymixin.py index 7276b635c..20a666efb 100644 --- a/cfdm/data/mixin/arraymixin.py +++ b/cfdm/data/mixin/arraymixin.py @@ -1,7 +1,5 @@ from copy import deepcopy -import numpy as np - class ArrayMixin: """Mixin class for a container of an array. @@ -78,31 +76,6 @@ def __docstring_package_depth__(self): """ return 0 - # def _set_units(self): - # """The units and calendar properties. - # - # These are the values set during initialisation, defaulting to - # `None` if either was not set at that time. - # - # .. versionadded:: (cfdm) 1.10.1.0 - # - # :Returns: - # - # `tuple` - # The units and calendar values, either of which may be - # `None`. - # - # """ - # units = self.get_units(False) - # if units is False: - # self._set_component("units", None, copy=False) - # - # calendar = self.get_calendar(False) - # if calendar is False: - # self._set_component("calendar", None, copy=False) - # - # return units, calendar - def get_attributes(self, default=ValueError()): """The attributes of the array. @@ -155,8 +128,7 @@ def get_calendar(self, default=ValueError()): """ attributes = self.get_attributes({}) - calendar = attributes.get("calendar") - if calendar is None: + if "calendar" not in attributes: if default is None: return @@ -165,19 +137,7 @@ def get_calendar(self, default=ValueError()): f"{self.__class__.__name__} 'calendar' has not been set", ) - return calendar - - # calendar = self._get_component("calendar", False) - # if calendar is False: - # if default is None: - # return - # - # return self._default( - # default, - # f"{self.__class__.__name__} 'calendar' has not been set", - # ) - # - # return calendar + return attributes["calendar"] def get_compression_type(self): """Returns the array's compression type. @@ -207,111 +167,111 @@ def get_compression_type(self): """ return self._get_component("compression_type", "") - @classmethod - def get_subspace(cls, array, indices, copy=True): - """Return a subspace, defined by indices, of a numpy array. - - Only certain type of indices are allowed. See the *indices* - parameter for details. - - Indexing is similar to numpy indexing. Given the restrictions on - the type of indices allowed - see the *indicies* parameter - the - only difference to numpy indexing is - - * When two or more dimension's indices are sequences of integers - then these indices work independently along each dimension - (similar to the way vector subscripts work in Fortran). - - .. versionadded:: (cfdm) 1.8.7.0 - - :Parameters: - - array: `numpy.ndarray` - The array to be subspaced. - - indices: - The indices that define the subspace. - - Must be either `Ellipsis` or a sequence that contains an - index for each dimension. In the latter case, each - dimension's index must either be a `slice` object or a - sequence of two or more integers. - - *Parameter example:* - indices=Ellipsis - - *Parameter example:* - indices=[[5, 7, 8]] - - *Parameter example:* - indices=[slice(4, 7)] - - *Parameter example:* - indices=[slice(None), [5, 7, 8]] - - *Parameter example:* - indices=[[2, 5, 6], slice(15, 4, -2), [8, 7, 5]] - - copy: `bool` - If `False` then the returned subspace may (or may not) be - independent of the input *array*. By default the returned - subspace is independent of the input *array*. 
- - :Returns: - - `numpy.ndarray` - - """ - if indices is not Ellipsis: - if not isinstance(indices, tuple): - indices = (indices,) - - axes_with_list_indices = [ - i for i, x in enumerate(indices) if not isinstance(x, slice) - ] - n_axes_with_list_indices = len(axes_with_list_indices) - - if n_axes_with_list_indices < 2: - # ---------------------------------------------------- - # At most one axis has a list-of-integers index so we - # can do a normal numpy subspace - # ---------------------------------------------------- - array = array[tuple(indices)] - else: - # ---------------------------------------------------- - # At least two axes have list-of-integers indices so - # we can't do a normal numpy subspace - # ---------------------------------------------------- - n_indices = len(indices) - if n_axes_with_list_indices < n_indices: - # Apply subspace defined by slices - slices = [ - i if isinstance(i, slice) else slice(None) - for i in indices - ] - array = array[tuple(slices)] - - if n_axes_with_list_indices: - # Apply subspaces defined by lists (this - # methodology works for both numpy arrays and - # scipy sparse arrays). - lists = [slice(None)] * n_indices - for axis in axes_with_list_indices: - lists[axis] = indices[axis] - array = array[tuple(lists)] - lists[axis] = slice(None) - - if copy: - if np.ma.isMA(array) and not array.ndim: - # This is because numpy.ma.copy doesn't work for - # scalar arrays (at the moment, at least) - ma_array = np.ma.empty((), dtype=array.dtype) - ma_array[...] = array - array = ma_array - else: - array = array.copy() - - return array + # @classmethod + # def get_subspace(cls, array, indices, copy=True): + # """Return a subspace, defined by indices, of a numpy array. + # + # Only certain type of indices are allowed. See the *indices* + # parameter for details. + # + # Indexing is similar to numpy indexing. Given the restrictions on + # the type of indices allowed - see the *indicies* parameter - the + # only difference to numpy indexing is + # + # * When two or more dimension's indices are sequences of integers + # then these indices work independently along each dimension + # (similar to the way vector subscripts work in Fortran). + # + # .. versionadded:: (cfdm) 1.8.7.0 + # + # :Parameters: + # + # array: `numpy.ndarray` + # The array to be subspaced. + # + # indices: + # The indices that define the subspace. + # + # Must be either `Ellipsis` or a sequence that contains an + # index for each dimension. In the latter case, each + # dimension's index must either be a `slice` object or a + # sequence of two or more integers. + # + # *Parameter example:* + # indices=Ellipsis + # + # *Parameter example:* + # indices=[[5, 7, 8]] + # + # *Parameter example:* + # indices=[slice(4, 7)] + # + # *Parameter example:* + # indices=[slice(None), [5, 7, 8]] + # + # *Parameter example:* + # indices=[[2, 5, 6], slice(15, 4, -2), [8, 7, 5]] + # + # copy: `bool` + # If `False` then the returned subspace may (or may not) be + # independent of the input *array*. By default the returned + # subspace is independent of the input *array*. 
+ # + # :Returns: + # + # `numpy.ndarray` + # + # """ + # if indices is not Ellipsis: + # if not isinstance(indices, tuple): + # indices = (indices,) + # + # axes_with_list_indices = [ + # i for i, x in enumerate(indices) if not isinstance(x, slice) + # ] + # n_axes_with_list_indices = len(axes_with_list_indices) + # + # if n_axes_with_list_indices < 2: + # # ---------------------------------------------------- + # # At most one axis has a list-of-integers index so we + # # can do a normal numpy subspace + # # ---------------------------------------------------- + # array = array[tuple(indices)] + # else: + # # ---------------------------------------------------- + # # At least two axes have list-of-integers indices so + # # we can't do a normal numpy subspace + # # ---------------------------------------------------- + # n_indices = len(indices) + # if n_axes_with_list_indices < n_indices: + # # Apply subspace defined by slices + # slices = [ + # i if isinstance(i, slice) else slice(None) + # for i in indices + # ] + # array = array[tuple(slices)] + # + # if n_axes_with_list_indices: + # # Apply subspaces defined by lists (this + # # methodology works for both numpy arrays and + # # scipy sparse arrays). + # lists = [slice(None)] * n_indices + # for axis in axes_with_list_indices: + # lists[axis] = indices[axis] + # array = array[tuple(lists)] + # lists[axis] = slice(None) + # + # if copy: + # if np.ma.isMA(array) and not array.ndim: + # # This is because numpy.ma.copy doesn't work for + # # scalar arrays (at the moment, at least) + # ma_array = np.ma.empty((), dtype=array.dtype) + # ma_array[...] = array + # array = ma_array + # else: + # array = array.copy() + # + # return array def get_units(self, default=ValueError()): """The units of the array. @@ -336,8 +296,7 @@ def get_units(self, default=ValueError()): """ attributes = self.get_attributes({}) - units = attributes.get("units") - if units is None: + if "units" not in attributes: if default is None: return @@ -346,17 +305,4 @@ def get_units(self, default=ValueError()): f"{self.__class__.__name__} 'units' have not been set", ) - return units - - -# units = self._get_component("units", False) -# if units is False: -# if default is None: -# return -# -# return self._default( -# default, -# f"{self.__class__.__name__} 'units' have not been set", -# ) -# -# return units + return attributes["units"] diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index cf1dc3a62..f9a17cfd7 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -20,8 +20,6 @@ def __init__( shape=None, mask=True, unpack=True, - # units=False, - # calendar=False, attributes=None, storage_options=None, source=None, @@ -63,26 +61,11 @@ def __init__( .. versionadded:: (cfdm) NEXTVERSION - units: `str` or `None`, optional - The units of the netCDF variable. Set to `None` to - indicate that there are no units. If unset then the - units will be set during the first `__getitem__` call. - - .. versionadded:: (cfdm) 1.10.0.1 - - calendar: `str` or `None`, optional - The calendar of the netCDF variable. By default, or if - set to `None`, then the CF default calendar is - assumed, if applicable. If unset then the calendar - will be set during the first `__getitem__` call. - - .. versionadded:: (cfdm) 1.10.0.1 - - {{attributes: `dict` or `None`, optional}} + {{init attributes: `dict` or `None`, optional}} If *attributes* is `None`, the default, then the - attributes will be set from *variable* during the - first `__getitem__` call. 
+ netCDF attributes will be set from the netCDF variable + during the first `__getitem__` call. .. versionadded:: (cfdm) NEXTRELEASE @@ -112,6 +95,14 @@ def __init__( group: Deprecated at version 1.10.1.0 Use the *address* parameter instead. + units: `str` or `None`, optional + Deprecated at version NEXTRELEASE. Use the + *attributes* parameter instead. + + calendar: `str` or `None`, optional + Deprecated at version NEXTRELEASE. Use the + *attributes* parameter instead. + """ super().__init__(source=source, copy=copy) diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index e14c56b90..62af5e438 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -117,6 +117,7 @@ def __init__( unpack=True, always_masked_array=False, attributes=None, + copy=False, ): """**Initialisation** @@ -170,6 +171,7 @@ def __init__( self.unpack = bool(unpack) self.always_masked_array = bool(always_masked_array) self._attributes = attributes + self._copy = copy def __getitem__(self, index): """Return a subspace of the variable as a `numpy` array. @@ -258,6 +260,12 @@ def __getitem__(self, index): if data.dtype.kind == "S": data = data.astype("U", copy=False) + # ------------------------------------------------------------ + # Copy the data + # ------------------------------------------------------------ + if self._copy: + data = data.copy() + return data def __orthogonal_indexing__(self): @@ -694,18 +702,21 @@ def _unpack(self, data, attributes): # scale_factor and add_offset if add_offset != 0.0 or scale_factor != 1.0: data = data * scale_factor + add_offset + self._copy = False else: data = data.astype(np.array(scale_factor).dtype) else: # scale_factor with no add_offset if scale_factor != 1.0: data = data * scale_factor + self._copy = False else: data = data.astype(scale_factor.dtype) elif add_offset is not None: # add_offset with no scale_factor if add_offset != 0.0: data = data + add_offset + self._copy = False else: data = data.astype(np.array(add_offset).dtype) diff --git a/cfdm/data/numpyarray.py b/cfdm/data/numpyarray.py index 7c26af684..10684a810 100644 --- a/cfdm/data/numpyarray.py +++ b/cfdm/data/numpyarray.py @@ -1,5 +1,6 @@ from .. import core from .mixin import ArrayMixin +from .netcdfindexer import netcdf_indexer class NumpyArray(ArrayMixin, core.NumpyArray): @@ -31,9 +32,10 @@ def __getitem__(self, indices): .. versionadded:: (cfdm) 1.7.0 """ - return self.get_subspace( - self._get_component("array"), indices, copy=True + array = netcdf_indexer( + self._get_component("array"), mask=False, unpack=False, copy=True ) + return array[indices] def to_memory(self): """Bring data on disk into memory. diff --git a/cfdm/data/subsampledarray.py b/cfdm/data/subsampledarray.py index 15c68bd0a..55ef28ef5 100644 --- a/cfdm/data/subsampledarray.py +++ b/cfdm/data/subsampledarray.py @@ -6,6 +6,7 @@ from ..core.utils import cached_property from .abstract import CompressedArray +from .netcdfindexer import netcdf_indexer from .subarray import ( BiLinearSubarray, BiQuadraticLatitudeLongitudeSubarray, @@ -405,7 +406,8 @@ def __getitem__(self, indices): if indices is Ellipsis: return u - return self.get_subspace(u, indices, copy=True) + u = netcdf_indexer(u, mask=False, unpack=False) + return u[indices] def _conformed_dependent_tie_points(self): """Return the dependent tie points. 
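The hunks above complete a pattern that recurs through this patch: in-memory subspacing that previously went through `Array.get_subspace` is now routed through `netcdf_indexer`, with masking and unpacking switched off and, for `NumpyArray`, an in-memory copy requested via the new `copy` keyword. A minimal sketch of that pattern, assuming the class is importable as `cfdm.netcdf_indexer` (as the test modules later in the series do) and using illustrative array values:

    >>> import numpy as np
    >>> import cfdm
    >>> array = np.arange(12).reshape(3, 4)
    >>> x = cfdm.netcdf_indexer(array, mask=False, unpack=False, copy=True)
    >>> x[[0, 2], [1, 3]]
    array([[ 1,  3],
           [ 9, 11]])

Indexing is orthogonal at this point in the series, so the two list indices apply independently and give a 2x2 result where plain `numpy` fancy indexing would give a 1-d array; `copy=True` guarantees that the returned subspace is detached from `array` even when no masking or unpacking step happens to make a copy as a side effect.
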
diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index 1a2584675..b7e0dcea4 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -419,6 +419,10 @@ A netCDF array is unpacked depending on the values of the netCDF attributes ``add_offset`` and ``scale_factor``.""", + # init attributes + "{{init attributes: `dict` or `None`, optional}}": """attributes: `dict` or `None`, optional + Provide netCDF attributes for the data as a dictionary + of key/value pairs.""", # init storage_options "{{init storage_options: `dict` or `None`, optional}}": """storage_options: `dict` or `None`, optional Key/value pairs to be passed on to the creation of @@ -451,10 +455,6 @@ 'scaleway-secretkey...', 'endpoint_url': 'https://s3.fr-par.scw.cloud', 'client_kwargs': {'region_name': 'fr-par'}}``""", - # attributes - "{{attributes: `dict` or `None`, optional}}": """attributes: `dict` or `None`, optional - Provide netCDF attributes for the data as a dictionary - of key/value pairs.""", # ---------------------------------------------------------------- # Method description susbstitutions (4 levels of indentataion) # ---------------------------------------------------------------- diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index b18488425..9428dee0d 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -1505,12 +1505,12 @@ def read( # size from the original grouped dataset, because # unlimited dimensions have size 0 in the flattened # dataset (because it contains no data) (v1.8.8.1) - group, ncdim = self._netCDF4_group( # h5netcdf + group, ncdim = self._netCDF4_group( # TODO h5netcdf ? g["nc_grouped"], flattener_dimensions[name] ) internal_dimension_sizes[name] = group.dimensions[ ncdim - ].size # h5netcdf + ].size # TODO h5netcdf ? else: internal_dimension_sizes[name] = dimension.size @@ -6171,26 +6171,15 @@ def _create_netcdfarray( filename = g["variable_filename"][ncvar] attributes = g["variable_attributes"][ncvar].copy() - - # Get the units and calendar (before we overwrite ncvar) - # units = g["variable_attributes"][ncvar].get("units") - # calendar = g["variable_attributes"][ncvar].get("calendar") - if coord_ncvar is not None: # Get the Units from the parent coordinate variable, if # they've not already been set. 
if "units" not in attributes: - # if units is None: - # units = g["vaariable_attributes"][coord_ncvar].get("units") units = g["variable_attributes"][coord_ncvar].get("units") if units is not None: attributes["units"] = units if "calendar" not in attributes: - # if calendar is None: - # calendar = g["variable_attributes"][coord_ncvar].get( - # "calendar" - # ) calendar = g["variable_attributes"][coord_ncvar].get( "calendar" ) @@ -6204,9 +6193,6 @@ def _create_netcdfarray( "dtype": dtype, "mask": g["mask"], "unpack": g["unpack"], - # "units": units, - # "calendar": calendar, - # "attributes": g["variable_attributes"][ncvar], "attributes": attributes, "storage_options": g["file_system_storage_options"].get(filename), } @@ -6277,8 +6263,6 @@ def _create_data( attributes = kwargs["attributes"] units = attributes.get("units") calendar = attributes.get("calendar") - # units = kwargs["units"] - # calendar = kwargs["calendar"] compression = g["compression"] @@ -9297,8 +9281,6 @@ def _ugrid_create_domain_topology(self, parent_ncvar, f, mesh, location): attributes = kwargs["attributes"] data = self._create_Data( array, - # units=kwargs["units"], - # calendar=kwargs["calendar"], units=attributes.get("units"), calendar=attributes.get("calendar"), ncvar=connectivity_ncvar, @@ -9405,8 +9387,6 @@ def _ugrid_create_cell_connectivities( attributes = kwargs["attributes"] data = self._create_Data( array, - # units=kwargs["units"], - # calendar=kwargs["calendar"], units=attributes.get("units"), calendar=attributes.get("calendar"), ncvar=connectivity_ncvar, From 4283f449fe524270819b4b2d0d43d9f09c00ddf1 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 5 Apr 2024 16:41:38 +0100 Subject: [PATCH 57/88] dev --- cfdm/data/h5netcdfarray.py | 4 ++-- cfdm/data/netcdf4array.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index 00a52d864..b31d32e90 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -66,8 +66,8 @@ def __init__( {{init attributes: `dict` or `None`, optional}} If *attributes* is `None`, the default, then the - netCDF attributes will be set from the netCDF variable - during the first `__getitem__` call. + attributes will be set from the netCDF variable during + the first `__getitem__` call. .. versionadded:: (cfdm) NEXTRELEASE diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index f9a17cfd7..29d4b2efc 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -64,8 +64,8 @@ def __init__( {{init attributes: `dict` or `None`, optional}} If *attributes* is `None`, the default, then the - netCDF attributes will be set from the netCDF variable - during the first `__getitem__` call. + attributes will be set from the netCDF variable during + the first `__getitem__` call. .. 
versionadded:: (cfdm) NEXTRELEASE From 889490ac167c85739acda4a61b1e0c665b51f386 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 8 Apr 2024 19:12:30 +0100 Subject: [PATCH 58/88] dev --- Changelog.rst | 9 ++- README.md | 2 +- cfdm/__init__.py | 15 +++- cfdm/core/functions.py | 6 +- cfdm/data/abstract/compressedarray.py | 12 ++- cfdm/data/abstract/mesharray.py | 12 ++- cfdm/data/h5netcdfarray.py | 2 + cfdm/data/mixin/arraymixin.py | 106 -------------------------- cfdm/data/netcdf4array.py | 19 +---- cfdm/data/netcdfindexer.py | 87 ++++++++++++++------- cfdm/data/numpyarray.py | 7 +- cfdm/data/subsampledarray.py | 12 ++- cfdm/functions.py | 15 ++-- cfdm/read_write/read.py | 23 +++--- cfdm/test/test_netcdf_indexer.py | 33 +++++++- requirements.txt | 4 +- 16 files changed, 170 insertions(+), 194 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index 1b898dce8..b5d4e4b15 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -3,16 +3,19 @@ Version NEXT **2024-??-??** -* New function `cfdm.netcdf_flattener` +* New function `cfdm.netcdf_flattener` that replaces the + `netcdf_flattener` import (https://github.com/NCAS-CMS/cfdm/issues/286) * Allow access to netCDF-4 files in S3 object stores (https://github.com/NCAS-CMS/cfdm/issues/285) +* Allow a choice of netCDF engines + (https://github.com/NCAS-CMS/cfdm/issues/285) * New class `cfdm.H5netcdfArray` * New class `cfdm.NetCDFIndexer` * New dependency: ``h5netcdf>=1.3.0`` * New dependency: ``h5py>=3.10.0`` -* New dependency: ``s3fs>=2024.2.0`` -* New dependency: ``dask>=2024.2.1`` +* New dependency: ``s3fs>=2024.3.0`` +* New dependency: ``dask>=2024.4.1`` * Removed dependency: ``netcdf_flattener`` ---- diff --git a/README.md b/README.md index 9bd31953e..c54e042a6 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ inspecting it: The ``cfdm`` package can: * read field and domain constructs from netCDF and CDL datasets with a - choice of netCDF backends, + choice of netCDF engines, * create new field and domain constructs in memory, * write and append field and domain constructs to netCDF datasets on disk, * read, write, and manipulate UGRID mesh topologies, diff --git a/cfdm/__init__.py b/cfdm/__init__.py index e8859e239..78cb6646e 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -116,7 +116,7 @@ except ImportError as error1: raise ImportError(_error0 + str(error1)) -_minimum_vn = "2024.2.0" +_minimum_vn = "2024.3.0" if Version(s3fs.__version__) < Version(_minimum_vn): raise ValueError( f"Bad s3fs version: cfdm requires s3fs>={_minimum_vn}. " @@ -136,6 +136,19 @@ f"Got {scipy.__version__} at {scipy.__file__}" ) +# Check the version of dask +try: + import dask +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + +_minimum_vn = "2024.4.0" +if Version(dask.__version__) < Version(_minimum_vn): + raise ValueError( + f"Bad scipy version: cfdm requires dask>={_minimum_vn}. " + f"Got {dask.__version__} at {dask.__file__}" + ) + from .constants import masked # Internal ones passed on so they can be used in cf-python (see diff --git a/cfdm/core/functions.py b/cfdm/core/functions.py index 48bdb9f8d..65f5daa36 100644 --- a/cfdm/core/functions.py +++ b/cfdm/core/functions.py @@ -3,9 +3,6 @@ import sys from pickle import dumps, loads -import numpy as np -import packaging - from . 
import __cf_version__, __file__, __version__ @@ -42,6 +39,9 @@ def environment(display=True, paths=True): cfdm.core: NEXTVERSION """ + import numpy as np + import packaging + dependency_version_paths_mapping = { "Platform": (platform.platform(), ""), "Python": (platform.python_version(), sys.executable), diff --git a/cfdm/data/abstract/compressedarray.py b/cfdm/data/abstract/compressedarray.py index 197f900e1..c8c387c86 100644 --- a/cfdm/data/abstract/compressedarray.py +++ b/cfdm/data/abstract/compressedarray.py @@ -190,10 +190,14 @@ def __getitem__(self, indices): ) u[u_indices] = subarray[...] - if indices is Ellipsis: - return u - - u = netcdf_indexer(u, mask=False, unpack=False) + u = netcdf_indexer( + u, + mask=False, + unpack=False, + always_masked_array=False, + orthogonal_indexing=True, + copy=False, + ) return u[indices] def _first_or_last_element(self, indices): diff --git a/cfdm/data/abstract/mesharray.py b/cfdm/data/abstract/mesharray.py index fc76bb35a..0c7831c42 100644 --- a/cfdm/data/abstract/mesharray.py +++ b/cfdm/data/abstract/mesharray.py @@ -149,10 +149,14 @@ def __getitem__(self, indices): # future reference. self._set_component("shape", u.shape, copy=False) - if indices is Ellipsis: - return u - - u = netcdf_indexer(u, mask=False, unpack=False) + u = netcdf_indexer( + u, + mask=False, + unpack=False, + always_masked_array=False, + orthogonal_indexing=True, + copy=False, + ) return u[indices] @property diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index b31d32e90..ba5beae82 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -175,6 +175,8 @@ def __getitem__(self, indices): mask=self.get_mask(), unpack=self.get_unpack(), always_masked_array=False, + orthogonal_indexing=True, + copy=False, ) array = array[indices] diff --git a/cfdm/data/mixin/arraymixin.py b/cfdm/data/mixin/arraymixin.py index 20a666efb..f3b1f8e4e 100644 --- a/cfdm/data/mixin/arraymixin.py +++ b/cfdm/data/mixin/arraymixin.py @@ -167,112 +167,6 @@ def get_compression_type(self): """ return self._get_component("compression_type", "") - # @classmethod - # def get_subspace(cls, array, indices, copy=True): - # """Return a subspace, defined by indices, of a numpy array. - # - # Only certain type of indices are allowed. See the *indices* - # parameter for details. - # - # Indexing is similar to numpy indexing. Given the restrictions on - # the type of indices allowed - see the *indicies* parameter - the - # only difference to numpy indexing is - # - # * When two or more dimension's indices are sequences of integers - # then these indices work independently along each dimension - # (similar to the way vector subscripts work in Fortran). - # - # .. versionadded:: (cfdm) 1.8.7.0 - # - # :Parameters: - # - # array: `numpy.ndarray` - # The array to be subspaced. - # - # indices: - # The indices that define the subspace. - # - # Must be either `Ellipsis` or a sequence that contains an - # index for each dimension. In the latter case, each - # dimension's index must either be a `slice` object or a - # sequence of two or more integers. - # - # *Parameter example:* - # indices=Ellipsis - # - # *Parameter example:* - # indices=[[5, 7, 8]] - # - # *Parameter example:* - # indices=[slice(4, 7)] - # - # *Parameter example:* - # indices=[slice(None), [5, 7, 8]] - # - # *Parameter example:* - # indices=[[2, 5, 6], slice(15, 4, -2), [8, 7, 5]] - # - # copy: `bool` - # If `False` then the returned subspace may (or may not) be - # independent of the input *array*. 
By default the returned - # subspace is independent of the input *array*. - # - # :Returns: - # - # `numpy.ndarray` - # - # """ - # if indices is not Ellipsis: - # if not isinstance(indices, tuple): - # indices = (indices,) - # - # axes_with_list_indices = [ - # i for i, x in enumerate(indices) if not isinstance(x, slice) - # ] - # n_axes_with_list_indices = len(axes_with_list_indices) - # - # if n_axes_with_list_indices < 2: - # # ---------------------------------------------------- - # # At most one axis has a list-of-integers index so we - # # can do a normal numpy subspace - # # ---------------------------------------------------- - # array = array[tuple(indices)] - # else: - # # ---------------------------------------------------- - # # At least two axes have list-of-integers indices so - # # we can't do a normal numpy subspace - # # ---------------------------------------------------- - # n_indices = len(indices) - # if n_axes_with_list_indices < n_indices: - # # Apply subspace defined by slices - # slices = [ - # i if isinstance(i, slice) else slice(None) - # for i in indices - # ] - # array = array[tuple(slices)] - # - # if n_axes_with_list_indices: - # # Apply subspaces defined by lists (this - # # methodology works for both numpy arrays and - # # scipy sparse arrays). - # lists = [slice(None)] * n_indices - # for axis in axes_with_list_indices: - # lists[axis] = indices[axis] - # array = array[tuple(lists)] - # lists[axis] = slice(None) - # - # if copy: - # if np.ma.isMA(array) and not array.ndim: - # # This is because numpy.ma.copy doesn't work for - # # scalar arrays (at the moment, at least) - # ma_array = np.ma.empty((), dtype=array.dtype) - # ma_array[...] = array - # array = ma_array - # else: - # array = array.copy() - # - # return array - def get_units(self, default=ValueError()): """The units of the array. diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index 29d4b2efc..3619aa905 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -137,16 +137,6 @@ def __init__( except AttributeError: unpack = True - # try: - # units = source._get_component("units", False) - # except AttributeError: - # units = False - # - # try: - # calendar = source._get_component("calendar", False) - # except AttributeError: - # calendar = False - try: attributes = source._get_component("attributes", None) except AttributeError: @@ -181,8 +171,6 @@ def __init__( self._set_component("dtype", dtype, copy=False) self._set_component("mask", bool(mask), copy=False) self._set_component("unpack", bool(unpack), copy=False) - # self._set_component("units", units, copy=False) - # self._set_component("calendar", calendar, copy=False) self._set_component("storage_options", storage_options, copy=False) self._set_component("attributes", attributes, copy=False) @@ -233,15 +221,14 @@ def __getitem__(self, indices): mask=self.get_mask(), unpack=self.get_unpack(), always_masked_array=False, + orthogonal_indexing=True, + copy=False, ) array = array[indices] - # Set the units, if they haven't been set already. + # Set the attributes, if they haven't been set already. self._set_attributes(variable) - # # Set the units, if they haven't been set already. 
- # self._set_units(variable) - self.close(dataset) del netcdf, dataset diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 62af5e438..d2c2da429 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -34,11 +34,13 @@ class netcdf_indexer: """A data indexer that also applies netCDF masking and unpacking. - Indexing is orthogonal, meaning that the index for each dimension - is applied independently, regardless of how that index was - defined. For instance, the indices ``[[0, 1], [1, 3], 0]`` and - ``[:2, 1:4:2, 0]`` will give identical results. Note that this - behaviour is different to that of `numpy`. + Indexing may be orthogonal or non-orthogonal. Orthogonal indexing + means that the index for each dimension is applied independently, + regardless of how that index was defined. For instance, the + indices ``[[0, 1], [1, 3], 0]`` and ``[:2, 1:4:2, 0]`` will give + identical results. This behaviour is different to that of + `numpy`. Non-orthogonal indexing means that normal `numpy` + indexing rules are applied. During indexing, masking and unpacking is applied according to the netCDF conventions, either or both of which may be disabled via @@ -116,6 +118,7 @@ def __init__( mask=True, unpack=True, always_masked_array=False, + orthogonal_indexing=False, attributes=None, copy=False, ): @@ -155,6 +158,16 @@ def __init__( by indexing is always a masked `numpy` array, even if there are no missing values. + orthogonal_indexing: `bool`, optional + If True then indexing is orthogonal, meaning that the + index for each dimension is applied independently, + regardless of how that index was defined. For + instance, the indices ``[[0, 1], [1, 3], 0]`` and + ``[:2, 1:4:2, 0]`` will give identical results. This + behaviour is different to that of `numpy`. If False, + the default, then normal `numpy` indexing rules are + applied. + attributes: `dict`, optional Provide netCDF attributes for the *variable* as a dictionary key/value pairs. Only the attributes @@ -165,25 +178,29 @@ def __init__( *attributes* is not `None`, then any netCDF attributes stored by *variable* itself are ignored. + copy: `bool`, optional + If True then return a copy of the subspace that is not + a view of part of the the original data. If False, the + default, then the returned subspace could be either a + copy or a view. + """ self.variable = variable self.mask = bool(mask) self.unpack = bool(unpack) self.always_masked_array = bool(always_masked_array) self._attributes = attributes - self._copy = copy + self._copy = bool(copy) + self._orthogonal_indexing = bool(orthogonal_indexing) def __getitem__(self, index): """Return a subspace of the variable as a `numpy` array. v.__getitem__(index) <==> v[index] - Indexing is orthogonal, meaning that the index for each - dimension is applied independently, regardless of how that - index was defined. For instance, the indices ``[[0, 1], [1, - 3], 0]`` and ``[:2, 1:4:2, 0]`` will give identical - results. Note that this behaviour is different to that of - `numpy`. + If `__orthogonal_indexing__` is True then indexing is + orthogonal. If `__orthogonal_indexing__` is False then normal + `numpy` indexing rules are applied. .. versionadded:: (cfdm) NEXTVERSION @@ -268,13 +285,14 @@ def __getitem__(self, index): return data + @property def __orthogonal_indexing__(self): - """Flag to indicate that orthogonal indexing is supported. + """Flag to indicate whether indexing is orthogonal. .. 
versionadded:: (cfdm) NEXTVERSION """ - return True + return self._orthogonal_indexing def _check_safecast(self, attr, dtype, attributes): """Check an attribute's data type. @@ -346,7 +364,7 @@ def _default_FillValue(self, dtype): return default_fillvals[dtype.str[1:]] def _index(self, index): - """Get a subspace of the variable with orthogonal indexing. + """Get a subspace of the variable. .. versionadded:: (cfdm) NEXTVERSION @@ -363,8 +381,11 @@ def _index(self, index): The subspace of the variable. """ - variable = self.variable - index = normalize_index(index, variable.shape) + data = self.variable + if index is Ellipsis: + return data[...] + + index = normalize_index(index, data.shape) # Find the positions of any list/1-d array indices (which by # now will contain only integers) @@ -374,6 +395,24 @@ def _index(self, index): if isinstance(i, list) or getattr(i, "shape", False) ] + data_orthogonal_indexing = getattr( + data, "__orthogonal_indexing__", False + ) + if not self.__orthogonal_indexing__: + # -------------------------------------------------------- + # Do non-orthogonal indexing + # -------------------------------------------------------- + if data_orthogonal_indexing and len(axes_with_list_indices) > 1: + raise IndexError( + "Can't non-orthogonally index a " + f"{data.__class__.__name__} object with index {index!r}" + ) + + return data[index] + + # ------------------------------------------------------------ + # Still here? Then do orthogonal indexing. + # ------------------------------------------------------------ # Create an index that replaces integer indices with size 1 # slices, so that their axes are not dropped yet (they will be # dealt with later). @@ -381,15 +420,12 @@ def _index(self, index): slice(i, i + 1) if isinstance(i, Integral) else i for i in index ] - data = variable - if len(axes_with_list_indices) <= 1 or getattr( - variable, "__orthogonal_indexing__", False - ): + if data_orthogonal_indexing or len(axes_with_list_indices) <= 1: # There is at most one list/1-d array index, and/or the # variable natively supports orthogonal indexing. # - # Note: `netCDF4.Variable` supports orthogonal indexing; - # but `numpy.ndarray`, `h5netcdf.File` and + # Note: `netCDF4.Variable` natively supports orthogonal + # indexing; but `numpy.ndarray`, `h5netcdf.File` and # `h5py.File` do not. data = data[tuple(index0)] else: @@ -425,7 +461,7 @@ def _index(self, index): # 2) Apply the rest of the list/1-d array indices, in the # order that gives the smallest result after each step. 
- ndim = variable.ndim + ndim = data.ndim while axes_with_list_indices: shape1 = data.shape size1 = data.size @@ -434,9 +470,6 @@ def _index(self, index): for i in axes_with_list_indices ] n = axes_with_list_indices.pop(np.argmin(sizes)) - - # Apply the subspace of for the chosen list/1-d array - # index index2 = [slice(None)] * ndim index2[n] = index[n] data = data[tuple(index2)] diff --git a/cfdm/data/numpyarray.py b/cfdm/data/numpyarray.py index 10684a810..114c2b0c4 100644 --- a/cfdm/data/numpyarray.py +++ b/cfdm/data/numpyarray.py @@ -33,7 +33,12 @@ def __getitem__(self, indices): """ array = netcdf_indexer( - self._get_component("array"), mask=False, unpack=False, copy=True + self._get_component("array"), + mask=False, + unpack=False, + always_masked_array=False, + orthogonal_indexing=True, + copy=True, ) return array[indices] diff --git a/cfdm/data/subsampledarray.py b/cfdm/data/subsampledarray.py index 55ef28ef5..e40ff0249 100644 --- a/cfdm/data/subsampledarray.py +++ b/cfdm/data/subsampledarray.py @@ -403,10 +403,14 @@ def __getitem__(self, indices): ) u[u_indices] = subarray[...] - if indices is Ellipsis: - return u - - u = netcdf_indexer(u, mask=False, unpack=False) + u = netcdf_indexer( + u, + mask=False, + unpack=False, + always_masked_array=False, + orthogonal_indexing=True, + copy=False, + ) return u[indices] def _conformed_dependent_tie_points(self): diff --git a/cfdm/functions.py b/cfdm/functions.py index 76767defd..76c013844 100644 --- a/cfdm/functions.py +++ b/cfdm/functions.py @@ -4,13 +4,7 @@ from functools import total_ordering from urllib.parse import urlparse -import cftime -import h5netcdf -import h5py -import netCDF4 import numpy as np -import s3fs -import scipy from . import __cf_version__, __file__, __version__, core from .constants import CONSTANTS, ValidLogLevels @@ -353,6 +347,14 @@ def environment(display=True, paths=True): cfdm: NEXTVERSION /home/miniconda3/lib/python3.11/site-packages/cfdm/__init__.py """ + import cftime + import dask + import h5netcdf + import h5py + import netCDF4 + import s3fs + import scipy + out = core.environment(display=False, paths=paths) # get all core env dependency_version_paths_mapping = { @@ -363,6 +365,7 @@ def environment(display=True, paths=True): "h5py": (h5py.__version__, os.path.abspath(h5py.__file__)), "s3fs": (s3fs.__version__, os.path.abspath(s3fs.__file__)), "scipy": (scipy.__version__, os.path.abspath(scipy.__file__)), + "dask": (dask.__version__, os.path.abspath(dask.__file__)), "cftime": (cftime.__version__, os.path.abspath(cftime.__file__)), "cfdm": (__version__, os.path.abspath(__file__)), } diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 971438593..5f0c1185f 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -245,7 +245,7 @@ def read( .. versionadded:: (cfdm) 1.8.2 unpack: `bool` - If True (the default) then unpack arrays by convention + If True, the default, then unpack arrays by convention when the data is read from disk. Unpacking is determined by netCDF conventions for the @@ -277,18 +277,13 @@ def read( .. versionadded:: (cfdm) 1.9.0.0 netcdf_eninge: `None` or `str`, optional - - Specify which library to use for the opening and reading + Specify which library to use for opening and reading netCDF files. By default, or if `None`, then the first one of `netCDF4` and `h5netcdf` to successfully open the file netCDF file is used. Setting *netcdf_engine* to one of ``'netCDF4'`` and ``'h5netcdf'`` will force the use of that library. - .. 
note:: `h5netcdf` restricts the types of indices that - define subspaces of its data. See - https://docs.h5py.org for details. - .. versionadded:: (cfdm) NEXTVERSION storage_options: `dict` or `None`, optional @@ -314,13 +309,13 @@ def read( is taken as ``{}``. If the ``'endpoint_url'`` key is not in *storage_options*, - or is not in a dictionary defined by the - ``'client_kwargs'`` key (both of which are the case when - *storage_options* is `None`), then one will be - automatically inserted for accessing an S3 file. For - example, for a file name of ``'s3://store/data/file.nc'``, - an ``'endpoint_url'`` key with value ``'https://store'`` - would be created. + nor in a dictionary defined by the ``'client_kwargs'`` key + (both of which are the case when *storage_options* is + `None`), then one will be automatically inserted for + accessing an S3 file. For example, for a file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key + with value ``'https://store'`` would be created. To + disable this, set ``'endpoint_url'`` to `None`. *Parameter example:* For a file name of ``'s3://store/data/file.nc'``, the diff --git a/cfdm/test/test_netcdf_indexer.py b/cfdm/test/test_netcdf_indexer.py index 5df635efc..70ccae76d 100644 --- a/cfdm/test/test_netcdf_indexer.py +++ b/cfdm/test/test_netcdf_indexer.py @@ -143,9 +143,11 @@ def test_netcdf_indexer_numpy(self): self.assertTrue((x == array).all()) def test_netcdf_indexer_orthogonal_indexing(self): - """Test netcdf_indexer for numpy.""" + """Test netcdf_indexer for numpy orthogonal indexing.""" array = np.ma.arange(120).reshape(2, 3, 4, 5) - x = cfdm.netcdf_indexer(array, mask=False, unpack=False) + x = cfdm.netcdf_indexer( + array, mask=False, unpack=False, orthogonal_indexing=True + ) y = x[..., [0, 2], :] a = array[..., [0, 2], :] @@ -157,6 +159,27 @@ def test_netcdf_indexer_orthogonal_indexing(self): a = a[1, ...] self.assertTrue((y == a).all()) + def test_netcdf_indexer_non_orthogonal_indexing(self): + """Test netcdf_indexer for numpy non-orthogonal indexing.""" + array = np.ma.arange(120).reshape(2, 3, 4, 5) + x = cfdm.netcdf_indexer(array, mask=False, unpack=False) + + y = x[..., [0, 2], :] + a = array[..., [0, 2], :] + self.assertTrue((y == a).all()) + + index = (Ellipsis, [0, 2], [2, 3]) + y = x[index] + a = array[index] + self.assertEqual(y.shape, a.shape) + self.assertTrue((y == a).all()) + + index = (1, Ellipsis, [0, 2], [2, 3]) + y = x[index] + a = array[index] + self.assertEqual(y.shape, a.shape) + self.assertTrue((y == a).all()) + def test_netcdf_always_masked_array(self): """Test netcdf_indexer for numpy masked output.""" array = np.ma.arange(9) @@ -165,6 +188,12 @@ def test_netcdf_always_masked_array(self): x = cfdm.netcdf_indexer(array, always_masked_array=True) self.assertTrue(np.ma.isMA(x[...])) + def test_netcdf_indexer_Ellipsis(self): + """Test netcdf_indexer with Ellipsis.""" + n = np.arange(9) + x = cfdm.netcdf_indexer(n) + self.assertTrue((x[...] 
== n).all()) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/requirements.txt b/requirements.txt index f4b2a5aa2..73deb66c8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,6 @@ packaging>=20.0 scipy>=1.10.0 h5netcdf>=1.3.0 h5py>=3.10.0 -s3fs>=2024.2.0 -dask>=2024.2.1 +s3fs>=2024.3.0 +dask>=2024.4.0 From d590fd97f57b0c081518170ffe790dff1a522870 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 13 May 2024 11:46:26 +0100 Subject: [PATCH 59/88] dev --- README.md | 2 +- cfdm/data/netcdfindexer.py | 23 +++++++++++------------ cfdm/read_write/netcdf/flatten/config.py | 22 +++++++++++++--------- cfdm/read_write/netcdf/flatten/flatten.py | 10 +++++----- cfdm/read_write/netcdf/netcdfread.py | 20 ++++++++++---------- cfdm/read_write/read.py | 6 +++--- cfdm/test/test_groups.py | 16 +++++++++------- cfdm/test/test_netcdf_indexer.py | 10 +++++----- cfdm/test/test_read_write.py | 4 ++-- 9 files changed, 59 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index c54e042a6..9bd31953e 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ inspecting it: The ``cfdm`` package can: * read field and domain constructs from netCDF and CDL datasets with a - choice of netCDF engines, + choice of netCDF backends, * create new field and domain constructs in memory, * write and append field and domain constructs to netCDF datasets on disk, * read, write, and manipulate UGRID mesh topologies, diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index d2c2da429..3f3f1bd9e 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -404,8 +404,8 @@ def _index(self, index): # -------------------------------------------------------- if data_orthogonal_indexing and len(axes_with_list_indices) > 1: raise IndexError( - "Can't non-orthogonally index a " - f"{data.__class__.__name__} object with index {index!r}" + "Can't non-orthogonally index " + f"{data.__class__.__name__} with index {index!r}" ) return data[index] @@ -413,9 +413,10 @@ def _index(self, index): # ------------------------------------------------------------ # Still here? Then do orthogonal indexing. # ------------------------------------------------------------ + # Create an index that replaces integer indices with size 1 # slices, so that their axes are not dropped yet (they will be - # dealt with later). + # dropeed later). index0 = [ slice(i, i + 1) if isinstance(i, Integral) else i for i in index ] @@ -429,10 +430,9 @@ def _index(self, index): # `h5py.File` do not. data = data[tuple(index0)] else: - # There are two or more list/1-d array index, and the - # variable does not natively support orthogonal indexing. - # - # Emulate orthogonal indexing with a sequence of + # There are two or more list/1-d array indices, and the + # variable does not natively support orthogonal indexing + # => emulate orthogonal indexing with a sequence of # subspaces, one for each list/1-d array index. # 1) Apply the slice indices at the time as the list/1-d @@ -445,17 +445,16 @@ def _index(self, index): ] # Find the position of the list/1-d array index that gives - # the smallest result. + # the smallest result, and apply the subspace of slices + # and the chosen list/1-d array index. This will give the + # samllest memory footprint of the whole operation. 
shape1 = self.index_shape(index1, data.shape) size1 = prod(shape1) sizes = [ - len(index[i]) * size1 // shape1[i] + size1 * (len(index[i]) // shape1[i]) for i in axes_with_list_indices ] n = axes_with_list_indices.pop(np.argmin(sizes)) - - # Apply the subspace of slices and the chosen list/1-d - # array index index1[n] = index[n] data = data[tuple(index1)] diff --git a/cfdm/read_write/netcdf/flatten/config.py b/cfdm/read_write/netcdf/flatten/config.py index 05895ce04..6f2fc4a7e 100644 --- a/cfdm/read_write/netcdf/flatten/config.py +++ b/cfdm/read_write/netcdf/flatten/config.py @@ -20,15 +20,15 @@ # NetCDF global attribute in the flattened dataset containing the # mapping of flattened attribute names to grouped attribute names -flattener_attribute_map = "__flattener_attribute_map" +flattener_attribute_map = "_flattener_attribute_map" # NetCDF global attribute in the flattened dataset containing the # mapping of flattened dimension names to grouped attribute names -flattener_dimension_map = "__flattener_dimension_map" +flattener_dimension_map = "_flattener_dimension_map" # NetCDF global attribute in the flattened dataset containing the # mapping of flattened variable names to grouped attribute names -flattener_variable_map = "__flattener_variable_map" +flattener_variable_map = "_flattener_variable_map" @dataclass() @@ -36,9 +36,9 @@ class FlatteningRules: """Define the flattening rules for a netCDF attribute. For a named netCDF attribute, the rules a define how the contents - of the attribute are flattened. For instance, the - ``ancillary_variables`` attribute contains the names of other - netCDF variables, separated by spaces. + of the attribute are flattened. For instance, it has to be defined + that the ``ancillary_variables`` attribute contains the names of + other netCDF variables. .. versionadded:: (cfdm) NEXTVERSION @@ -48,10 +48,14 @@ class FlatteningRules: # flattened name: str # ref_to_dim: Positive integer if contains references to - # dimensions (higher values have priority) + # dimensions. If ref_to_dim and ref_to_var are both + # positive then the rule with the greater value is + # tested first. ref_to_dim: int = 0 - # ref_to_var: Positive integer if contains references to variables - # (highest values have priority) + # ref_to_var: Positive integer if contains references to + # variables. If ref_to_dim and ref_to_var are both + # positive then the rule with the greater value is + # tested first. ref_to_var: int = 0 # resolve_key: True if 'keys' have to be resolved in 'key1: value1 # key2: value2 value3' or 'key1 key2' diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 67a950024..20d5f9edf 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -35,7 +35,7 @@ def netcdf_flatten( omit_data=False, write_chunksize=134217728, ): - """Create a flattened version of a netCDF dataset. + """Create a flattened version of a grouped netCDF dataset. 
**CF-netCDF coordinate variables** @@ -48,8 +48,8 @@ def netcdf_flatten( In such cases it is up to the user to apply the proximal and lateral search alogrithms to the flattened dataset returned by `netcdf_flatten`, in conjunction with the mappings defined in the - newly created global attributes ``__flattener_variable_map`` and - ``__flattener_dimension_map``, to find which netCDF variables are + newly created global attributes ``_flattener_variable_map`` and + ``_flattener_dimension_map``, to find which netCDF variables are acting as CF coordinate variables in the flattened dataset. See https://cfconventions.org/cf-conventions/cf-conventions.html#groups for details. @@ -58,9 +58,9 @@ def netcdf_flatten( group and coordinate variable ``lat(lat)`` in group ``/group1``, then the flattened dataset will contain dimension ``lat`` and variable ``group1__lat(lat)``, both in its root group. In this - case, the ``__flattener_variable_map`` global attribute of the + case, the ``_flattener_variable_map`` global attribute of the flattened dataset will contain the mapping ``'group1__lat: - /group1/lat'``, and the ``__flattener_dimension_map`` global + /group1/lat'``, and the ``_flattener_dimension_map`` global attribute will contain the mapping ``'lat: /lat'``. .. versionadded:: (cfdm) NEXTVERSION diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 9428dee0d..1f45f0a17 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -507,7 +507,7 @@ def file_open(self, filename, flatten=True, verbose=None): netCDF = False HDF = False - netcdf_engine = g["netcdf_engine"] + netcdf_backend = g["netcdf_backend"] # Deal with an file in an S3 object store u = urlparse(filename) @@ -525,7 +525,7 @@ def file_open(self, filename, flatten=True, verbose=None): file_systems[fs_key] = file_system # Reset 'filename' to an s3fs.File object that can be - # passed to the netCDF engine + # passed to the netCDF backend filename = file_system.open(u.path[1:], "rb") g["s3fs_File_objects"].append(filename) @@ -534,7 +534,7 @@ def file_open(self, filename, flatten=True, verbose=None): f" S3: s3fs.S3FileSystem options: {storage_options}\n" ) # pragma: no cover - if netcdf_engine is None: + if netcdf_backend is None: try: # Try opening the file with netCDF4 nc = self._open_netCDF4(filename) @@ -548,14 +548,14 @@ def file_open(self, filename, flatten=True, verbose=None): except Exception as error: raise error - elif netcdf_engine == "netCDF4": + elif netcdf_backend == "netCDF4": try: nc = self._open_netCDF4(filename) netCDF = True except Exception as error: raise error - elif netcdf_engine == "h5netcdf": + elif netcdf_backend == "h5netcdf": try: nc = self._open_h5netcdf(filename) HDF = True @@ -563,7 +563,7 @@ def file_open(self, filename, flatten=True, verbose=None): raise error else: - raise ValueError("Unknown netCDF engine: {netcdf_engine!r}") + raise ValueError("Unknown netCDF backend: {netcdf_backend!r}") g["original_h5netcdf"] = HDF g["original_netCDF4"] = netCDF @@ -899,7 +899,7 @@ def read( domain=False, storage_options=None, _file_systems=None, - netcdf_engine=None, + netcdf_backend=None, ): """Reads a netCDF dataset from file or OPenDAP URL. @@ -954,7 +954,7 @@ def read( .. versionadded:: (cfdm) NEXTVERSION - netcdf_engine: `None` or `str`, optional + netcdf_backend: `None` or `str`, optional See `cfdm.read` for details .. 
versionadded:: (cfdm) NEXTVERSION @@ -1070,9 +1070,9 @@ def read( # -------------------------------------------------------- "cfa": False, # -------------------------------------------------------- - # NetCDF engine + # NetCDF backend # -------------------------------------------------------- - "netcdf_engine": netcdf_engine, + "netcdf_backend": netcdf_backend, # -------------------------------------------------------- # S3 # -------------------------------------------------------- diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 5f0c1185f..714a9e6c9 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -18,7 +18,7 @@ def read( mask=True, unpack=True, domain=False, - netcdf_engine=None, + netcdf_backend=None, storage_options=None, _implementation=_implementation, ): @@ -280,7 +280,7 @@ def read( Specify which library to use for opening and reading netCDF files. By default, or if `None`, then the first one of `netCDF4` and `h5netcdf` to successfully open the file - netCDF file is used. Setting *netcdf_engine* to one of + netCDF file is used. Setting *netcdf_backend* to one of ``'netCDF4'`` and ``'h5netcdf'`` will force the use of that library. @@ -406,7 +406,7 @@ def read( unpack=unpack, domain=domain, storage_options=storage_options, - netcdf_engine=netcdf_engine, + netcdf_backend=netcdf_backend, extra_read_vars=None, ) except MaskError: diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py index cbe51ab85..ddf040ecc 100644 --- a/cfdm/test/test_groups.py +++ b/cfdm/test/test_groups.py @@ -180,13 +180,15 @@ def test_groups(self): ) nc.close() - h = cfdm.read(grouped_file, netcdf_engine="netCDF4", verbose="WARNING") + h = cfdm.read( + grouped_file, netcdf_backend="netCDF4", verbose="WARNING" + ) self.assertEqual(len(h), 1) h = h[0] self.assertTrue(f.equals(h, verbose=2)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, netcdf_engine="h5netcdf") + h5 = cfdm.read(grouped_file, netcdf_backend="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) @@ -319,7 +321,7 @@ def test_groups_geometry(self): self.assertTrue(f.equals(h, verbose=2)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, netcdf_engine="h5netcdf") + h5 = cfdm.read(grouped_file, netcdf_backend="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) @@ -392,7 +394,7 @@ def test_groups_compression(self): self.assertTrue(f.equals(h, verbose=2)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, netcdf_engine="h5netcdf") + h5 = cfdm.read(grouped_file, netcdf_backend="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) @@ -465,7 +467,7 @@ def test_groups_dimension(self): self.assertTrue(f.equals(h, verbose=3)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, netcdf_engine="h5netcdf") + h5 = cfdm.read(grouped_file, netcdf_backend="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) @@ -502,13 +504,13 @@ def test_groups_unlimited_dimension(self): cfdm.write(f, grouped_file5, verbose=1) - h = cfdm.read(grouped_file, netcdf_engine="netCDF4") + h = cfdm.read(grouped_file, netcdf_backend="netCDF4") self.assertEqual(len(h), 1) h = h[0] self.assertTrue(f.equals(h)) # Check that h5netcdf reads the file correctly - h5 = cfdm.read(grouped_file, netcdf_engine="h5netcdf") + h5 = cfdm.read(grouped_file, netcdf_backend="h5netcdf") self.assertEqual(len(h5), 1) self._check_h5netcdf_groups(h5[0], h) 
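Together these hunks rename the reader's `netcdf_engine` keyword to `netcdf_backend` everywhere it appears: the `read_vars` dictionary, `cfdm.read` itself, and the group tests. A minimal sketch of the renamed keyword, following the pattern used in the tests above, where `'file.nc'` stands in for any readable netCDF file:

    >>> import cfdm
    >>> n4 = cfdm.read("file.nc", netcdf_backend="netCDF4")
    >>> h5 = cfdm.read("file.nc", netcdf_backend="h5netcdf")
    >>> n4[0].equals(h5[0])
    True

By default (`netcdf_backend=None`) the reader still tries `netCDF4` first and falls back to `h5netcdf`; forcing each backend in turn and comparing the results, as the `_check_h5netcdf_groups` helper does for the grouped files above, is a direct way to check that both libraries deliver equal field constructs.
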
diff --git a/cfdm/test/test_netcdf_indexer.py b/cfdm/test/test_netcdf_indexer.py index 70ccae76d..40d58ec24 100644 --- a/cfdm/test/test_netcdf_indexer.py +++ b/cfdm/test/test_netcdf_indexer.py @@ -31,7 +31,7 @@ def _remove_tmpfiles(): atexit.register(_remove_tmpfiles) -netcdf_engines = ("netCDF4", "h5netcdf") +netcdf_backends = ("netCDF4", "h5netcdf") class netcdf_indexerTest(unittest.TestCase): @@ -84,8 +84,8 @@ def test_netcdf_indexer_mask(self): nc = netCDF4.Dataset(tmpfile, "r") nc.set_auto_maskandscale(True) nc.set_always_mask(True) - for engine in netcdf_engines: - f = cfdm.read(tmpfile, netcdf_engine=engine) + for backend in netcdf_backends: + f = cfdm.read(tmpfile, netcdf_backend=backend) for g in f: ncvar = g.nc_get_variable() n = nc.variables[ncvar] @@ -116,8 +116,8 @@ def test_netcdf_indexer_unpack(self): nc = netCDF4.Dataset(tmpfile, "r") nc.set_auto_maskandscale(True) nc.set_always_mask(True) - for engine in netcdf_engines: - f = cfdm.read(tmpfile, netcdf_engine=engine) + for backend in netcdf_backends: + f = cfdm.read(tmpfile, netcdf_backend=backend) for g in f: ncvar = g.nc_get_variable() n = nc.variables[ncvar] diff --git a/cfdm/test/test_read_write.py b/cfdm/test/test_read_write.py index 71f218509..fbfa5d02b 100644 --- a/cfdm/test/test_read_write.py +++ b/cfdm/test/test_read_write.py @@ -671,8 +671,8 @@ def test_read_CDL(self): def test_read_write_string(self): """Test the `string` keyword argument to `read` and `write`.""" - fn = cfdm.read(self.string_filename, netcdf_engine="netCDF4") - fh = cfdm.read(self.string_filename, netcdf_engine="h5netcdf") + fn = cfdm.read(self.string_filename, netcdf_backend="netCDF4") + fh = cfdm.read(self.string_filename, netcdf_backend="h5netcdf") n = int(len(fn) / 2) From b3a1b562559b735f5bc60c212aaf85280de60ee9 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 20 Jun 2024 11:36:39 +0100 Subject: [PATCH 60/88] dev --- Changelog.rst | 4 +- cfdm/__init__.py | 4 +- cfdm/data/mixin/filearraymixin.py | 6 +- cfdm/data/netcdfindexer.py | 99 ++++++++++++---------- cfdm/read_write/netcdf/flatten/__init__.py | 1 + cfdm/read_write/netcdf/flatten/config.py | 1 + cfdm/read_write/netcdf/flatten/flatten.py | 12 +-- cfdm/read_write/netcdf/netcdfread.py | 6 +- cfdm/test/test_netcdf_indexer.py | 26 ++++++ requirements.txt | 4 +- 10 files changed, 98 insertions(+), 65 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index f6806deb5..535325399 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -19,8 +19,8 @@ Version NEXTVERSION * New class `cfdm.NetCDFIndexer` * New dependency: ``h5netcdf>=1.3.0`` * New dependency: ``h5py>=3.10.0`` -* New dependency: ``s3fs>=2024.3.0`` -* New dependency: ``dask>=2024.4.1`` +* New dependency: ``s3fs>=2024.6.0`` +* New dependency: ``dask>=2024.6.0`` * Removed dependency: ``netcdf_flattener`` ---- diff --git a/cfdm/__init__.py b/cfdm/__init__.py index 15b833607..21e205d8a 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -117,7 +117,7 @@ except ImportError as error1: raise ImportError(_error0 + str(error1)) -_minimum_vn = "2024.3.0" +_minimum_vn = "2024.6.0" if Version(s3fs.__version__) < Version(_minimum_vn): raise ValueError( f"Bad s3fs version: cfdm requires s3fs>={_minimum_vn}. " @@ -143,7 +143,7 @@ except ImportError as error1: raise ImportError(_error0 + str(error1)) -_minimum_vn = "2024.4.0" +_minimum_vn = "2024.6.0" if Version(dask.__version__) < Version(_minimum_vn): raise ValueError( f"Bad scipy version: cfdm requires dask>={_minimum_vn}. 
" diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index 939fab4db..b5b40d7ab 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -314,9 +314,9 @@ def get_storage_options( if parsed_filename is not None and parsed_filename.scheme == "s3": # Derive endpoint_url from filename - storage_options[ - "endpoint_url" - ] = f"https://{parsed_filename.netloc}" + storage_options["endpoint_url"] = ( + f"https://{parsed_filename.netloc}" + ) return storage_options diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 3f3f1bd9e..8aeab7a48 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -1,7 +1,7 @@ """A data indexer that applies netCDF masking and unpacking. -Portions of this code were adapted from the `netCDF4` library, which -carries the following MIT License: +Portions of this code were adapted from the `netCDF4` Python library, +which carries the following MIT License: Copyright 2008 Jeffrey Whitaker @@ -19,6 +19,7 @@ included in all copies or substantial portions of the Software. """ + import logging from math import prod from numbers import Integral @@ -38,22 +39,18 @@ class netcdf_indexer: means that the index for each dimension is applied independently, regardless of how that index was defined. For instance, the indices ``[[0, 1], [1, 3], 0]`` and ``[:2, 1:4:2, 0]`` will give - identical results. This behaviour is different to that of - `numpy`. Non-orthogonal indexing means that normal `numpy` - indexing rules are applied. - - During indexing, masking and unpacking is applied according to the - netCDF conventions, either or both of which may be disabled via - initialisation options. + identical results. Orthogonal indexing is different to the + indexing behaviour of `numpy`. Non-orthogonal indexing means that + normal `numpy` indexing rules are applied. In addition, string and character variables are always converted to unicode arrays, the latter with the last dimension concatenated. - Masking and unpacking operations are defined by the conventions - for netCDF attributes, which are either provided as part of the - input *data* object, or given with the input *attributes* - parameter. + Masking and unpacking operations, either or both may be disabled + via initialisation options, are defined by the conventions for + netCDF attributes, which are either provided as part of the input + *variable* object, or given with the input *attributes* parameter. The relevant netCDF attributes that are considered are: @@ -129,7 +126,7 @@ def __init__( variable: The variable to be indexed. May be any variable that has the same API as one of `numpy.ndarray`, - `netCDF4.Variable` or `h5py.Variable` (which includes + `netCDF4.Variable`, or `h5py.Variable` (which includes `h5netcdf.Variable`). Any masking and unpacking that could be applied by *variable* itself (e.g. by a `netCDF4.Variable` instance) is disabled, ensuring @@ -174,15 +171,20 @@ def __init__( relevant to masking and unpacking are considered, with all other attributes being ignored. If *attributes* is `None`, the default, then the netCDF attributes stored - by *variable* itself (if any) are used. If - *attributes* is not `None`, then any netCDF attributes - stored by *variable* itself are ignored. + by *variable* (if any) are used. If *attributes* is + not `None`, then any netCDF attributes stored by + *variable* are ignored. 
copy: `bool`, optional - If True then return a copy of the subspace that is not - a view of part of the the original data. If False, the - default, then the returned subspace could be either a - copy or a view. + If True then return a `numpy` array that is not a view + of part of the the original data, i.e. in-place + changes to the returned subspace will not affect the + original *variable*. This is done by returning an + in-memory copy the subspace. If False, the default, no + in-memory copy is done, and then whether or not + in-place changes to the returned subspace affect + *variable* will depend on how subspacing is + implemented by *variable*`. """ self.variable = variable @@ -196,7 +198,7 @@ def __init__( def __getitem__(self, index): """Return a subspace of the variable as a `numpy` array. - v.__getitem__(index) <==> v[index] + n.__getitem__(index) <==> v[index] If `__orthogonal_indexing__` is True then indexing is orthogonal. If `__orthogonal_indexing__` is False then normal @@ -414,9 +416,9 @@ def _index(self, index): # Still here? Then do orthogonal indexing. # ------------------------------------------------------------ - # Create an index that replaces integer indices with size 1 - # slices, so that their axes are not dropped yet (they will be - # dropeed later). + # Create an index that replaces integers with size 1 slices, + # so that their axes are not dropped yet (they will be dropped + # later). index0 = [ slice(i, i + 1) if isinstance(i, Integral) else i for i in index ] @@ -426,17 +428,18 @@ def _index(self, index): # variable natively supports orthogonal indexing. # # Note: `netCDF4.Variable` natively supports orthogonal - # indexing; but `numpy.ndarray`, `h5netcdf.File` and - # `h5py.File` do not. + # indexing; but `h5netcdf.File`, `h5py.File`, and + # `numpy.ndarray`, do not. data = data[tuple(index0)] else: # There are two or more list/1-d array indices, and the # variable does not natively support orthogonal indexing # => emulate orthogonal indexing with a sequence of - # subspaces, one for each list/1-d array index. + # independent subspaces, one for each list/1-d array + # index. - # 1) Apply the slice indices at the time as the list/1-d - # array index that gives the smallest result. + # 1) Apply the slice indices at the same time as the + # list/1-d array index that gives the smallest result. # Create an index that replaces each list/1-d array with # slice(None) @@ -447,7 +450,7 @@ def _index(self, index): # Find the position of the list/1-d array index that gives # the smallest result, and apply the subspace of slices # and the chosen list/1-d array index. This will give the - # samllest memory footprint of the whole operation. + # smallest high-water memory mark of the whole operation. shape1 = self.index_shape(index1, data.shape) size1 = prod(shape1) sizes = [ @@ -503,7 +506,7 @@ def _mask(self, data, dtype, attributes, dtype_unsigned_int): :Returns: - `nump.ndarray` + `numpy.ndarray` The masked data. """ @@ -802,7 +805,7 @@ def attributes(self): **Examples** - >>> v.attributes() + >>> n.attributes() {'standard_name': 'air_temperature', 'missing_value': -999.0} @@ -833,46 +836,48 @@ def attributes(self): def index_shape(cls, index, shape): """Return the shape of the array subspace implied by indices. - .. versionadded:: (cfdm) NEXTRELEASE + .. versionadded:: (cfdm) NEXTVERSION :Parameters: - indices: `tuple` + index: `tuple` The indices to be applied to an array with shape *shape*. 
- shape: sequence of `ints` + shape: sequence of `int` The shape of the array to be subspaced. :Returns: `list` - The shape of the subspace defined by the *indices*. + The shape of the subspace defined by the *index*. **Examples** >>> import numpy as np - >>> n.indices_shape((slice(2, 5), 4), (10, 20)) + >>> n.index_shape((slice(2, 5), [4]), (10, 20)) [3, 1] - >>> n.indices_shape(([2, 3, 4], np.arange(1, 6)), (10, 20)) + >>> n.index_shape((slice(2, 5), 4), (10, 20)) + [3] + >>> n.index_shape(([2, 3, 4], np.arange(1, 6)), (10, 20)) [3, 5] - >>> n.indices_shape((slice(None), [True, False, True]), (10, 3)) + >>> n.index_shape((slice(None), [True, False, True]), (10, 3)) [10, 2] >>> index0 = np.arange(5) >>> index0 = index0[index0 < 3] - >>> n.indices_shape((index0, []), (10, 20)) + >>> n.index_shape((index0, []), (10, 20)) [3, 0] - >>> n.indices_shape((slice(1, 5, 3), 3), (10, 20)) - [2, 1] - >>> n.indices_shape((slice(5, 1, -2), 3), (10, 20)) + >>> n.index_shape((slice(1, 5, 3), [3]), (10, 20)) [2, 1] - >>> n.indices_shape((slice(5, 1, 3), 3), (10, 20)) - [0, 1] - >>> n.indices_shape((slice(1, 5, -3), 3), (10, 20)) + >>> n.index_shape((slice(5, 1, -2), 3), (10, 20)) + [2] + >>> n.index_shape((slice(5, 1, 3), [3]), (10, 20)) [0, 1] + >>> n.index_shape((slice(1, 5, -3), 3), (10, 20)) + [0] """ implied_shape = [] diff --git a/cfdm/read_write/netcdf/flatten/__init__.py b/cfdm/read_write/netcdf/flatten/__init__.py index 4a9887947..82e6a3c9e 100644 --- a/cfdm/read_write/netcdf/flatten/__init__.py +++ b/cfdm/read_write/netcdf/flatten/__init__.py @@ -12,4 +12,5 @@ of the License at http://www.apache.org/licenses/LICENSE-2.0. """ + from .flatten import netcdf_flatten diff --git a/cfdm/read_write/netcdf/flatten/config.py b/cfdm/read_write/netcdf/flatten/config.py index 6f2fc4a7e..ac82d47f4 100644 --- a/cfdm/read_write/netcdf/flatten/config.py +++ b/cfdm/read_write/netcdf/flatten/config.py @@ -3,6 +3,7 @@ .. 
versionadded:: (cfdm) NEXTVERSION """ + from dataclasses import dataclass # Maximum length of name after which it is replaced with its hash diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 20d5f9edf..51f76d623 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -741,9 +741,9 @@ def flatten_dimension(self, dim): ) # Store new name in dict for resolving references later - self._dim_map[ - self.pathname(self.group(dim), self.name(dim)) - ] = new_name + self._dim_map[self.pathname(self.group(dim), self.name(dim))] = ( + new_name + ) # Add to name mapping attribute self._dim_map_value.append( @@ -822,9 +822,9 @@ def flatten_variable(self, var): new_var.setncatts(attributes) # Store new name in dict for resolving references later - self._var_map[ - self.pathname(self.group(var), self.name(var)) - ] = new_name + self._var_map[self.pathname(self.group(var), self.name(var))] = ( + new_name + ) # Add to name mapping attribute self._var_map_value.append( diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 65295b26c..3a4e32a3d 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -10307,9 +10307,9 @@ def _get_storage_options(self, filename, parsed_filename): "endpoint_url" not in storage_options and "endpoint_url" not in client_kwargs ): - storage_options[ - "endpoint_url" - ] = f"https://{parsed_filename.netloc}" + storage_options["endpoint_url"] = ( + f"https://{parsed_filename.netloc}" + ) g["file_system_storage_options"].setdefault(filename, storage_options) diff --git a/cfdm/test/test_netcdf_indexer.py b/cfdm/test/test_netcdf_indexer.py index 40d58ec24..571f8cdc5 100644 --- a/cfdm/test/test_netcdf_indexer.py +++ b/cfdm/test/test_netcdf_indexer.py @@ -194,6 +194,32 @@ def test_netcdf_indexer_Ellipsis(self): x = cfdm.netcdf_indexer(n) self.assertTrue((x[...] 
== n).all()) + def test_netcdf_indexer_index_shape(self): + """Test netcdf_indexer shape.""" + x = cfdm.netcdf_indexer + self.assertEqual(x.index_shape((slice(2, 5), [4]), (10, 20)), [3, 1]) + self.assertEqual(x.index_shape((slice(2, 5), 4), (10, 20)), [3]) + self.assertEqual( + x.index_shape(([2, 3, 4], np.arange(1, 6)), (10, 20)), [3, 5] + ) + + self.assertEqual( + x.index_shape((slice(None), [True, False, True]), (10, 3)), [10, 2] + ) + + index0 = np.arange(5) + index0 = index0[index0 < 3] + self.assertEqual(x.index_shape((index0, []), (10, 20)), [3, 0]) + + self.assertEqual( + x.index_shape((slice(1, 5, 3), [3]), (10, 20)), [2, 1] + ) + self.assertEqual(x.index_shape((slice(5, 1, -2), 3), (10, 20)), [2]) + self.assertEqual( + x.index_shape((slice(5, 1, 3), [3]), (10, 20)), [0, 1] + ) + self.assertEqual(x.index_shape((slice(1, 5, -3), 3), (10, 20)), [0]) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/requirements.txt b/requirements.txt index 73deb66c8..586e45931 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,6 @@ packaging>=20.0 scipy>=1.10.0 h5netcdf>=1.3.0 h5py>=3.10.0 -s3fs>=2024.3.0 -dask>=2024.4.0 +s3fs>=2024.6.0 +dask>=2024.6.0 From a1e8bc88ec591942ed6ef18bf09e9364ff2c1063 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 20 Jun 2024 14:45:44 +0100 Subject: [PATCH 61/88] dev --- cfdm/data/h5netcdfarray.py | 9 ++-- cfdm/data/mixin/filearraymixin.py | 51 ++++-------------- cfdm/data/mixin/netcdffilemixin.py | 40 -------------- cfdm/read_write/netcdf/netcdfread.py | 45 +++++++--------- cfdm/read_write/netcdf/netcdfwrite.py | 4 +- cfdm/read_write/read.py | 69 ++++++++++++------------ docs/source/class/cfdm.H5netcdfArray.rst | 53 +++++++++--------- docs/source/class/cfdm.NetCDF4Array.rst | 61 +++++++++++---------- docs/source/class/cfdm.NetCDFIndexer.rst | 6 +-- 9 files changed, 127 insertions(+), 211 deletions(-) diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index ba5beae82..825693065 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -204,16 +204,13 @@ def _set_attributes(self, var): :Returns: - `dict` - The attributes. + `None` """ - attributes = self._get_component("attributes", None) - if attributes is not None: + if self._get_component("attributes", None) is not None: return - attributes = dict(var.attrs) - self._set_component("attributes", attributes, copy=False) + self._set_component("attributes", dict(var.attrs), copy=False) def close(self, dataset): """Close the dataset containing the data. diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index b5b40d7ab..be440e5de 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -111,35 +111,6 @@ def get_addresses(self): """ return self._get_component("address", ()) - # def get_attributes(self, default=ValueError()): - # """The attributes of the array. - # - # .. versionadded:: (cfdm) NEXTVERSION - # - # :Parameters: - # - # default: optional - # Return the value of the *default* parameter if the - # attributes have not been set. If set to an `Exception` - # instance then it will be raised instead. - # - # :Returns: - # - # The attributes. 
- # - # """ - # attributes = self._get_component("attributes", None) - # if attributes is None: - # if default is None: - # return - # - # return self._default( - # default, - # f"{self.__class__.__name__} attributes have not yet been set", - # ) - # - # return deepcopy(attributes) - def get_filename(self, default=AttributeError()): """The name of the file containing the array. @@ -248,20 +219,20 @@ def get_storage_options( create_endpoint_url: `bool`, optional If True, the default, then create an - ``'endpoint_url'`` if and only if one has not already - been provided. See *filename* and *parsed_filename* - for details. + ``'endpoint_url'`` option if and only if one has not + already been provided. See *filename* and + *parsed_filename* for details. filename: `str`, optional - Used to set the ``'endpoint_url'`` key if it has not - been previously defined. Ignored if *parse_filename* - has been set. + Used to set the ``'endpoint_url'`` option if it has + not been previously defined. Ignored if + *parse_filename* has been set. parsed_filename: `urllib.parse.ParseResult`, optional - Used to set the ``'endpoint_url'`` key if it has not - been previously defined. By default the - ``'endpoint_url'`` key, if required, is set from the - file name returned by `get_filename`. + Used to set the ``'endpoint_url'`` option if it has + not been previously defined. By default the + ``'endpoint_url'`` optiona, if required, is set from + the file name returned by `get_filename`. :Returns: @@ -339,7 +310,7 @@ def open(self, func, *args, **kwargs): :Returns: - `tuple` + 2-`tuple` The file object for the dataset, and the address of the data within the file. diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py index 0cd75886a..8a53be200 100644 --- a/cfdm/data/mixin/netcdffilemixin.py +++ b/cfdm/data/mixin/netcdffilemixin.py @@ -66,46 +66,6 @@ def _set_attributes(self, var): f"Must implement {self.__class__.__name__}._set_attributes" ) # pragma: no cover - # def _set_units(self, var): - # """The units and calendar properties. - # - # These are set from the netCDF variable attributes, but only if - # they have already not been defined, either during {{class}} - # instantiation or by a previous call to `_set_units`. - # - # .. versionadded:: (cfdm) 1.10.0.1 - # - # :Parameters: - # - # var: `netCDF4.Variable` or `h5netcdf.Variable` - # The variable containing the units and calendar - # definitions. - # - # :Returns: - # - # `tuple` - # The units and calendar values, either of which may be - # `None`. - # - # """ - # # We assume that an attributes dictionary exists - # attributes = self._get_component("attributes") - # - # # Note: Can't use None as the default since it is a valid - # # `units` or 'calendar' value that indicates that the - # # attribute has not been set in the dataset. - # units = self._get_component("units", False) - # if units is False: - # self._set_component("units", attributes.get("units"), copy=False) - # - # calendar = self._get_component("calendar", False) - # if calendar is False: - # self._set_component( - # "calendar", attributes.get("calendar"), copy=False - # ) - # - # return units, calendar - @property def array(self): """Return an independent numpy array containing the data. 
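Illustrative aside (not part of the patch): the ``get_storage_options`` docstring above describes how an ``'endpoint_url'`` option is derived from an S3 file name. The sketch below shows that rule in isolation; ``derive_endpoint_url`` is a hypothetical helper invented for illustration, and the logic assumed is only what is visible in the ``get_storage_options``/``_get_storage_options`` diffs in this series.

    from urllib.parse import urlparse

    def derive_endpoint_url(filename, storage_options=None):
        # Hypothetical helper (not a cfdm API): for an S3 URI, insert an
        # 'endpoint_url' derived from the URI's network location, unless
        # one was already supplied directly or inside 'client_kwargs'.
        storage_options = dict(storage_options or {})
        parsed_filename = urlparse(filename)
        client_kwargs = storage_options.get("client_kwargs", {})
        if (
            parsed_filename.scheme == "s3"
            and "endpoint_url" not in storage_options
            and "endpoint_url" not in client_kwargs
        ):
            storage_options["endpoint_url"] = f"https://{parsed_filename.netloc}"

        return storage_options

    # For 's3://store/data/file.nc' this gives
    # {'endpoint_url': 'https://store'}, matching the docstring examples.
    print(derive_endpoint_url("s3://store/data/file.nc"))
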
diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 3a4e32a3d..77e2fc792 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -629,6 +629,12 @@ def _open_netCDF4(self, filename): def _open_h5netcdf(self, filename): """Return an open `h5netcdf.File`. + Uses values of the ``rdcc_nbytes``, ``rdcc_w0``, and + ``rdcc_nslots`` parameters to `h5netcdf.File` that correspond + to the default values of the `netCDF4.set_chunk_cache` + parameters ``size``, ``nelems``, and ``preemption``, + respectively. + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -641,7 +647,14 @@ def _open_h5netcdf(self, filename): `h5netcdf.File` """ - return h5netcdf.File(filename, "r", decode_vlen_strings=True) + return h5netcdf.File( + filename, + "r", + decode_vlen_strings=True, + rdcc_nbytes=16777216, + rdcc_w0=0.75, + rdcc_nslots=4133, + ) @classmethod def cdl_to_netcdf(cls, filename): @@ -650,7 +663,7 @@ def cdl_to_netcdf(cls, filename): :Parameters: filename: `str` - The name sdef _netof the CDL file. + The name of the CDL file. :Returns: @@ -1076,13 +1089,13 @@ def read( # -------------------------------------------------------- # S3 # -------------------------------------------------------- - # + # Input file system storage options "storage_options": storage_options, - # - "file_systems": {}, - # + # File system storage options for each file "file_system_storage_options": {}, - # + # Cached s3fs.S3FileSystem objects + "file_systems": {}, + # Cache of open s3fs.File objects "s3fs_File_objects": [], } @@ -1166,26 +1179,12 @@ def read( # 'global_attributes' dictionary # ---------------------------------------------------------------- global_attributes = {} - # for attr in map(str,nc.ncattrs()): for attr, value in self._file_global_attributes(nc).items(): attr = str(attr) if isinstance(value, bytes): value = value.decode(errors="ignore") global_attributes[attr] = value - # print (attr, value, type(value)) - - # var - # try: - # if isinstance(value, str): - # try: - # global_attributes[attr] = str(value) - # except UnicodeEncodeError: - # global_attributes[attr] = value.encode(errors="ignore") - # else: - # global_attributes[attr] = value.decode('utf-8') - # except UnicodeDecodeError: - # pass g["global_attributes"] = global_attributes if debug: @@ -1397,7 +1396,6 @@ def read( variable_grouped_dataset[ncvar] = g["nc_grouped"] variable_attributes[ncvar] = {} - # for attr in map(str, variable.ncattrs()): for attr, value in self._file_variable_attributes( variable ).items(): @@ -1495,7 +1493,6 @@ def read( # The netCDF dimensions of the parent file internal_dimension_sizes = {} - # for name, dimension in nc.dimensions.items(): for name, dimension in self._file_dimensions(nc).items(): if ( has_groups @@ -2309,8 +2306,6 @@ def _get_variables_from_external_files(self, netcdf_external_variables): # Remove this ncvar from the set of external variables external_variables.remove(ncvar) - # TODO h5netcdf S3: include s3 vars here? 
- def _parse_compression_gathered(self, ncvar, compress): """Parse a list variable for compressing arrays by gathering.""" g = self.read_vars diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index deed75606..02519a309 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -2661,8 +2661,7 @@ def _write_netcdf_variable( if g["dry_run"]: return - # print (ncvar, repr(cfvar.properties())) - # logger.info(f" Writing {cfvar!r}") # pragma: no cover + logger.info(f" Writing {cfvar!r}") # pragma: no cover # Set 'construct_type' if not construct_type: @@ -4460,7 +4459,6 @@ def file_open(self, filename, mode, fmt, fields): os.remove(filename) try: - # nc.set_chunk_cache(16*1024*1024) # 16MiB chunkcache nc = netCDF4.Dataset(filename, mode, format=fmt) except RuntimeError as error: raise RuntimeError(f"{error}: {filename}") diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 714a9e6c9..e700c91c3 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -292,42 +292,39 @@ def read( options are interpreted depends on the location of the file: - **Local File System** - - Storage options are ignored for local files. - - **HTTP(S)** - - Storage options are ignored for files available across the - network via OPeNDAP. - - **S3-compatible services** - - The backend used is `s3fs`, and the storage options are - used to initialise an `s3fs.S3FileSystem` file system - object. By default, or if `None`, then *storage_options* - is taken as ``{}``. - - If the ``'endpoint_url'`` key is not in *storage_options*, - nor in a dictionary defined by the ``'client_kwargs'`` key - (both of which are the case when *storage_options* is - `None`), then one will be automatically inserted for - accessing an S3 file. For example, for a file name of - ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key - with value ``'https://store'`` would be created. To - disable this, set ``'endpoint_url'`` to `None`. - - *Parameter example:* - For a file name of ``'s3://store/data/file.nc'``, the - following are equivalent: ``None``, ``{}``, - ``{'endpoint_url': 'https://store'}``, and - ``{'client_kwargs': {'endpoint_url': 'https://store'}}`` - - *Parameter example:* - ``{'key: 'scaleway-api-key...', 'secret': - 'scaleway-secretkey...', 'endpoint_url': - 'https://s3.fr-par.scw.cloud', 'client_kwargs': - {'region_name': 'fr-par'}}`` + * **Local File System**: Storage options are ignored for + local files. + + * **HTTP(S)**: Storage options are ignored for files + available across the network via OPeNDAP. + + * **S3-compatible services**: The backend used is `s3fs`, + and the storage options are used to initialise an + `s3fs.S3FileSystem` file system object. By default, or + if `None`, then *storage_options* is taken as ``{}``. + + If the ``'endpoint_url'`` key is not in + *storage_options*, nor in a dictionary defined by the + ``'client_kwargs'`` key (both of which are the case when + *storage_options* is `None`), then one will be + automatically inserted for accessing an S3 file. For + instance, with a file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key + with value ``'https://store'`` would be created. To + disable this, set the ``'endpoint_url'`` key to `None`. 
+ + *Parameter example:* + For a file name of ``'s3://store/data/file.nc'``, the + following are equivalent: ``None``, ``{}``, + ``{'endpoint_url': 'https://store'}``, and + ``{'client_kwargs': {'endpoint_url': + 'https://store'}}`` + + *Parameter example:* + ``{'key: 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}`` .. versionadded:: (cfdm) NEXTVERSION diff --git a/docs/source/class/cfdm.H5netcdfArray.rst b/docs/source/class/cfdm.H5netcdfArray.rst index 506fc3c3b..1576d7594 100644 --- a/docs/source/class/cfdm.H5netcdfArray.rst +++ b/docs/source/class/cfdm.H5netcdfArray.rst @@ -24,14 +24,13 @@ Inspection ~cfdm.H5netcdfArray.get_subspace ~cfdm.H5netcdfArray.get_attributes - .. rubric:: Attributes .. autosummary:: :nosignatures: :toctree: ../attribute/ :template: attribute.rst - + ~cfdm.H5netcdfArray.array ~cfdm.H5netcdfArray.dtype ~cfdm.H5netcdfArray.ndim @@ -47,20 +46,20 @@ Units :nosignatures: :toctree: ../method/ :template: method.rst - + ~cfdm.H5netcdfArray.get_calendar ~cfdm.H5netcdfArray.get_units - + File ---- - + .. rubric:: Methods .. autosummary:: :nosignatures: :toctree: ../method/ :template: method.rst - + ~cfdm.H5netcdfArray.get_address ~cfdm.H5netcdfArray.get_addresses ~cfdm.H5netcdfArray.close @@ -73,7 +72,7 @@ File ~cfdm.H5netcdfArray.get_mask ~cfdm.H5netcdfArray.get_unpack ~cfdm.H5netcdfArray.get_storage_options - + Miscellaneous ------------- @@ -81,10 +80,10 @@ Miscellaneous :nosignatures: :toctree: ../method/ :template: method.rst - + ~cfdm.H5netcdfArray.copy ~cfdm.H5netcdfArray.to_memory - + Special ------- @@ -92,32 +91,32 @@ Special :nosignatures: :toctree: ../method/ :template: method.rst - + ~cfdm.H5netcdfArray.__getitem__ Docstring substitutions ----------------------- - + .. rubric:: Methods - + .. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - + :nosignatures: + :toctree: ../method/ + :template: method.rst + ~cfdm.H5netcdfArray._docstring_special_substitutions - ~cfdm.H5netcdfArray._docstring_substitutions - ~cfdm.H5netcdfArray._docstring_package_depth - ~cfdm.H5netcdfArray._docstring_method_exclusions + ~cfdm.H5netcdfArray._docstring_substitutions + ~cfdm.H5netcdfArray._docstring_package_depth + ~cfdm.H5netcdfArray._docstring_method_exclusions Deprecated ---------- - -.. rubric:: Methods - -.. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + ~cfdm.H5netcdfArray.get_missing_values diff --git a/docs/source/class/cfdm.NetCDF4Array.rst b/docs/source/class/cfdm.NetCDF4Array.rst index 0b2e22668..d087ab192 100644 --- a/docs/source/class/cfdm.NetCDF4Array.rst +++ b/docs/source/class/cfdm.NetCDF4Array.rst @@ -19,19 +19,18 @@ Inspection :nosignatures: :toctree: ../method/ :template: method.rst - + ~cfdm.NetCDF4Array.get_compression_type ~cfdm.NetCDF4Array.get_subspace ~cfdm.NetCDF4Array.get_attributes - .. rubric:: Attributes .. autosummary:: :nosignatures: :toctree: ../attribute/ :template: attribute.rst - + ~cfdm.NetCDF4Array.array ~cfdm.NetCDF4Array.dtype ~cfdm.NetCDF4Array.ndim @@ -47,20 +46,20 @@ Units :nosignatures: :toctree: ../method/ :template: method.rst - + ~cfdm.NetCDF4Array.get_calendar ~cfdm.NetCDF4Array.get_units - + File ---- - + .. rubric:: Methods .. 
autosummary:: :nosignatures: :toctree: ../method/ :template: method.rst - + ~cfdm.NetCDF4Array.get_address ~cfdm.NetCDF4Array.get_addresses ~cfdm.NetCDF4Array.close @@ -73,7 +72,7 @@ File ~cfdm.NetCDF4Array.get_mask ~cfdm.NetCDF4Array.get_unpack ~cfdm.NetCDF4Array.get_storage_options - + Miscellaneous ------------- @@ -81,10 +80,10 @@ Miscellaneous :nosignatures: :toctree: ../method/ :template: method.rst - + ~cfdm.NetCDF4Array.copy ~cfdm.NetCDF4Array.to_memory - + Special ------- @@ -92,32 +91,32 @@ Special :nosignatures: :toctree: ../method/ :template: method.rst - + ~cfdm.NetCDF4Array.__getitem__ Docstring substitutions ------------------------ - -.. rubric:: Methods - -.. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - +----------------------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + ~cfdm.NetCDF4Array._docstring_special_substitutions - ~cfdm.NetCDF4Array._docstring_substitutions - ~cfdm.NetCDF4Array._docstring_package_depth - ~cfdm.NetCDF4Array._docstring_method_exclusions + ~cfdm.NetCDF4Array._docstring_substitutions + ~cfdm.NetCDF4Array._docstring_package_depth + ~cfdm.NetCDF4Array._docstring_method_exclusions Deprecated ---------- - -.. rubric:: Methods - -.. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + ~cfdm.NetCDF4Array.get_missing_values diff --git a/docs/source/class/cfdm.NetCDFIndexer.rst b/docs/source/class/cfdm.NetCDFIndexer.rst index 5e236b0f8..6dd64d263 100644 --- a/docs/source/class/cfdm.NetCDFIndexer.rst +++ b/docs/source/class/cfdm.NetCDFIndexer.rst @@ -19,11 +19,11 @@ Inspection :nosignatures: :toctree: ../method/ :template: method.rst - + ~cfdm.NetCDFIndexer.attributes .. rubric:: Attributes - + .. autosummary:: :nosignatures: :toctree: ../attribute/ @@ -38,5 +38,5 @@ Special :nosignatures: :toctree: ../method/ :template: method.rst - + ~cfdm.NetCDFIndexer.__getitem__ From 283e9dd166b6173184182253a52abec5ac9a3307 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:01:07 +0100 Subject: [PATCH 62/88] Typos Co-authored-by: Sadie L. Bartholomew --- cfdm/data/data.py | 4 ++-- cfdm/data/h5netcdfarray.py | 4 ++-- cfdm/data/mixin/filearraymixin.py | 8 ++++---- cfdm/data/mixin/netcdffilemixin.py | 12 ++++++------ cfdm/data/netcdf4array.py | 10 +++++----- cfdm/docstring/docstring.py | 2 +- cfdm/functions.py | 2 ++ cfdm/read_write/read.py | 4 ++-- docs/source/installation.rst | 2 +- docs/source/tutorial.rst | 4 ++-- 10 files changed, 27 insertions(+), 25 deletions(-) diff --git a/cfdm/data/data.py b/cfdm/data/data.py index 1f58f3a07..56f1db18b 100644 --- a/cfdm/data/data.py +++ b/cfdm/data/data.py @@ -2211,7 +2211,7 @@ def maximum(self, axes=None, squeeze=False): {{axes int examples}} - squeeze: `bool`, optional}} + squeeze: `bool`, optional If this is set to False, the default, the axes which are reduced are left in the result as dimensions with size one. With this option, the result will broadcast @@ -2437,7 +2437,7 @@ def sum(self, axes=None, squeeze=False): {{axes int examples}} - squeeze: `bool`, optional}} + squeeze: `bool`, optional If this is set to False, the default, the axes which are reduced are left in the result as dimensions with size one. 
With this option, the result will broadcast diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index 825693065..ee1a35345 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -69,7 +69,7 @@ def __init__( attributes will be set from the netCDF variable during the first `__getitem__` call. - .. versionadded:: (cfdm) NEXTRELEASE + .. versionadded:: (cfdm) NEXTVERSION {{init storage_options: `dict` or `None`, optional}} @@ -220,7 +220,7 @@ def close(self, dataset): :Parameters: dataset: `h5netcdf.File` - The netCDF dataset to be be closed. + The netCDF dataset to be closed. :Returns: diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index be440e5de..35e205ca6 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -205,7 +205,7 @@ def get_missing_values(self): raise DeprecationError( f"{self.__class__.__name__}.get_missing_values was deprecated " "at version NEXTVERSION and is no longer available. " - "Use {self.__class__.__name__}.get_attributes instead." + f"Use {self.__class__.__name__}.get_attributes instead." ) # pragma: no cover def get_storage_options( @@ -226,12 +226,12 @@ def get_storage_options( filename: `str`, optional Used to set the ``'endpoint_url'`` option if it has not been previously defined. Ignored if - *parse_filename* has been set. + *parsed_filename* has been set. parsed_filename: `urllib.parse.ParseResult`, optional Used to set the ``'endpoint_url'`` option if it has not been previously defined. By default the - ``'endpoint_url'`` optiona, if required, is set from + ``'endpoint_url'`` option, if required, is set from the file name returned by `get_filename`. :Returns: @@ -254,7 +254,7 @@ def get_storage_options( {} >>> f.get_storage_options() - {'key: 'scaleway-api-key...', + {'key': 'scaleway-api-key...', 'secret': 'scaleway-secretkey...', 'endpoint_url': 'https://s3.fr-par.scw.cloud', 'client_kwargs': {'region_name': 'fr-par'}} diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py index 8a53be200..0e0f747b9 100644 --- a/cfdm/data/mixin/netcdffilemixin.py +++ b/cfdm/data/mixin/netcdffilemixin.py @@ -15,14 +15,14 @@ class NetCDFFileMixin: """ def _group(self, dataset, groups): - """Retrun the group object containing a variable. + """Return the group object containing a variable. .. versionadded:: (cfdm) NEXTVERSION :Parameters: dataset: `netCDF4.Dataset` or `h5netcdf.File` - The dataset containging the variable. + The dataset containing the variable. groups: sequence of `str` The definition of which group the variable is in. For @@ -94,7 +94,7 @@ def close(self, dataset): :Parameters: dataset: - The dataset to be be closed. + The dataset to be closed. :Returns: @@ -146,7 +146,7 @@ def get_missing_values(self, default=ValueError()): :Parameters: default: optional - Return the value of the *default* parameter no missing + Return the value of the *default* parameter if no missing values have yet been defined. {{default Exception}} @@ -181,7 +181,7 @@ def get_missing_values(self, default=ValueError()): raise DeprecationError( f"{self.__class__.__name__}.get_missing_values was deprecated " "at version NEXTVERSION and is no longer available. " - "Use {self.__class__.__name__}.get_attributes instead." + f"Use {self.__class__.__name__}.get_attributes instead." ) def get_unpack(self): @@ -205,7 +205,7 @@ def to_memory(self): :Returns: `NumpyArray` - The new with all of its data in memory. + The new array with all of its data in memory. 
""" return NumpyArray(self[...]) diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index 3619aa905..212fd9fe9 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -67,7 +67,7 @@ def __init__( attributes will be set from the netCDF variable during the first `__getitem__` call. - .. versionadded:: (cfdm) NEXTRELEASE + .. versionadded:: (cfdm) NEXTVERSION {{init storage_options: `dict` or `None`, optional}} @@ -83,7 +83,7 @@ def __init__( missing_values: Deprecated at version NEXTVERSION The missing value indicators defined by the netCDF - variable attributes. The may now be recorded via the + variable attributes. They may now be recorded via the *attributes* parameter ncvar: Deprecated at version 1.10.1.0 @@ -96,11 +96,11 @@ def __init__( Use the *address* parameter instead. units: `str` or `None`, optional - Deprecated at version NEXTRELEASE. Use the + Deprecated at version NEXTVERSION. Use the *attributes* parameter instead. calendar: `str` or `None`, optional - Deprecated at version NEXTRELEASE. Use the + Deprecated at version NEXTVERSION. Use the *attributes* parameter instead. """ @@ -333,7 +333,7 @@ def close(self, dataset): :Parameters: dataset: `netCDF4.Dataset` - The netCDF dataset to be be closed. + The netCDF dataset to be closed. :Returns: diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index 113b3bc3a..e42d7e2b5 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -450,7 +450,7 @@ 'https://store'}}`` *Parameter example:* - ``{'key: 'scaleway-api-key...', 'secret': + ``{'key': 'scaleway-api-key...', 'secret': 'scaleway-secretkey...', 'endpoint_url': 'https://s3.fr-par.scw.cloud', 'client_kwargs': {'region_name': 'fr-par'}}``""", diff --git a/cfdm/functions.py b/cfdm/functions.py index 76c013844..31f00c8dc 100644 --- a/cfdm/functions.py +++ b/cfdm/functions.py @@ -326,6 +326,7 @@ def environment(display=True, paths=True): h5netcdf: 1.3.0 h5py: 3.10.0 s3fs: 2023.12.2 + dask: 2024.7.0 scipy: 1.11.3 cftime: 1.6.2 cfdm: NEXTVERSION @@ -343,6 +344,7 @@ def environment(display=True, paths=True): h5py: 3.10.0 /home/miniconda3/lib/python3.11/site-packages/h5py/__init__.py s3fs: 2023.12.2 /home/miniconda3/lib/python3.11/site-packages/s3fs/__init__.py scipy: 1.11.3 /home/miniconda3/lib/python3.11/site-packages/scipy/__init__.py + dask: 2024.7.0 /home/miniconda3/lib/python3.11/site-packages/dask/__init__.py cftime: 1.6.2 /home/miniconda3/lib/python3.11/site-packages/cftime/__init__.py cfdm: NEXTVERSION /home/miniconda3/lib/python3.11/site-packages/cfdm/__init__.py diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index e700c91c3..331b22df6 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -279,7 +279,7 @@ def read( netcdf_eninge: `None` or `str`, optional Specify which library to use for opening and reading netCDF files. By default, or if `None`, then the first one - of `netCDF4` and `h5netcdf` to successfully open the file + of `netCDF4` and `h5netcdf` to successfully open the netCDF file is used. Setting *netcdf_backend* to one of ``'netCDF4'`` and ``'h5netcdf'`` will force the use of that library. 
@@ -321,7 +321,7 @@ def read( 'https://store'}}`` *Parameter example:* - ``{'key: 'scaleway-api-key...', 'secret': + ``{'key': 'scaleway-api-key...', 'secret': 'scaleway-secretkey...', 'endpoint_url': 'https://s3.fr-par.scw.cloud', 'client_kwargs': {'region_name': 'fr-par'}}`` diff --git a/docs/source/installation.rst b/docs/source/installation.rst index d0d8ec6f3..bbed79031 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -156,7 +156,7 @@ The cfdm package requires: * `numpy `_, version 1.15 or newer. -* `netCDF4 `_, version 1.5.4 or +* `netCDF4 `_, version 1.5.4 or newer. * `cftime `_, version 1.6.0 or diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 49e4dba2b..edc063627 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -184,7 +184,7 @@ The `cfdm.read` function has optional parameters to attributes are present (see :ref:`data masking `); and * display information and issue warnings about the mapping of the - netCDF file contents to CF data model constructs; + netCDF file contents to CF data model constructs; and * choose either `netCDF4` or `h5netcdf` backends for accessing netCDF files. @@ -2857,7 +2857,7 @@ instances for the field and metadata constructs. It is, however, possible to create data from arrays that reside on disk. The `cfdm.read` function creates data in this manner. A pointer to an array in a netCDF file can be stored in a `~cfdm.NetCDF4Array` or -`~cfdm.H5netcdfAarray` instance, which is is used to initialise a +`~cfdm.H5netcdfAarray` instance, which is used to initialise a `~cfdm.Data` instance. .. code-block:: python From e1cba0572625f23b48c2bfa6fdd664b51c050cbb Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:04:13 +0100 Subject: [PATCH 63/88] Update Changelog.rst Co-authored-by: Sadie L. 
Bartholomew --- Changelog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Changelog.rst b/Changelog.rst index 535325399..57dcd8a5b 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -16,7 +16,7 @@ Version NEXTVERSION dimension coordinate did not have a ``computed_standard_name`` attribute (https://github.com/NCAS-CMS/cfdm/issues/303) * New class `cfdm.H5netcdfArray` -* New class `cfdm.NetCDFIndexer` +* New class `cfdm.NetCDF4Array` * New dependency: ``h5netcdf>=1.3.0`` * New dependency: ``h5py>=3.10.0`` * New dependency: ``s3fs>=2024.6.0`` From d4fbbbddbb762ca8424e0bc04e68ebe2207602fd Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:08:25 +0100 Subject: [PATCH 64/88] netcdf_indexer --- Changelog.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index 57dcd8a5b..7a6bdf850 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -5,9 +5,11 @@ Version NEXTVERSION * Upgrades to allow cfdm to work with Python 3.12 (https://github.com/NCAS-CMS/cfdm/issues/302) -* New function `cfdm.netcdf_flattener` that replaces the - `netcdf_flattener` import - (https://github.com/NCAS-CMS/cfdm/issues/286) +* New function `cfdm.netcdf_flattener` that replaces the import of + `netcdf_flattener` (https://github.com/NCAS-CMS/cfdm/issues/286) +* New function `cfdm.netcdf_indexer` that applies netCDF masking and + unpacking to arbitrary arrays + (https://github.com/NCAS-CMS/cfdm/issues/285) * Allow access to netCDF-4 files in S3 object stores (https://github.com/NCAS-CMS/cfdm/issues/285) * Allow a choice of netCDF engines From cb84a9c0d583b3c3eaf6a44f53f506d493759198 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:16:14 +0100 Subject: [PATCH 65/88] remove h5py dependency Co-authored-by: Sadie L. Bartholomew --- cfdm/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cfdm/__init__.py b/cfdm/__init__.py index 21e205d8a..1382bb880 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -53,7 +53,6 @@ "netCDF4", "scipy", "h5netcdf", - "h5py", "s3fs", ) From 50c5b9f72441716d8f83a0f33f81ff6ab0d7c584 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:17:32 +0100 Subject: [PATCH 66/88] Typo Co-authored-by: Sadie L. Bartholomew --- cfdm/data/h5netcdfarray.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index ee1a35345..b2dd4d2dc 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -53,12 +53,6 @@ def __init__( shape: `tuple` The array dimension sizes in the file. - size: `int` - Number of elements in the array in the file. - - ndim: `int` - The number of array dimensions in the file. - {{init mask: `bool`, optional}} {{init unpack: `bool`, optional}} From 040e0a2238d6553a0f2e95f9e3f08001071e32cd Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:18:44 +0100 Subject: [PATCH 67/88] Remove incorrect logic Co-authored-by: Sadie L. 
Bartholomew --- cfdm/data/mixin/filearraymixin.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index 35e205ca6..f90a42673 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -83,8 +83,6 @@ def get_address(self, default=AttributeError()): n = len(addresses) if n == 1: return addresses[0] - elif n > 1: - return if default is None: return From d79d4b39a5417be73c5ba885cfad9adee66357d7 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:19:18 +0100 Subject: [PATCH 68/88] Typo Co-authored-by: Sadie L. Bartholomew --- cfdm/data/netcdf4array.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index 212fd9fe9..4eee306f4 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -46,13 +46,6 @@ def __init__( shape: `tuple` The array dimension sizes in the netCDF file. - - size: `int` - Number of elements in the array in the netCDF file. - - ndim: `int` - The number of array dimensions in the netCDF file. - {{init mask: `bool`, optional}} .. versionadded:: (cfdm) 1.8.2 From c4945a1e77b61d229910409f4f61728aa8a33327 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:20:03 +0100 Subject: [PATCH 69/88] Typo Co-authored-by: Sadie L. Bartholomew --- cfdm/data/netcdfarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cfdm/data/netcdfarray.py b/cfdm/data/netcdfarray.py index ec5b9d4b0..60a4cbfe6 100644 --- a/cfdm/data/netcdfarray.py +++ b/cfdm/data/netcdfarray.py @@ -86,7 +86,7 @@ def __init__( missing_values: Deprecated at version NEXTVERSION The missing value indicators defined by the netCDF - variable attributes. The may now be recorded via the + variable attributes. They may now be recorded via the *attributes* parameter ncvar: Deprecated at version 1.10.1.0 From 3fd8b433f21267085500adbd888fdde3fc5e4b79 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:21:03 +0100 Subject: [PATCH 70/88] Typo Co-authored-by: Sadie L. Bartholomew --- cfdm/data/netcdfindexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 8aeab7a48..322a2e0c0 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -94,7 +94,7 @@ class netcdf_indexer: >>> n = np.arange(7) >>> x = cfdm.netcdf_indexer(n) >>> x.shape - (9,) + (7,) >>> print(x[...]) [0 1 2 3 4 5 6] >>> x = cfdm.netcdf_indexer(n, attributes={'_FillValue': 4}) From c687796fe14e659278c3f12fa2104d62504c3050 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:24:04 +0100 Subject: [PATCH 71/88] Typo --- cfdm/data/netcdfindexer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 322a2e0c0..58dcb6012 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -103,9 +103,6 @@ class netcdf_indexer: >>> x = cfdm.netcdf_indexer(n, mask=False, attributes={'_FillValue': 4}) >>> print(x[...]) [0 1 2 3 4 5 6] - >>> x = cfdm.netcdf_indexer(n, mask=False, attributes={'_FillValue': 4}) - >>> print(x[...]) - [0 1 2 3 4 5 6] """ From de8402c57bc848661cd78b6e1360aa9b37926043 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:24:58 +0100 Subject: [PATCH 72/88] Typos Co-authored-by: Sadie L. 
Bartholomew --- cfdm/data/netcdfindexer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 58dcb6012..28242d745 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -130,7 +130,7 @@ def __init__( that any masking and unpacking is always done by the `netcdf_indexer` instance. - mask: `bool` + mask: `bool`, optional If True, the default, then an array returned by indexing is automatically masked. Masking is determined by the netCDF conventions for the following @@ -138,14 +138,14 @@ def __init__( ``_Unsigned``, ``valid_max``, ``valid_min``, and ``valid_range``. - unpack: `bool` + unpack: `bool`, optional If True, the default, then an array returned by indexing is automatically unpacked. Unpacking is determined by the netCDF conventions for the following attributes: ``add_offset``, ``scale_factor``, and ``_Unsigned``. - always_masked_array: `bool` + always_masked_array: `bool`, optional If False, the default, then an array returned by indexing which has no missing values is created as a regular `numpy` array. If True then an array returned @@ -164,7 +164,7 @@ def __init__( attributes: `dict`, optional Provide netCDF attributes for the *variable* as a - dictionary key/value pairs. Only the attributes + dictionary of key/value pairs. Only the attributes relevant to masking and unpacking are considered, with all other attributes being ignored. If *attributes* is `None`, the default, then the netCDF attributes stored @@ -174,14 +174,14 @@ def __init__( copy: `bool`, optional If True then return a `numpy` array that is not a view - of part of the the original data, i.e. in-place + of part of the original data, i.e. in-place changes to the returned subspace will not affect the original *variable*. This is done by returning an in-memory copy the subspace. If False, the default, no - in-memory copy is done, and then whether or not + in-memory copy is made, and then whether or not in-place changes to the returned subspace affect *variable* will depend on how subspacing is - implemented by *variable*`. + implemented by *variable*. """ self.variable = variable From 82fc7dde8009a4406b6d84239cdf65f726bbee15 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:30:19 +0100 Subject: [PATCH 73/88] netcdf_flattener licence --- cfdm/read_write/netcdf/flatten/config.py | 11 +++++++++++ cfdm/read_write/netcdf/flatten/flatten.py | 13 +++++++++++++ 2 files changed, 24 insertions(+) diff --git a/cfdm/read_write/netcdf/flatten/config.py b/cfdm/read_write/netcdf/flatten/config.py index ac82d47f4..cb32eb0d8 100644 --- a/cfdm/read_write/netcdf/flatten/config.py +++ b/cfdm/read_write/netcdf/flatten/config.py @@ -2,6 +2,17 @@ .. versionadded:: (cfdm) NEXTVERSION +Portions of this code were adapted from the `netcdf_flattener` +library, which carries the following Apache 2.0 License: + +Copyright (c) 2020 EUMETSAT + +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. The ASF licenses this file to you +under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy +of the License at http://www.apache.org/licenses/LICENSE-2.0. 
+ """ from dataclasses import dataclass diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 51f76d623..135a741bf 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -1,3 +1,16 @@ +"""Portions of this code were adapted from the `netcdf_flattener` +library, which carries the following Apache 2.0 License: + +Copyright (c) 2020 EUMETSAT + +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. The ASF licenses this file to you +under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy +of the License at http://www.apache.org/licenses/LICENSE-2.0. + +""" + import hashlib import logging import re From 40db78ce7525b302d6b43ea3ea706fada88a8350 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:33:09 +0100 Subject: [PATCH 74/88] Typos Co-authored-by: Sadie L. Bartholomew --- cfdm/read_write/netcdf/flatten/flatten.py | 26 +++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 135a741bf..8a4617ec9 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -27,7 +27,7 @@ ref_not_found_error, ) -# Mapping from numpy dtype endian format that expected by netCDF4 +# Mapping from numpy dtype endian format to that expected by netCDF4 _dtype_endian_lookup = { "=": "native", ">": "big", @@ -134,7 +134,7 @@ def parse_attribute(name, attribute): :Parameters: name: `str` - The attribute name (e.g. ``'cell_methods'```). + The attribute name (e.g. ``'cell_methods'``). attribute: `str` The attribute value to parse. @@ -465,7 +465,7 @@ def filepath(self, dataset): :Returns: `str` - The file system path, or the opendap URL, for the + The file system path, or the OPeNDAP URL, for the dataset. **Examples** @@ -665,7 +665,7 @@ def process_group(self, input_group): :Parameters: input_group: `str` - The group to faltten. + The group to flatten. :Returns: @@ -697,7 +697,7 @@ def flatten_attribute(self, input_group, attr_name): The group containing the attribute to flatten. attr_name: `str` - The anme of the attribute. + The name of the attribute. :Returns: @@ -854,7 +854,7 @@ def increment_pos(self, pos, dim, copy_slice_shape, var_shape): """Increment position. Increment position vector in a variable along a dimension by - the matching slice length along than dimension. If end of the + the matching slice length along that dimension. If end of the dimension is reached, recursively increment the next dimensions until a valid position is found. @@ -919,7 +919,7 @@ def write_data_in_chunks(self, old_var, new_var): `h5netcdf.Variable`. new_var: - The new variable in which copy the data, that has the + The new variable in which to copy the data, that has the same API as `netCDF4.Variable` or `h5netcdf.Variable`. :Returns: @@ -958,7 +958,7 @@ def write_data_in_chunks(self, old_var, new_var): ) def resolve_reference(self, orig_ref, orig_var, rules): - """Resolve a refrence. + """Resolve a reference. Resolves the absolute path to a coordinate variable within the group structure. @@ -1271,7 +1271,7 @@ def search_by_proximity( First search up in the hierarchy for the reference, until root group is reached. If coordinate variable, search until local - apex is reached, Then search down in siblings. 
+ apex is reached, then search down in siblings. .. versionadded:: (cfdm) NEXTVERSION @@ -1288,7 +1288,7 @@ def search_by_proximity( variable. local_apex_reached: `bool` - Whether or not the apex is previously been reached. + Whether or not the apex has previously been reached. is_coordinate_variable: `bool` Whether the search is for a coordiante variable. @@ -1313,7 +1313,7 @@ def search_by_proximity( local_apex_reached or ref in current_group.dimensions.keys() ) - # Check if has to continue looking in parent group + # Check if have to continue looking in parent group # - normal search: continue until root is reached # - coordinate variable: continue until local apex is reached if is_coordinate_variable: @@ -1458,7 +1458,7 @@ def adapt_name(self, resolved_ref, rules): """Apapt the name. Return name of flattened reference. If not found, raise - exception or continue warning. + exception or continue with a warning. .. versionadded:: (cfdm) NEXTVERSION @@ -1514,7 +1514,7 @@ def adapt_name(self, resolved_ref, rules): return self.handle_reference_error(resolved_ref) def pathname(self, group, name): - """Compose full path name to an element in a group structure: + """Compose full path name to an element in a group structure. .. versionadded:: (cfdm) NEXTVERSION From 6ff5a4a875b3afba5198f1c4eea5a08271d8d2ee Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:33:54 +0100 Subject: [PATCH 75/88] Improved docstrings Co-authored-by: Sadie L. Bartholomew --- cfdm/read_write/netcdf/flatten/flatten.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 8a4617ec9..1c33a4d23 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -482,7 +482,7 @@ def filepath(self, dataset): return dataset.filename def get_dims(self, variable): - """Return. + """Return the dimensions associated with a variable. .. versionadded:: (cfdm) NEXTVERSION @@ -537,7 +537,7 @@ def getncattr(self, x, attr): return x.attrs[attr] def group(self, x): - """Return a. + """Return the group that a variable belongs to. .. versionadded:: (cfdm) NEXTVERSION @@ -592,7 +592,7 @@ def ncattrs(self, x): return list(x.attrs) def parent(self, group): - """Return a simulated unix directory path to a group. + """Return a simulated unix parent group. .. versionadded:: (cfdm) NEXTVERSION From 4633880bf9a4fe8e473f55989bb98af121c2b4ec Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:36:24 +0100 Subject: [PATCH 76/88] Typos Co-authored-by: Sadie L. 
Bartholomew --- cfdm/read_write/netcdf/netcdfread.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 77e2fc792..50538b308 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -509,7 +509,7 @@ def file_open(self, filename, flatten=True, verbose=None): HDF = False netcdf_backend = g["netcdf_backend"] - # Deal with an file in an S3 object store + # Deal with a file in an S3 object store u = urlparse(filename) storage_options = self._get_storage_options(filename, u) @@ -563,7 +563,7 @@ def file_open(self, filename, flatten=True, verbose=None): raise error else: - raise ValueError("Unknown netCDF backend: {netcdf_backend!r}") + raise ValueError(f"Unknown netCDF backend: {netcdf_backend!r}") g["original_h5netcdf"] = HDF g["original_netCDF4"] = netCDF @@ -10055,7 +10055,7 @@ def _file_global_attribute(self, nc, attr): :Returns: - The global attribute value + The global attribute value. """ try: @@ -10077,7 +10077,7 @@ def _file_global_attributes(self, nc): :Returns: - `dict'-like + `dict`-like A dictionary of the attribute values keyed by their names. @@ -10096,7 +10096,7 @@ def _file_dimensions(self, nc): :Returns: - `dict'-like + `dict`-like A dictionary of the dimensions keyed by their names. """ @@ -10124,7 +10124,7 @@ def _file_dimension(self, nc, dim_name): return self._file_dimensions(nc)[dim_name] def _file_dimension_isunlimited(self, nc, dim_name): - """Return a whether a dimension is unlimited. + """Return whether a dimension is unlimited. .. versionadded:: (cfdm) NEXTVERSION @@ -10145,7 +10145,7 @@ def _file_dimension_isunlimited(self, nc, dim_name): return self._file_dimension(nc, dim_name).isunlimited() def _file_dimension_size(self, nc, dim_name): - """Return a dimension is size. + """Return a dimension's size. .. versionadded:: (cfdm) NEXTVERSION @@ -10160,7 +10160,7 @@ def _file_dimension_size(self, nc, dim_name): :Returns: `int` - The dimssion size + The dimension size. """ return self._file_dimension(nc, dim_name).size @@ -10177,7 +10177,7 @@ def _file_variables(self, nc): :Returns: - `dict'-like + `dict`-like A dictionary of the variables keyed by their names. 
""" From 8cf95946bfac82110f57701334d43b9eee4d34fc Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:41:03 +0100 Subject: [PATCH 77/88] rename variables --- cfdm/read_write/netcdf/netcdfread.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 50538b308..a8763d279 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -505,8 +505,8 @@ def file_open(self, filename, flatten=True, verbose=None): """ g = self.read_vars - netCDF = False - HDF = False + xnetCDF = False + xHDF = False netcdf_backend = g["netcdf_backend"] # Deal with a file in an S3 object store @@ -538,13 +538,13 @@ def file_open(self, filename, flatten=True, verbose=None): try: # Try opening the file with netCDF4 nc = self._open_netCDF4(filename) - netCDF = True + xnetCDF = True except Exception: # The file could not be read by netCDF4 so try opening # it with h5netcdf try: nc = self._open_h5netcdf(filename) - HDF = True + xHDF = True except Exception as error: raise error @@ -558,15 +558,15 @@ def file_open(self, filename, flatten=True, verbose=None): elif netcdf_backend == "h5netcdf": try: nc = self._open_h5netcdf(filename) - HDF = True + xHDF = True except Exception as error: raise error else: raise ValueError(f"Unknown netCDF backend: {netcdf_backend!r}") - g["original_h5netcdf"] = HDF - g["original_netCDF4"] = netCDF + g["original_h5netcdf"] = xHDF + g["original_netCDF4"] = xnetCDF # ------------------------------------------------------------ # If the file has a group structure then flatten it (CF>=1.8) @@ -598,14 +598,14 @@ def file_open(self, filename, flatten=True, verbose=None): nc = flat_nc - netCDF = True - HDF = False + xnetCDF = True + xHDF = False g["has_groups"] = True g["flat_files"].append(flat_file) - g["netCDF4"] = netCDF - g["h5netcdf"] = HDF + g["netCDF4"] = xnetCDF + g["h5netcdf"] = xHDF g["nc"] = nc return nc From e97d14f6103cb4e01556b82375a50235c176bec7 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:40:49 +0100 Subject: [PATCH 78/88] Improved docstrings Co-authored-by: Sadie L. Bartholomew --- cfdm/read_write/netcdf/netcdfread.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index a8763d279..729d1a1d8 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -610,7 +610,7 @@ def file_open(self, filename, flatten=True, verbose=None): return nc def _open_netCDF4(self, filename): - """Return an open `netCDF4.Dataset`. + """Return a `netCDF4.Dataset` open in read-only mode. .. versionadded:: (cfdm) NEXTVERSION @@ -627,7 +627,7 @@ def _open_netCDF4(self, filename): return netCDF4.Dataset(filename, "r") def _open_h5netcdf(self, filename): - """Return an open `h5netcdf.File`. + """Return a `h5netcdf.File` open in read-only mode. 
Uses values of the ``rdcc_nbytes``, ``rdcc_w0``, and ``rdcc_nslots`` parameters to `h5netcdf.File` that correspond From 66c711f5f039dc2fbdeedeec700d88e151dcb8c1 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:43:36 +0100 Subject: [PATCH 79/88] rename variables --- cfdm/read_write/netcdf/netcdfread.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 729d1a1d8..0fb07e9f1 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -505,8 +505,8 @@ def file_open(self, filename, flatten=True, verbose=None): """ g = self.read_vars - xnetCDF = False - xHDF = False + netcdf = False + hdf = False netcdf_backend = g["netcdf_backend"] # Deal with a file in an S3 object store @@ -538,13 +538,13 @@ def file_open(self, filename, flatten=True, verbose=None): try: # Try opening the file with netCDF4 nc = self._open_netCDF4(filename) - xnetCDF = True + netcdf = True except Exception: # The file could not be read by netCDF4 so try opening # it with h5netcdf try: nc = self._open_h5netcdf(filename) - xHDF = True + hdf = True except Exception as error: raise error @@ -558,15 +558,15 @@ def file_open(self, filename, flatten=True, verbose=None): elif netcdf_backend == "h5netcdf": try: nc = self._open_h5netcdf(filename) - xHDF = True + hdf = True except Exception as error: raise error else: raise ValueError(f"Unknown netCDF backend: {netcdf_backend!r}") - g["original_h5netcdf"] = xHDF - g["original_netCDF4"] = xnetCDF + g["original_h5netcdf"] = hdf + g["original_netCDF4"] = netcdf # ------------------------------------------------------------ # If the file has a group structure then flatten it (CF>=1.8) @@ -598,19 +598,19 @@ def file_open(self, filename, flatten=True, verbose=None): nc = flat_nc - xnetCDF = True - xHDF = False + netcdf = True + hdf = False g["has_groups"] = True g["flat_files"].append(flat_file) - g["netCDF4"] = xnetCDF - g["h5netcdf"] = xHDF + g["netCDF4"] = netcdf + g["h5netcdf"] = hdf g["nc"] = nc return nc def _open_netCDF4(self, filename): - """Return a `netCDF4.Dataset` open in read-only mode. + """Return an open `netCDF4.Dataset`. .. versionadded:: (cfdm) NEXTVERSION @@ -627,7 +627,7 @@ def _open_netCDF4(self, filename): return netCDF4.Dataset(filename, "r") def _open_h5netcdf(self, filename): - """Return a `h5netcdf.File` open in read-only mode. + """Return an open `h5netcdf.File`. 
Uses values of the ``rdcc_nbytes``, ``rdcc_w0``, and ``rdcc_nslots`` parameters to `h5netcdf.File` that correspond From 1642521fb64853f6c7d327ccbf9b5dbfe6dd0295 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:50:32 +0100 Subject: [PATCH 80/88] remove debugging code --- cfdm/test/test_groups.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py index ddf040ecc..d304dfff6 100644 --- a/cfdm/test/test_groups.py +++ b/cfdm/test/test_groups.py @@ -329,7 +329,7 @@ def test_groups_compression(self): """Test the compression of hierarchical groups.""" f = cfdm.example_field(4) - ungrouped_file = "ungrouped_file3.nc" + ungrouped_file = ungrouped_file3.nc grouped_file = grouped_file3 f.compress("indexed_contiguous", inplace=True) From 3b62cbff0006cb3c62c12b740957a70360f11398 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:52:44 +0100 Subject: [PATCH 81/88] rename variables for clarity --- cfdm/test/test_read_write.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cfdm/test/test_read_write.py b/cfdm/test/test_read_write.py index d51729d19..51faf6791 100644 --- a/cfdm/test/test_read_write.py +++ b/cfdm/test/test_read_write.py @@ -671,18 +671,18 @@ def test_read_CDL(self): def test_read_write_string(self): """Test the `string` keyword argument to `read` and `write`.""" - fn = cfdm.read(self.string_filename, netcdf_backend="netCDF4") - fh = cfdm.read(self.string_filename, netcdf_backend="h5netcdf") + fN = cfdm.read(self.string_filename, netcdf_backend="netCDF4") + fH = cfdm.read(self.string_filename, netcdf_backend="h5netcdf") - n = int(len(fn) / 2) + n = int(len(fN) / 2) for i in range(0, n): j = i + n - self.assertTrue(fn[i].data.equals(fn[j].data, verbose=3)) - self.assertTrue(fn[j].data.equals(fn[i].data, verbose=3)) + self.assertTrue(fN[i].data.equals(fN[j].data, verbose=3)) + self.assertTrue(fN[j].data.equals(fN[i].data, verbose=3)) # Check that netCDF4 and h5netcdf give the same results - for i, j in zip(fn, fh): + for i, j in zip(fN, fH): self.assertTrue(i.data.equals(j.data)) # Note: Don't loop round all netCDF formats for better From fe4100306ff0b807edb432d4b13890e68efe6ee9 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:54:05 +0100 Subject: [PATCH 82/88] unskip opendap test --- cfdm/test/test_read_write.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cfdm/test/test_read_write.py b/cfdm/test/test_read_write.py index 51faf6791..040ad1a70 100644 --- a/cfdm/test/test_read_write.py +++ b/cfdm/test/test_read_write.py @@ -996,8 +996,6 @@ def test_read_write_domain_ancillary(self): def test_read_url(self): """Test reading urls.""" - print("SKIPPING URL TEST") - return for scheme in ("http", "https"): remote = f"{scheme}://psl.noaa.gov/thredds/dodsC/Datasets/cru/crutem5/Monthlies/air.mon.anom.nobs.nc" # Check that cfdm can access it From e3ad717ec693d3a4ed504c602be3ac1b8709ab83 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 23 Jul 2024 16:59:03 +0100 Subject: [PATCH 83/88] Tidy docs Co-authored-by: Sadie L. 
From dad52b9284bfdfe744a9bf6421bbf8af6949e926 Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Tue, 23 Jul 2024 17:02:14 +0100
Subject: [PATCH 84/88] No NetCDFIndexer class

Co-authored-by: Sadie L. Bartholomew
---
 docs/source/class.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/source/class.rst b/docs/source/class.rst
index 1059dc244..e2536513f 100644
--- a/docs/source/class.rst
+++ b/docs/source/class.rst
@@ -78,7 +78,6 @@ Data classes
    cfdm.H5netcdfArray
    cfdm.NumpyArray
    cfdm.Array
-   cfdm.NetCDFIndexer
 
 Data compression classes
 ------------------------

From 4874560c39e13568bcd696e6ff78342e5491a97f Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Tue, 23 Jul 2024 17:02:48 +0100
Subject: [PATCH 85/88] Non deprecated in new class docs

Co-authored-by: Sadie L. Bartholomew
---
 docs/source/class/cfdm.NetCDF4Array.rst | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/docs/source/class/cfdm.NetCDF4Array.rst b/docs/source/class/cfdm.NetCDF4Array.rst
index d087ab192..6ef0c047b 100644
--- a/docs/source/class/cfdm.NetCDF4Array.rst
+++ b/docs/source/class/cfdm.NetCDF4Array.rst
@@ -108,15 +108,3 @@ Docstring substitutions
    ~cfdm.NetCDF4Array._docstring_substitutions
    ~cfdm.NetCDF4Array._docstring_package_depth
    ~cfdm.NetCDF4Array._docstring_method_exclusions
-
-Deprecated
-----------
-
-.. rubric:: Methods
-
-.. autosummary::
-   :nosignatures:
-   :toctree: ../method/
-   :template: method.rst
-
-   ~cfdm.NetCDF4Array.get_missing_values

From 065ccc659bdf7780587b5f8b8602f314a9140a08 Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Tue, 23 Jul 2024 17:09:51 +0100
Subject: [PATCH 86/88] Better docstring

Co-authored-by: Sadie L. Bartholomew
---
 cfdm/data/netcdf4array.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py
index 4eee306f4..e76a0d5d5 100644
--- a/cfdm/data/netcdf4array.py
+++ b/cfdm/data/netcdf4array.py
@@ -346,8 +346,8 @@ def open(self):
         :Returns:
 
             (`netCDF4.Dataset`, `str`)
-                The open file object, and the address of the data
-                within the file.
+                The file object open in read-only mode, and the
+                address of the data within the file.
 
         """
         return super().open(netCDF4.Dataset, mode="r")
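
[Editorial note] Patch 86 clarifies that `NetCDF4Array.open` returns a
2-tuple of the read-only dataset and the address of the data within it.
A hedged usage sketch follows; the filename, the variable name "tas",
and the exact constructor keywords are assumptions, not taken from the
patches themselves.

    import cfdm

    # Assuming the constructor accepts filename/address keywords
    array = cfdm.NetCDF4Array(filename="file.nc", address="tas")

    # open() gives a `netCDF4.Dataset` in "r" mode plus the address
    dataset, address = array.open()
    variable = dataset.variables[address]
    print(variable.shape)
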
""" return super().open(netCDF4.Dataset, mode="r") From 31919a92a6315a85e3ab72c8bcd8308230d89b1e Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 24 Jul 2024 17:36:44 +0100 Subject: [PATCH 87/88] remove TODO (all OK) --- cfdm/read_write/netcdf/netcdfread.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 0fb07e9f1..4a3a93859 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -551,7 +551,7 @@ def file_open(self, filename, flatten=True, verbose=None): elif netcdf_backend == "netCDF4": try: nc = self._open_netCDF4(filename) - netCDF = True + netcdf = True except Exception as error: raise error @@ -1502,7 +1502,7 @@ def read( # size from the original grouped dataset, because # unlimited dimensions have size 0 in the flattened # dataset (because it contains no data) (v1.8.8.1) - group, ncdim = self._netCDF4_group( # TODO h5netcdf ? + group, ncdim = self._netCDF4_group( g["nc_grouped"], flattener_dimensions[name] ) internal_dimension_sizes[name] = group.dimensions[ From 6f1c397715c171f4db9bdaa18412d651f0ed4d4d Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 24 Jul 2024 17:38:58 +0100 Subject: [PATCH 88/88] tidy --- cfdm/read_write/netcdf/netcdfread.py | 4 +--- cfdm/test/test_groups.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 4a3a93859..b7bc16be5 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -1505,9 +1505,7 @@ def read( group, ncdim = self._netCDF4_group( g["nc_grouped"], flattener_dimensions[name] ) - internal_dimension_sizes[name] = group.dimensions[ - ncdim - ].size # TODO h5netcdf ? + internal_dimension_sizes[name] = group.dimensions[ncdim].size else: internal_dimension_sizes[name] = dimension.size diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py index d304dfff6..17772c426 100644 --- a/cfdm/test/test_groups.py +++ b/cfdm/test/test_groups.py @@ -329,7 +329,7 @@ def test_groups_compression(self): """Test the compression of hierarchical groups.""" f = cfdm.example_field(4) - ungrouped_file = ungrouped_file3.nc + ungrouped_file = ungrouped_file3 grouped_file = grouped_file3 f.compress("indexed_contiguous", inplace=True)