From 3a9672cf4533555851d4b8f7feae93f372df0f14 Mon Sep 17 00:00:00 2001 From: Scott Stewart Date: Mon, 4 Nov 2024 15:40:12 -0700 Subject: [PATCH] address requested PR changes part 1 --- seaice_ecdr/ancillary.py | 34 +- seaice_ecdr/cli/daily.py | 3 +- seaice_ecdr/daily_aggregate.py | 12 +- seaice_ecdr/days_treated_differently.py | 2 - seaice_ecdr/fill_polehole.py | 3 +- seaice_ecdr/initial_daily_ecdr.py | 60 ++-- seaice_ecdr/intermediate_monthly.py | 12 +- seaice_ecdr/monthly_aggregate.py | 13 +- seaice_ecdr/nc_attrs.py | 108 +++--- seaice_ecdr/nc_util.py | 69 ++-- seaice_ecdr/platforms/config.py | 4 + seaice_ecdr/publish_daily.py | 9 +- seaice_ecdr/publish_monthly.py | 11 +- seaice_ecdr/set_daily_ncattrs.py | 21 +- seaice_ecdr/tb_data.py | 11 +- seaice_ecdr/temporal_composite_daily.py | 7 +- .../tests/integration/test_ancillary.py | 6 +- .../tests/integration/test_publish_daily.py | 5 +- .../tests/integration/test_publish_monthly.py | 3 +- .../unit/test_dates_handled_differently.py | 310 +----------------- 20 files changed, 218 insertions(+), 485 deletions(-) diff --git a/seaice_ecdr/ancillary.py b/seaice_ecdr/ancillary.py index 2e16d593..1c75d709 100644 --- a/seaice_ecdr/ancillary.py +++ b/seaice_ecdr/ancillary.py @@ -24,6 +24,7 @@ ProductVersion, ) from seaice_ecdr.grid_id import get_grid_id +from seaice_ecdr.nc_util import remove_FillValue_from_coordinate_vars from seaice_ecdr.platforms import PLATFORM_CONFIG, Platform from seaice_ecdr.platforms.config import N07_PLATFORM @@ -47,21 +48,21 @@ def get_ancillary_filepath( return filepath -def fix_ds_attrs(ds): - """Correct potential errors in the ancillary datasets""" - # Remove fill values from x, if it exists - try: - del ds.variables["x"].encoding["_FillValue"] - except KeyError: - pass - - # Remove fill values from y, if it exists - try: - del ds.variables["y"].encoding["_FillValue"] - except KeyError: - pass - - return ds +# def fix_ds_attrs(ds): +# """Correct potential errors in the ancillary datasets""" +# # Remove fill values from x, if it exists +# try: +# del ds.variables["x"].encoding["_FillValue"] +# except KeyError: +# pass +# +# # Remove fill values from y, if it exists +# try: +# del ds.variables["y"].encoding["_FillValue"] +# except KeyError: +# pass +# +# return ds @cache @@ -86,7 +87,8 @@ def get_ancillary_ds( ) ds = xr.load_dataset(filepath) - ds = fix_ds_attrs(ds) + # ds = fix_ds_attrs(ds) + ds = remove_FillValue_from_coordinate_vars(ds) return ds diff --git a/seaice_ecdr/cli/daily.py b/seaice_ecdr/cli/daily.py index 0ee28fd8..14018589 100644 --- a/seaice_ecdr/cli/daily.py +++ b/seaice_ecdr/cli/daily.py @@ -54,8 +54,7 @@ def make_25km_ecdr( # separately: amsr2_start_date = start_date if end_date >= AM2_START_DATE: - if start_date < AM2_START_DATE: - amsr2_start_date = AM2_START_DATE + amsr2_start_date = max(start_date, AM2_START_DATE) if amsr2_start_date >= AM2_START_DATE: run_cmd( diff --git a/seaice_ecdr/daily_aggregate.py b/seaice_ecdr/daily_aggregate.py index f51c2079..aeba9657 100644 --- a/seaice_ecdr/daily_aggregate.py +++ b/seaice_ecdr/daily_aggregate.py @@ -20,7 +20,11 @@ from pm_tb_data._types import Hemisphere from seaice_ecdr._types import ECDR_SUPPORTED_RESOLUTIONS -from seaice_ecdr.ancillary import ANCILLARY_SOURCES, get_ancillary_ds +from seaice_ecdr.ancillary import ( + ANCILLARY_SOURCES, + get_ancillary_ds, + remove_FillValue_from_coordinate_vars, +) from seaice_ecdr.checksum import write_checksum_file from seaice_ecdr.constants import DEFAULT_BASE_OUTPUT_DIR from seaice_ecdr.nc_attrs import get_global_attrs @@ -159,6 +163,7 @@ def _update_ncrcat_daily_ds( platform_ids=[platform_id_from_filename(fp.name) for fp in daily_filepaths], resolution=resolution, hemisphere=hemisphere, + ancillary_source=ancillary_source, ) ds.attrs = daily_aggregate_ds_global_attrs # type: ignore[assignment] @@ -215,8 +220,9 @@ def make_daily_aggregate_netcdf_for_year( fix_daily_aggregate_ncattrs(daily_ds) # Write out the aggregate daily file. - daily_ds.variables["x"].encoding["_FillValue"] = None - daily_ds.variables["y"].encoding["_FillValue"] = None + # daily_ds.variables["x"].encoding["_FillValue"] = None + # daily_ds.variables["y"].encoding["_FillValue"] = None + daily_ds = remove_FillValue_from_coordinate_vars(daily_ds) daily_ds.to_netcdf( output_path, ) diff --git a/seaice_ecdr/days_treated_differently.py b/seaice_ecdr/days_treated_differently.py index 3b9b0454..02810197 100644 --- a/seaice_ecdr/days_treated_differently.py +++ b/seaice_ecdr/days_treated_differently.py @@ -82,8 +82,6 @@ def periods_of_cdr_missing_data( """Returns a list of date ranges for which CDR's output should be all-missing. """ - # missing_ranges = _load_missing_ranges() - # ranges = missing_ranges['treat_outputs_as_missing'][hemisphere_long] dates_treated_differently = _load_dates_treated_differently() try: ranges = dates_treated_differently["treat_outputs_as_missing"][platform_id][ diff --git a/seaice_ecdr/fill_polehole.py b/seaice_ecdr/fill_polehole.py index 8bb83a52..c04d2c2c 100644 --- a/seaice_ecdr/fill_polehole.py +++ b/seaice_ecdr/fill_polehole.py @@ -11,7 +11,6 @@ from scipy.ndimage import binary_dilation -# TODO: differentiate this from the function in `compute_bt_ic` def fill_pole_hole( *, conc: npt.NDArray, near_pole_hole_mask: npt.NDArray[np.bool_] ) -> npt.NDArray: @@ -20,6 +19,8 @@ def fill_pole_hole( Assumes that some data is available in and adjacent to the masked area. Missing areas are given by `np.nan`. + + If no nearby values are available the pole hole is unchanged. """ extended_nearpole_mask = binary_dilation(near_pole_hole_mask) diff --git a/seaice_ecdr/initial_daily_ecdr.py b/seaice_ecdr/initial_daily_ecdr.py index 53efa23e..2d74d7f4 100644 --- a/seaice_ecdr/initial_daily_ecdr.py +++ b/seaice_ecdr/initial_daily_ecdr.py @@ -49,6 +49,7 @@ day_has_all_bad_tbs, ) from seaice_ecdr.grid_id import get_grid_id +from seaice_ecdr.nc_util import remove_FillValue_from_coordinate_vars from seaice_ecdr.platforms import PLATFORM_CONFIG, SUPPORTED_PLATFORM_ID from seaice_ecdr.regrid_25to12 import reproject_ideds_25to12 from seaice_ecdr.spillover import LAND_SPILL_ALGS, land_spillover @@ -218,7 +219,7 @@ def calc_cdr_conc( return cdr_conc -def setup_ecdr_ds( +def _setup_ecdr_ds( *, date: dt.date, tb_data: EcdrTbData, @@ -468,7 +469,6 @@ def compute_initial_daily_ecdr_dataset( tb_data: EcdrTbData, land_spillover_alg: LAND_SPILL_ALGS, ancillary_source: ANCILLARY_SOURCES, - skip_dynamic_BT_tiepoints: bool = False, ) -> xr.Dataset: """Create intermediate daily ECDR xarray dataset. @@ -479,7 +479,7 @@ def compute_initial_daily_ecdr_dataset( fields. Its difficult to understand what's in the resulting dataset without manually inspecting the result of running this code. """ - ecdr_ide_ds = setup_ecdr_ds( + ecdr_ide_ds = _setup_ecdr_ds( date=date, tb_data=tb_data, hemisphere=hemisphere, @@ -488,7 +488,7 @@ def compute_initial_daily_ecdr_dataset( # Spatially interpolate the brightness temperatures for tbname in EXPECTED_ECDR_TB_NAMES: - # TODO: Replace this TB assignment with + # TODO: WIP: Replace this TB assignment with # util.py's create_tb_dataarray() tb_day_name = f"{tbname}_day" @@ -859,40 +859,23 @@ def compute_initial_daily_ecdr_dataset( water_mask=ecdr_ide_ds["bt_weather_mask"].data[0, :, :], # note: water ) - # TODO: Keeping commented-out weather_mask kwarg calls to highlight - # the transition from weather_mask to water_mask in these function - # calls - # TODO: Temporarily forcing skipping of water tiepoint calculation to examine - # using skip_dynamic_BT_tiepoints kwarg - if skip_dynamic_BT_tiepoints: - logger.warning("SKIPPING calculation of water tiepoint to match CDRv4") - bt_coefs["bt_wtp_v37"] = bt_coefs_init["bt_wtp_v37"] - else: - bt_coefs["bt_wtp_v37"] = bt.calculate_water_tiepoint( - wtp_init=bt_coefs_init["bt_wtp_v37"], - water_mask=ecdr_ide_ds["bt_weather_mask"].data[0, :, :], - tb=bt_v37, - ) + bt_coefs["bt_wtp_v37"] = bt.calculate_water_tiepoint( + wtp_init=bt_coefs_init["bt_wtp_v37"], + water_mask=ecdr_ide_ds["bt_weather_mask"].data[0, :, :], + tb=bt_v37, + ) - if skip_dynamic_BT_tiepoints: - logger.warning("SKIPPING calculation of water tiepoint to match CDRv4") - bt_coefs["bt_wtp_h37"] = bt_coefs_init["bt_wtp_h37"] - else: - bt_coefs["bt_wtp_h37"] = bt.calculate_water_tiepoint( - wtp_init=bt_coefs_init["bt_wtp_h37"], - water_mask=ecdr_ide_ds["bt_weather_mask"].data[0, :, :], - tb=bt_h37, - ) + bt_coefs["bt_wtp_h37"] = bt.calculate_water_tiepoint( + wtp_init=bt_coefs_init["bt_wtp_h37"], + water_mask=ecdr_ide_ds["bt_weather_mask"].data[0, :, :], + tb=bt_h37, + ) - if skip_dynamic_BT_tiepoints: - logger.warning("SKIPPING calculation of water tiepoint to match CDRv4") - bt_coefs["bt_wtp_v19"] = bt_coefs_init["bt_wtp_v19"] - else: - bt_coefs["bt_wtp_v19"] = bt.calculate_water_tiepoint( - wtp_init=bt_coefs_init["bt_wtp_v19"], - water_mask=ecdr_ide_ds["bt_weather_mask"].data[0, :, :], - tb=bt_v19, - ) + bt_coefs["bt_wtp_v19"] = bt.calculate_water_tiepoint( + wtp_init=bt_coefs_init["bt_wtp_v19"], + water_mask=ecdr_ide_ds["bt_weather_mask"].data[0, :, :], + tb=bt_v19, + ) bt_coefs["ad_line_offset"] = bt.get_adj_ad_line_offset( wtp_x=bt_coefs["bt_wtp_v37"], @@ -1275,8 +1258,9 @@ def write_ide_netcdf( "zlib": True, } - ide_ds.variables["x"].encoding["_FillValue"] = None - ide_ds.variables["y"].encoding["_FillValue"] = None + # ide_ds.variables["x"].encoding["_FillValue"] = None + # ide_ds.variables["y"].encoding["_FillValue"] = None + ide_ds = remove_FillValue_from_coordinate_vars(ide_ds) ide_ds.to_netcdf( output_filepath, encoding=nc_encoding, diff --git a/seaice_ecdr/intermediate_monthly.py b/seaice_ecdr/intermediate_monthly.py index bc6dcdf9..da33c51d 100644 --- a/seaice_ecdr/intermediate_monthly.py +++ b/seaice_ecdr/intermediate_monthly.py @@ -36,7 +36,11 @@ from pm_tb_data._types import NORTH, Hemisphere from seaice_ecdr._types import ECDR_SUPPORTED_RESOLUTIONS -from seaice_ecdr.ancillary import ANCILLARY_SOURCES, flag_value_for_meaning +from seaice_ecdr.ancillary import ( + ANCILLARY_SOURCES, + flag_value_for_meaning, + remove_FillValue_from_coordinate_vars, +) from seaice_ecdr.constants import DEFAULT_BASE_OUTPUT_DIR from seaice_ecdr.days_treated_differently import months_of_cdr_missing_data from seaice_ecdr.intermediate_daily import get_ecdr_filepath @@ -666,6 +670,7 @@ def make_intermediate_monthly_ds( platform_ids=[platform_id], resolution=resolution, hemisphere=hemisphere, + ancillary_source=ancillary_source, ) monthly_ds.attrs.update(monthly_ds_global_attrs) @@ -745,8 +750,9 @@ def make_intermediate_monthly_nc( # Set `x` and `y` `_FillValue` to `None`. Although unset initially, `xarray` # seems to default to `np.nan` for variables without a FillValue. - monthly_ds.x.encoding["_FillValue"] = None - monthly_ds.y.encoding["_FillValue"] = None + # monthly_ds.x.encoding["_FillValue"] = None + # monthly_ds.y.encoding["_FillValue"] = None + monthly_ds = remove_FillValue_from_coordinate_vars(monthly_ds) monthly_ds.to_netcdf( output_path, unlimited_dims=[ diff --git a/seaice_ecdr/monthly_aggregate.py b/seaice_ecdr/monthly_aggregate.py index 7c98de71..1f4bfd3b 100644 --- a/seaice_ecdr/monthly_aggregate.py +++ b/seaice_ecdr/monthly_aggregate.py @@ -12,7 +12,11 @@ from pm_tb_data._types import Hemisphere from seaice_ecdr._types import ECDR_SUPPORTED_RESOLUTIONS -from seaice_ecdr.ancillary import ANCILLARY_SOURCES, get_ancillary_ds +from seaice_ecdr.ancillary import ( + ANCILLARY_SOURCES, + get_ancillary_ds, + remove_FillValue_from_coordinate_vars, +) from seaice_ecdr.checksum import write_checksum_file from seaice_ecdr.constants import DEFAULT_ANCILLARY_SOURCE, DEFAULT_BASE_OUTPUT_DIR from seaice_ecdr.nc_attrs import get_global_attrs @@ -123,6 +127,7 @@ def _update_ncrcat_monthly_ds( platform_ids=[platform_id_from_filename(fp.name) for fp in monthly_filepaths], resolution=resolution, hemisphere=hemisphere, + ancillary_source=ancillary_source, ) agg_ds.attrs = monthly_aggregate_ds_global_attrs # type: ignore[assignment] @@ -237,8 +242,10 @@ def cli( # After the ncrcat process, a few attributes need to be cleaned up fix_monthly_aggregate_ncattrs(ds) - ds.variables["x"].encoding["_FillValue"] = None - ds.variables["y"].encoding["_FillValue"] = None + # ds.variables["x"].encoding["_FillValue"] = None + # ds.variables["y"].encoding["_FillValue"] = None + ds = remove_FillValue_from_coordinate_vars(ds) + ds.to_netcdf( output_filepath, ) diff --git a/seaice_ecdr/nc_attrs.py b/seaice_ecdr/nc_attrs.py index 9fe8c9d9..0aed518a 100644 --- a/seaice_ecdr/nc_attrs.py +++ b/seaice_ecdr/nc_attrs.py @@ -7,9 +7,14 @@ import pandas as pd import xarray as xr +from loguru import logger from pm_tb_data._types import Hemisphere from seaice_ecdr._types import ECDR_SUPPORTED_RESOLUTIONS +from seaice_ecdr.ancillary import ( + ANCILLARY_SOURCES, + get_ancillary_ds, +) from seaice_ecdr.constants import ECDR_PRODUCT_VERSION from seaice_ecdr.platforms import PLATFORM_CONFIG, SUPPORTED_PLATFORM_ID @@ -142,6 +147,7 @@ def get_global_attrs( platform_ids: list[SUPPORTED_PLATFORM_ID], resolution: ECDR_SUPPORTED_RESOLUTIONS, hemisphere: Hemisphere, + ancillary_source: ANCILLARY_SOURCES, ) -> dict[str, Any]: """Return a dictionary containing the global attributes for a standard ECDR NetCDF file. @@ -161,36 +167,34 @@ def get_global_attrs( time=time, ) - # TODO: These attributes should be pulled from a grid-based information source - # E.g., this information is available in the ancillary file. if hemisphere == "north": - geospatial_bounds = "POLYGON ((-3850000 5850000, 3750000 5850000, 3750000 -5350000, -3850000 -5350000, -3850000 5850000))" - geospatial_bounds_crs = "EPSG:3411" - geospatial_x_units = "meters" - geospatial_y_units = "meters" - geospatial_x_resolution = "25000 meters" - geospatial_y_resolution = "25000 meters" - geospatial_lat_min = 30.980564 - geospatial_lat_max = 90.0 - geospatial_lon_min = -180.0 - geospatial_lon_max = 180.0 - geospatial_lat_units = "degrees_north" - geospatial_lon_units = "degrees_east" + # geospatial_bounds = "POLYGON ((-3850000 5850000, 3750000 5850000, 3750000 -5350000, -3850000 -5350000, -3850000 5850000))" + # geospatial_bounds_crs = "EPSG:3411" + # geospatial_x_units = "meters" + # geospatial_y_units = "meters" + # geospatial_x_resolution = "25000 meters" + # geospatial_y_resolution = "25000 meters" + # geospatial_lat_min = 30.980564 + # geospatial_lat_max = 90.0 + # geospatial_lon_min = -180.0 + # geospatial_lon_max = 180.0 + # geospatial_lat_units = "degrees_north" + # geospatial_lon_units = "degrees_east" keywords = "EARTH SCIENCE > CRYOSPHERE > SEA ICE > SEA ICE CONCENTRATION, Continent > North America > Canada > Hudson Bay, Geographic Region > Arctic, Geographic Region > Polar, Geographic Region > Northern Hemisphere, Ocean > Arctic Ocean, Ocean > Arctic Ocean > Barents Sea, Ocean > Arctic Ocean > Beaufort Sea, Ocean > Arctic Ocean > Chukchi Sea, CONTINENT > NORTH AMERICA > CANADA > HUDSON BAY, Ocean > Atlantic Ocean > North Atlantic Ocean > Davis Straight, OCEAN > ATLANTIC OCEAN > NORTH ATLANTIC OCEAN > GULF OF ST LAWRENCE, Ocean > Atlantic Ocean > North Atlantic Ocean > North Sea, Ocean > Atlantic Ocean > North Atlantic Ocean > Norwegian Sea, OCEAN > ATLANTIC OCEAN > NORTH ATLANTIC OCEAN > SVALBARD AND JAN MAYEN, Ocean > Pacific Ocean, Ocean > Pacific Ocean > North Pacific Ocean > Bering Sea, Ocean > Pacific Ocean > North Pacific Ocean > Sea Of Okhotsk" else: - geospatial_bounds = "POLYGON ((-3950000 4350000, 3950000 4350000, 3950000 -3950000, -3950000 -3950000, -3950000 4350000))" - geospatial_bounds_crs = "EPSG:3412" - geospatial_x_units = "meters" - geospatial_y_units = "meters" - geospatial_x_resolution = "25000 meters" - geospatial_y_resolution = "25000 meters" - geospatial_lat_min = -90.0 - geospatial_lat_max = -39.23089 - geospatial_lon_min = -180.0 - geospatial_lon_max = 180.0 - geospatial_lat_units = "degrees_north" - geospatial_lon_units = "degrees_east" + # geospatial_bounds = "POLYGON ((-3950000 4350000, 3950000 4350000, 3950000 -3950000, -3950000 -3950000, -3950000 4350000))" + # geospatial_bounds_crs = "EPSG:3412" + # geospatial_x_units = "meters" + # geospatial_y_units = "meters" + # geospatial_x_resolution = "25000 meters" + # geospatial_y_resolution = "25000 meters" + # geospatial_lat_min = -90.0 + # geospatial_lat_max = -39.23089 + # geospatial_lon_min = -180.0 + # geospatial_lon_max = 180.0 + # geospatial_lat_units = "degrees_north" + # geospatial_lon_units = "degrees_east" keywords = "EARTH SCIENCE > CRYOSPHERE > SEA ICE > SEA ICE CONCENTRATION, Geographic Region > Polar, Geographic Region > Southern Hemisphere, Ocean > Southern Ocean, Ocean > Southern Ocean > Bellingshausen Sea, Ocean > Southern Ocean > Ross Sea, Ocean > Southern Ocean > Weddell Sea" @@ -233,18 +237,46 @@ def get_global_attrs( # TODO: ideally, these would get dynamically set from the input data's # grid definition... cdr_data_type="grid", - geospatial_bounds=geospatial_bounds, - geospatial_bounds_crs=geospatial_bounds_crs, - geospatial_x_units=geospatial_x_units, - geospatial_y_units=geospatial_y_units, - geospatial_x_resolution=geospatial_x_resolution, - geospatial_y_resolution=geospatial_y_resolution, - geospatial_lat_min=geospatial_lat_min, - geospatial_lat_max=geospatial_lat_max, - geospatial_lon_min=geospatial_lon_min, - geospatial_lon_max=geospatial_lon_max, - geospatial_lat_units=geospatial_lat_units, - geospatial_lon_units=geospatial_lon_units, + # geospatial_bounds=geospatial_bounds, + # geospatial_bounds_crs=geospatial_bounds_crs, + # geospatial_x_units=geospatial_x_units, + # geospatial_y_units=geospatial_y_units, + # geospatial_x_resolution=geospatial_x_resolution, + # geospatial_y_resolution=geospatial_y_resolution, + # geospatial_lat_min=geospatial_lat_min, + # geospatial_lat_max=geospatial_lat_max, + # geospatial_lon_min=geospatial_lon_min, + # geospatial_lon_max=geospatial_lon_max, + # geospatial_lat_units=geospatial_lat_units, + # geospatial_lon_units=geospatial_lon_units, + ) + + # TODO: These attributes should be pulled from a grid-based information source + # E.g., this information is available in the ancillary file. + ancillary_ds = get_ancillary_ds( + hemisphere=hemisphere, + resolution=resolution, + ancillary_source=ancillary_source, ) + attrs_from_ancillary = ( + "geospatial_bounds", + "geospatial_bounds_crs", + "geospatial_x_units", + "geospatial_y_units", + "geospatial_x_resolution", + "geospatial_y_resolution", + "geospatial_lat_min", + "geospatial_lat_max", + "geospatial_lon_min", + "geospatial_lon_max", + "geospatial_lat_units", + "geospatial_lon_units", + ) + + for attr in attrs_from_ancillary: + try: + new_global_attrs[attr] = ancillary_ds.attrs[attr] + except KeyError: + logger.warning(f"No such global attr in ancillary file: {attr}") return new_global_attrs diff --git a/seaice_ecdr/nc_util.py b/seaice_ecdr/nc_util.py index faf46edd..b96991b7 100644 --- a/seaice_ecdr/nc_util.py +++ b/seaice_ecdr/nc_util.py @@ -64,20 +64,38 @@ def remove_FillValue_from_coordinate_vars(ds: xr.Dataset | datatree.DataTree): outputs. Ideally, the ancillary files and the code that creates the `time` dim should omit `valid_range`, and this function becomes unnecessary. """ - try: - del ds.x.encoding["_FillValue"] - except KeyError: - pass - - try: - del ds.y.encoding["_FillValue"] - except KeyError: - pass + # I this might *actually* work to remove the _FillValue for xarray + for coord_var in ("x", "y", "time"): + try: + ds.variables["coord_var"].encoding["_FillValue"] = None + except KeyError: + pass + + # this should be a simplified verison of the orig version of this routine + + # for coord_var in ('x', 'y', 'time'): + # try: + # del ds.variables['coord_var'].encoding['_FillValue'] + # except KeyError: + # pass + + # This was the original contents of this routine + # try: + # del ds.x.encoding["_FillValue"] + # except KeyError: + # pass + + # try: + # del ds.y.encoding["_FillValue"] + # except KeyError: + # pass + + # try: + # del ds.time.encoding["_FillValue"] + # except KeyError: + # pass - try: - del ds.time.encoding["_FillValue"] - except KeyError: - pass + return ds def add_coordinate_coverage_content_type(ds: xr.Dataset | datatree.DataTree): @@ -120,7 +138,8 @@ def get_empty_group(ds, ncgroup_name): data_variable = ds_empty_group.data_vars[datavar] # NOTE: Getting the _FillValue via # fillvalue = ds_empty_group.data_vars[datavar].encoding['_FillValue'] - # ...did not work for packed data. Falling back to a + # did not work for packed data. + # Setting the fillvalue explicitly based on data_variable attrs if "number_of_missing_pixels" in data_variable.attrs: data_variable.attrs["number_of_missing_pixels"] = data_variable.size dtype = data_variable.dtype @@ -155,9 +174,7 @@ def add_ncgroup( ds, ncgroup_name, ) - # ds = None *** How do I close a datatree? *** break - # ds = None *** How do I close a datatree? *** # If we don't need to add this nc_group, return the filename list unaltered if not add_groupname: @@ -169,21 +186,22 @@ def add_ncgroup( clean_ncgroup_name = ncgroup_name.replace("/", "") empty_datatree_fn = f"empty_datatree_{clean_ncgroup_name}.nc" empty_datatree_fp = Path(tmpdir, empty_datatree_fn) - try: - empty_nc_datatree.variables["x"].encoding["_FillValue"] = None - except KeyError: - pass + empty_nc_datatree = remove_FillValue_from_coordinate_vars(empty_nc_datatree) + # try: + # empty_nc_datatree.variables["x"].encoding["_FillValue"] = None + # except KeyError: + # pass - try: - empty_nc_datatree.variables["y"].encoding["_FillValue"] = None - except KeyError: - pass + # try: + # empty_nc_datatree.variables["y"].encoding["_FillValue"] = None + # except KeyError: + # pass empty_nc_datatree.to_netcdf(empty_datatree_fp) new_file_list = [] updated_file_count = 0 - logger.success( + logger.info( f"Adding {clean_ncgroup_name} nc group to {len(filepath_list)} files as needed..." ) for fp in filepath_list: @@ -218,6 +236,7 @@ def add_ncgroup( def adjust_monthly_aggregate_ncattrs(fp): """Clean up source, platform, and sensor ncattrs""" print("SKIPPING adjust_monthly_aggregate_ncattrs") + print(" pending final decisions on monagg_attrs") pass diff --git a/seaice_ecdr/platforms/config.py b/seaice_ecdr/platforms/config.py index 968a9252..5b956139 100644 --- a/seaice_ecdr/platforms/config.py +++ b/seaice_ecdr/platforms/config.py @@ -131,6 +131,10 @@ def get_platform_id_from_name(platform_name): for platform in SUPPORTED_PLATFORMS: if platform.name == platform_name: return platform.id + logger.error( + f"Could not determine platform ID for: {platform_name}, returning empty string" + ) + return "" diff --git a/seaice_ecdr/publish_daily.py b/seaice_ecdr/publish_daily.py index b116d3a6..18274518 100644 --- a/seaice_ecdr/publish_daily.py +++ b/seaice_ecdr/publish_daily.py @@ -219,9 +219,6 @@ def publish_daily_nc( prototype_subgroup = prototype_daily_ds[cdr_var_fieldnames].rename_vars( remap_names ) - # TODO: Set the long_name of the prototype variables - # to be "AMSR2 Prototype..." - # instead of "NOAA/NSIDC CDR of..." # Rename ancillary variables. for var in prototype_subgroup.values(): if "ancillary_variables" in var.attrs: @@ -275,9 +272,9 @@ def publish_daily_nc( complete_daily_ds.time.encoding["units"] = "days since 1970-01-01" complete_daily_ds.time.encoding["calendar"] = "standard" - # No _FillValue for x or y is a CF requirement - complete_daily_ds.variables["x"].encoding["_FillValue"] = None - complete_daily_ds.variables["y"].encoding["_FillValue"] = None + # complete_daily_ds.variables["x"].encoding["_FillValue"] = None + # complete_daily_ds.variables["y"].encoding["_FillValue"] = None + complete_daily_ds = remove_FillValue_from_coordinate_vars(complete_daily_ds) complete_daily_ds.to_netcdf(complete_daily_filepath) logger.success(f"Staged NC file for publication: {complete_daily_filepath}") diff --git a/seaice_ecdr/publish_monthly.py b/seaice_ecdr/publish_monthly.py index 06d8da70..e033bb6e 100644 --- a/seaice_ecdr/publish_monthly.py +++ b/seaice_ecdr/publish_monthly.py @@ -8,6 +8,9 @@ from pm_tb_data._types import NORTH, Hemisphere from seaice_ecdr._types import ECDR_SUPPORTED_RESOLUTIONS +from seaice_ecdr.ancillary import ( + remove_FillValue_from_coordinate_vars, +) from seaice_ecdr.checksum import write_checksum_file from seaice_ecdr.intermediate_monthly import ( get_intermediate_monthly_dir, @@ -260,7 +263,6 @@ def prepare_monthly_ds_for_publication( # Do final cleanup. This should be unnecessary (see the docstrings for the # associated fuctions for more). - # remove_valid_range_from_coordinate_vars(complete_monthly_ds) add_coordinate_coverage_content_type(complete_monthly_ds) add_coordinates_attr(complete_monthly_ds) @@ -297,8 +299,11 @@ def _write_publication_ready_nc_and_checksum( publication_ready_monthly_ds.time.encoding["units"] = "days since 1970-01-01" publication_ready_monthly_ds.time.encoding["calendar"] = "standard" - publication_ready_monthly_ds.variables["x"].encoding["_FillValue"] = None - publication_ready_monthly_ds.variables["y"].encoding["_FillValue"] = None + # publication_ready_monthly_ds.variables["x"].encoding["_FillValue"] = None + # publication_ready_monthly_ds.variables["y"].encoding["_FillValue"] = None + publication_ready_monthly_ds = remove_FillValue_from_coordinate_vars( + publication_ready_monthly_ds + ) publication_ready_monthly_ds.to_netcdf(complete_monthly_filepath) logger.success(f"Staged NC file for publication: {complete_monthly_filepath}") diff --git a/seaice_ecdr/set_daily_ncattrs.py b/seaice_ecdr/set_daily_ncattrs.py index 2a2677ff..0cbbf8e8 100644 --- a/seaice_ecdr/set_daily_ncattrs.py +++ b/seaice_ecdr/set_daily_ncattrs.py @@ -5,7 +5,10 @@ from pm_tb_data._types import NORTH, Hemisphere from seaice_ecdr._types import ECDR_SUPPORTED_RESOLUTIONS -from seaice_ecdr.ancillary import ANCILLARY_SOURCES +from seaice_ecdr.ancillary import ( + ANCILLARY_SOURCES, + remove_FillValue_from_coordinate_vars, +) from seaice_ecdr.nc_attrs import get_global_attrs from seaice_ecdr.tb_data import get_data_url_from_data_source from seaice_ecdr.util import get_num_missing_pixels @@ -363,24 +366,12 @@ def finalize_cdecdr_ds( platform_ids=[ds_in.platform], resolution=resolution, hemisphere=hemisphere, + ancillary_source=ancillary_source, ) ds.attrs = new_global_attrs # Coordinate values should not have _FillValue set - try: - del ds.time.encoding["_FillValue"] - except KeyError: - pass - - try: - del ds.x.encoding["_FillValue"] - except KeyError: - pass - - try: - del ds.y.encoding["_FillValue"] - except KeyError: - pass + ds = remove_FillValue_from_coordinate_vars(ds) # Note: Here, the x and y coordinate variables *do* have valid_range set diff --git a/seaice_ecdr/tb_data.py b/seaice_ecdr/tb_data.py index c8c67e4e..06175552 100644 --- a/seaice_ecdr/tb_data.py +++ b/seaice_ecdr/tb_data.py @@ -413,15 +413,8 @@ def get_data_url_from_data_source( data_source: str, ) -> str: """Return the URL for this data_source""" - data_url = "" - if data_source == "NSIDC-0001": - data_url = "https://nsidc.org/data/nsidc-0001" - elif data_source == "NSIDC-0007": - data_url = "https://nsidc.org/data/nsidc-0007" - elif data_source == "AU_SI25": - data_url = "https://nsidc.org/data/au_si25" - elif data_source == "AU_SI12": - data_url = "https://nsidc.org/data/au_si12" + + data_url = f"https://nsidc.org/data/{data_source.lower()}" return data_url diff --git a/seaice_ecdr/temporal_composite_daily.py b/seaice_ecdr/temporal_composite_daily.py index 81f75b63..1ea046cc 100644 --- a/seaice_ecdr/temporal_composite_daily.py +++ b/seaice_ecdr/temporal_composite_daily.py @@ -22,6 +22,7 @@ get_daily_climatology_mask, get_non_ocean_mask, nh_polehole_mask, + remove_FillValue_from_coordinate_vars, ) from seaice_ecdr.cli.util import datetime_to_date from seaice_ecdr.constants import DEFAULT_BASE_OUTPUT_DIR @@ -418,7 +419,6 @@ def calc_stdev_inputs( try: assert verify_array_nominal_range(nt_conc) except AssertionError: - breakpoint() logger.error("nt_conc_median is not consistent with values 0-1") # Set a mask for the application of standard deviation calculation @@ -936,8 +936,9 @@ def write_tie_netcdf( elif varname not in uncompressed_fields: nc_encoding[varname] = {"zlib": True} - tie_ds.variables["x"].encoding["_FillValue"] = None - tie_ds.variables["y"].encoding["_FillValue"] = None + # tie_ds.variables["x"].encoding["_FillValue"] = None + # tie_ds.variables["y"].encoding["_FillValue"] = None + tie_ds = remove_FillValue_from_coordinate_vars(tie_ds) tie_ds.to_netcdf( output_filepath, encoding=nc_encoding, diff --git a/seaice_ecdr/tests/integration/test_ancillary.py b/seaice_ecdr/tests/integration/test_ancillary.py index 4b67a98b..5a0cfc27 100644 --- a/seaice_ecdr/tests/integration/test_ancillary.py +++ b/seaice_ecdr/tests/integration/test_ancillary.py @@ -19,6 +19,7 @@ DEFAULT_ANCILLARY_SOURCE, DEFAULT_CDR_RESOLUTION, ECDR_PRODUCT_VERSION, + NSIDC_NFS_SHARE_DIR, ) from seaice_ecdr.grid_id import get_grid_id @@ -81,14 +82,13 @@ def test_adj123_does_not_overlap_land(): def test_ancillary_filepaths(): """test that the directory/names of the ancillary files for publication are as expected""" - # ancillary_source = DEFAULT_ANCILLARY_SOURCE - # resolution = DEFAULT_CDR_RESOLUTION ancillary_source = cast(ANCILLARY_SOURCES, DEFAULT_ANCILLARY_SOURCE) resolution = cast(ECDR_SUPPORTED_RESOLUTIONS, DEFAULT_CDR_RESOLUTION) hemispheres = get_args(Hemisphere) product_version = ECDR_PRODUCT_VERSION - expected_dir = Path(f"/share/apps/G02202_V5/{product_version}_ancillary") + # expected_dir = Path(f"/share/apps/G02202_V5/{product_version}_ancillary") + expected_dir = Path(f"{NSIDC_NFS_SHARE_DIR}/{product_version}_ancillary") assert expected_dir.is_dir() expected_fp_template = Template("G02202-ancillary-${grid_id}-${product_version}.nc") diff --git a/seaice_ecdr/tests/integration/test_publish_daily.py b/seaice_ecdr/tests/integration/test_publish_daily.py index 9efb37d0..ab8f2c00 100644 --- a/seaice_ecdr/tests/integration/test_publish_daily.py +++ b/seaice_ecdr/tests/integration/test_publish_daily.py @@ -26,11 +26,8 @@ def test_publish_daily_nc(base_output_dir_test_path): # noqa # but the integration tests do not currently run with the AMSR2 platform # config, so the `prototype_am2` group is not present. # assert "prototype_am2" in ds.groups - assert "/cdr_supplementary" in ds.groups - # assert "valid_range" not in ds.time.attrs.keys() - # assert "valid_range" not in ds.x.attrs.keys() - # assert "valid_range" not in ds.y.attrs.keys() + assert "/cdr_supplementary" in ds.groups # Assert that the checksums exist where we expect them to be. checksum_filepath = ( diff --git a/seaice_ecdr/tests/integration/test_publish_monthly.py b/seaice_ecdr/tests/integration/test_publish_monthly.py index 665b3035..b642991b 100644 --- a/seaice_ecdr/tests/integration/test_publish_monthly.py +++ b/seaice_ecdr/tests/integration/test_publish_monthly.py @@ -27,11 +27,10 @@ def test_publish_monthly_nc(base_output_dir_test_path): # noqa # but the integration tests do not currently run with the AMSR2 platform # config, so the `prototype_am2` group is not present. # assert "prototype_am2" in ds.groups + assert "/cdr_supplementary" in ds.groups assert "valid_range" not in ds.time.attrs.keys() - # assert "valid_range" not in ds.x.attrs.keys() - # assert "valid_range" not in ds.y.attrs.keys() # Assert that the checksums exist where we expect them to be. checksum_filepath = ( diff --git a/seaice_ecdr/tests/unit/test_dates_handled_differently.py b/seaice_ecdr/tests/unit/test_dates_handled_differently.py index 2a47d1e1..bd9aa91a 100644 --- a/seaice_ecdr/tests/unit/test_dates_handled_differently.py +++ b/seaice_ecdr/tests/unit/test_dates_handled_differently.py @@ -9,10 +9,7 @@ def test_all_tbs_for_day_are_bad(): - """Test that days whose TBs are known to be bad are known - Initially, these test values are from CDRv4 code: - /vagrant/source/tests/test_cdr_configuration.py - """ + """Test that days whose TBs are known to be bad are known""" # North assert day_has_all_bad_tbs("n07", NORTH, dt.date(1984, 7, 3)) assert day_has_all_bad_tbs("n07", NORTH, dt.date(1986, 12, 5)) @@ -31,8 +28,6 @@ def test_day_has_all_empty_fields(): """Test for days that should have no data in it because it is in a long period of no available data such that not even temporal interpolation of the data should be used. - Original test dates came from CDRv4 code: - /vagrant/source/tests/test_cdr_configuration.py """ # Date is before start of first possible platform's data (n07) assert day_has_all_empty_fields("n07", "north", dt.date(1978, 10, 24)) @@ -42,312 +37,9 @@ def test_day_has_all_empty_fields(): # North assert day_has_all_empty_fields("n07", "north", dt.date(1984, 7, 3)) - # assert day_has_all_empty_fields("F08", "north", dt.date(1990, 12, 27)) assert not day_has_all_empty_fields("F17", "north", dt.date(2010, 12, 10)) # South assert day_has_all_empty_fields("n07", "south", dt.date(1984, 8, 23)) assert day_has_all_empty_fields("F08", "south", dt.date(1987, 12, 10)) assert not day_has_all_empty_fields("F17", "south", dt.date(2010, 12, 10)) - - -""" -def test_daily_filename_north(): - expected = f"sic_psn12.5_20210101_am2_{ECDR_PRODUCT_VERSION}.nc" - - actual = standard_daily_filename( - hemisphere=NORTH, resolution="12.5", platform_id="am2", date=dt.date(2021, 1, 1) - ) - - assert actual == expected - - -def test_daily_filename_south(): - expected = f"sic_pss12.5_20210101_am2_{ECDR_PRODUCT_VERSION}.nc" - - actual = standard_daily_filename( - hemisphere=SOUTH, resolution="12.5", platform_id="am2", date=dt.date(2021, 1, 1) - ) - - assert actual == expected - - -def test_nrt_daily_filename(): - expected = f"sic_psn12.5_20210101_am2_{ECDR_NRT_PRODUCT_VERSION}_P.nc" - - actual = nrt_daily_filename( - hemisphere=NORTH, resolution="12.5", platform_id="am2", date=dt.date(2021, 1, 1) - ) - - assert actual == expected - - -def test_nrt_monthly_filename(): - expected = f"sic_psn25_202409_F17_{ECDR_NRT_PRODUCT_VERSION}_P.nc" - - actual = nrt_monthly_filename( - hemisphere=NORTH, - resolution="25", - platform_id="F17", - year=2024, - month=9, - ) - - assert actual == expected - - -def test_daily_aggregate_filename(): - expected = f"sic_psn12.5_20210101-20211231_{ECDR_PRODUCT_VERSION}.nc" - - actual = standard_daily_aggregate_filename( - hemisphere=NORTH, - resolution="12.5", - start_date=dt.date(2021, 1, 1), - end_date=dt.date(2021, 12, 31), - ) - - assert actual == expected - - -def test_monthly_filename_north(): - expected = f"sic_psn12.5_202101_am2_{ECDR_PRODUCT_VERSION}.nc" - - actual = standard_monthly_filename( - hemisphere=NORTH, - resolution="12.5", - platform_id="am2", - year=2021, - month=1, - ) - - assert actual == expected - - -def test_monthly_filename_south(): - expected = f"sic_pss12.5_202101_am2_{ECDR_PRODUCT_VERSION}.nc" - - actual = standard_monthly_filename( - hemisphere=SOUTH, - resolution="12.5", - platform_id="am2", - year=2021, - month=1, - ) - - assert actual == expected - - -def test_monthly_aggregate_filename(): - expected = f"sic_pss12.5_202101-202112_{ECDR_PRODUCT_VERSION}.nc" - - actual = standard_monthly_aggregate_filename( - hemisphere=SOUTH, - resolution="12.5", - start_year=2021, - start_month=1, - end_year=2021, - end_month=12, - ) - - assert actual == expected - - -def test_daily_platform_id_from_filename(): - expected_platform_id: Final = "am2" - fn = standard_daily_filename( - hemisphere=NORTH, - resolution="12.5", - platform_id=expected_platform_id, - date=dt.date(2021, 1, 1), - ) - - actual_platform_id = platform_id_from_filename(fn) - - assert expected_platform_id == actual_platform_id - - -def test_monthly_platform_id_from_filename(): - expected_platform_id: Final = "F17" - fn = standard_monthly_filename( - hemisphere=SOUTH, - resolution="12.5", - platform_id=expected_platform_id, - year=2021, - month=1, - ) - - actual_platform_id = platform_id_from_filename(fn) - - assert expected_platform_id == actual_platform_id - - -def test_daily_platform_id_from_daily_nrt_filename(): - expected_platform_id: Final = "F17" - fn = nrt_daily_filename( - hemisphere=SOUTH, - resolution="25", - platform_id=expected_platform_id, - date=dt.date(2021, 1, 1), - ) - - actual_platform_id = platform_id_from_filename(fn) - - assert expected_platform_id == actual_platform_id - - -def test_daily_platform_id_from_monthly_nrt_filename(): - expected_platform_id: Final = "F17" - fn = nrt_monthly_filename( - hemisphere=SOUTH, - resolution="25", - platform_id=expected_platform_id, - year=2024, - month=9, - ) - - actual_platform_id = platform_id_from_filename(fn) - - assert expected_platform_id == actual_platform_id - - -def test_find_standard_monthly_netcdf_files_platform_wildcard(fs): - monthly_output_dir = Path("/path/to/data/dir/monthly") - fs.create_dir(monthly_output_dir) - platform_ids: list[SUPPORTED_PLATFORM_ID] = ["am2", "F17"] - for platform_id in platform_ids: - fake_monthly_filename = standard_monthly_filename( - hemisphere=NORTH, - resolution="25", - platform_id=platform_id, - year=2021, - month=1, - ) - fake_monthly_filepath = monthly_output_dir / fake_monthly_filename - fs.create_file(fake_monthly_filepath) - - found_files_wildcard_platform_id = find_standard_monthly_netcdf_files( - search_dir=monthly_output_dir, - hemisphere=NORTH, - resolution="25", - platform_id="*", - year=2021, - month=1, - ) - - assert len(found_files_wildcard_platform_id) == 2 - - -def test_find_standard_monthly_netcdf_files_yearmonth_wildcard(fs): - monthly_output_dir = Path("/path/to/data/dir/monthly") - fs.create_dir(monthly_output_dir) - for year, month in [(2022, 1), (2022, 2)]: - fake_monthly_filename = standard_monthly_filename( - hemisphere=NORTH, - resolution="25", - platform_id="F17", - year=year, - month=month, - ) - fake_monthly_filepath = monthly_output_dir / fake_monthly_filename - fs.create_file(fake_monthly_filepath) - - found_files_wildcard_platform_id = find_standard_monthly_netcdf_files( - search_dir=monthly_output_dir, - hemisphere=NORTH, - resolution="25", - platform_id="F17", - year="*", - month="*", - ) - - assert len(found_files_wildcard_platform_id) == 2 - - -def test_date_range(): - start_date = dt.date(2021, 1, 2) - end_date = dt.date(2021, 1, 5) - expected = [ - start_date, - dt.date(2021, 1, 3), - dt.date(2021, 1, 4), - end_date, - ] - actual = list(date_range(start_date=start_date, end_date=end_date)) - - assert expected == actual - - -def test_get_dates_by_year(): - actual = get_dates_by_year( - [ - dt.date(2021, 1, 3), - dt.date(2021, 1, 2), - dt.date(2022, 1, 1), - dt.date(1997, 3, 2), - dt.date(1997, 4, 15), - dt.date(2022, 1, 2), - ] - ) - - expected = [ - [ - dt.date(1997, 3, 2), - dt.date(1997, 4, 15), - ], - [ - dt.date(2021, 1, 2), - dt.date(2021, 1, 3), - ], - [ - dt.date(2022, 1, 1), - dt.date(2022, 1, 2), - ], - ] - - assert actual == expected - - -def test_get_num_missing_pixels(monkeypatch): - _mock_oceanmask = xr.DataArray( - [ - True, - False, - True, - True, - ], - dims=("y",), - coords=dict(y=list(range(4))), - ) - monkeypatch.setattr( - util, "get_ocean_mask", lambda *_args, **_kwargs: _mock_oceanmask - ) - - _mock_sic = xr.DataArray( - [ - 0.99, - np.nan, - 0.25, - np.nan, - ], - dims=("y",), - coords=dict(y=list(range(4))), - ) - - detected_missing = get_num_missing_pixels( - seaice_conc_var=_mock_sic, - hemisphere="north", - resolution="12.5", - ancillary_source="CDRv5", - ) - - assert detected_missing == 1 - - -def test_raise_error_for_dates(): - # If no dates are passed, no error should be raised. - raise_error_for_dates(error_dates=[]) - - # If one or more dates are passed, an error should be raised. - with pytest.raises(RuntimeError): - raise_error_for_dates(error_dates=[dt.date(2011, 1, 1)]) -"""