Skip to content

Commit

Permalink
Adding new reader and lat/lon match transform (#184)
Browse files Browse the repository at this point in the history
We are adding a new reader to handle geoval files, but with the idea
that it can be generalized to other types of files. A new latlon match
transform was added using some work that @danholdaway had developed.

List of changes:
- New method in `data_collections.py` to retrieve a collection by name
- Generic `DataFile` reader
- `latlon_match` transform to match lat/lon coordinates from one
collection to another
- `DataFile` test yaml and some files for testing

Resolves #177
  • Loading branch information
asewnath authored Apr 19, 2024
1 parent eac6d99 commit a40e651
Show file tree
Hide file tree
Showing 9 changed files with 315 additions and 1 deletion.
2 changes: 1 addition & 1 deletion requirements-github.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ xarray>=2022.6.0
seaborn>=0.12.2
hvplot>=0.8.2
nbconvert>=6.5.4
bokeh>=3.1.1
bokeh<3.5.0,>=3.4.0
geopandas>=0.13.2
geoviews>=1.10.0
nbsite
Expand Down
5 changes: 5 additions & 0 deletions src/eva/data/data_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,11 @@ def add_variable_to_collection(self, collection_name, group_name, variable_name,

# ----------------------------------------------------------------------------------------------

def get_data_collection(self, collection_name):

    """
    Return the collection stored under the given name.

    Args:
        collection_name (str): Key of the collection in the internal collections mapping.

    Returns:
        The object stored for ``collection_name``. This is the stored object itself, not a
        copy, so changes made to it by the caller are visible through this container.

    Raises:
        KeyError: If no collection is registered under ``collection_name``.
    """

    collection = self._collections[collection_name]
    return collection

# ----------------------------------------------------------------------------------------------

def get_variable_data_array(self, collection_name, group_name, variable_name,
channels=None, levels=None, datatypes=None):

Expand Down
109 changes: 109 additions & 0 deletions src/eva/data/geoval_space.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# (C) Copyright 2024 NOAA/NWS/EMC
#
# (C) Copyright 2024 United States Government as represented by the Administrator of the
# National Aeronautics and Space Administration. All Rights Reserved.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.

# --------------------------------------------------------------------------------------------------

import os
import netCDF4 as nc
import numpy as np
from xarray import Dataset, open_dataset
from eva.utilities.config import get
from eva.data.eva_dataset_base import EvaDatasetBase
from eva.utilities.utils import parse_channel_list


class GeovalSpace(EvaDatasetBase):

    """
    A class for handling geoval files
    """

    def execute(self, dataset_config, data_collections, timing):

        """
        Reads the requested variables from a geoval file and adds them to a data collection.

        The file named by ``data_file`` is opened with xarray, every variable not listed in
        ``variables`` is dropped, the remaining variables are renamed to
        ``<instrument_name>::<variable>`` and the second dimension of any multi-dimensional
        variable is renamed to ``Level``.

        Args:
            dataset_config (dict): Configuration dictionary for the dataset. Keys read here:
                ``name``, ``data_file``, ``instrument_name``, ``variables`` and the optional
                ``missing_value_threshold`` (default 1.0e30) and ``levels`` (default []).
            data_collections (DataCollections): Object for managing data collections.
            timing (Timing): Timing object for tracking execution time (not used in this method).
        """

        # Set the collection name
        # -----------------------
        collection_name = get(dataset_config, self.logger, 'name')

        # Get missing value threshold
        # ---------------------------
        threshold = float(get(dataset_config, self.logger, 'missing_value_threshold', 1.0e30))

        # Get levels to plot profiles
        # ---------------------------
        levels_str_or_list = get(dataset_config, self.logger, 'levels', [])

        # Convert levels to list. The comparison must be by equality: an identity test against
        # a fresh list literal (`is not []`) is always True, so the default was never detected.
        # NOTE(review): `levels` is parsed but not consumed further in this method.
        levels = []
        if levels_str_or_list != []:
            levels = parse_channel_list(levels_str_or_list, self.logger)

        # Filename to be used for reads
        # -----------------------------
        data_filename = get(dataset_config, self.logger, 'data_file')

        # Get instrument name
        instr_name = get(dataset_config, self.logger, 'instrument_name')

        # Open instrument files xarray dataset
        instr_ds = open_dataset(data_filename)

        # Enforce that a variable exists, do not default to all variables
        variables = get(dataset_config, self.logger, 'variables')
        if not variables:
            self.logger.abort('A variables list needs to be defined in the config file.')
        vars_to_remove = list(set(instr_ds.keys()) - set(variables))
        instr_ds = instr_ds.drop_vars(vars_to_remove)

        # Rename variables and nval dimension
        rename_dict = {}
        rename_dims_dict = {}
        for v in variables:
            # Retrieve dimension names; only variables with more than one dimension get their
            # second dimension renamed
            dims = instr_ds[v].dims
            if len(dims) > 1:
                rename_dims_dict[dims[1]] = 'Level'
            rename_dict[v] = f'{instr_name}::{v}'
        instr_ds = instr_ds.rename(rename_dict)
        instr_ds = instr_ds.rename_dims(rename_dims_dict)

        # Add the dataset_config to the collections
        data_collections.create_or_add_to_collection(collection_name, instr_ds)

        # Nan out unphysical values
        data_collections.nan_float_values_outside_threshold(threshold)

        # Display the contents of the collections for helping the user with making plots
        data_collections.display_collections()

    def generate_default_config(self, filenames, collection_name):

        """
        Generate a default configuration for the dataset.

        This method is intended to generate a default configuration for the dataset based on
        the provided filenames and collection name, as a starting point for creating a
        configuration for the dataset.

        Args:
            filenames: Filenames or file paths relevant to the dataset.
            collection_name (str): Name of the collection for the dataset.

        Returns:
            dict: A dictionary representing the default configuration for the dataset.
            NOTE: not implemented for this reader yet; the method currently returns None.
        """

        pass
100 changes: 100 additions & 0 deletions src/eva/tests/config/testGeovalSpace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Datasets for the GeovalSpace reader test. The exp_* and ctrl_* entries read the same
# instrument (amsua_n19) from two different sets of files (presumably experiment and control).
datasets:

# GeovalSpace dataset that also supplies a 'levels' entry
- name: exp_geovals_with_lvls
type: GeovalSpace
data_file: ${data_input_path}/swell-hofx.amsua_n19-geovals.20211211T210000Z.nc4
levels: &exp_levels 33,60
instrument_name: amsua_n19
variables: &exp_vars_with_lvls ['mole_fraction_of_carbon_dioxide_in_air']

# GeovalSpace dataset without 'levels'; the reader falls back to its empty default
- name: exp_geovals
type: GeovalSpace
data_file: ${data_input_path}/swell-hofx.amsua_n19-geovals.20211211T210000Z.nc4
instrument_name: amsua_n19
variables: &exp_vars ['vegetation_area_fraction', 'leaf_area_index']

# MetaData group of the exp observation file; the latlon_match transform reads
# MetaData latitude/longitude from the collections named in base_latlon / match_base_latlon_to
- name: exp_latlon
type: IodaObsSpace
filenames:
- ${data_input_path}/swell-hofx.amsua_n19.20211211T210000Z.nc4
groups:
- name: MetaData

# ctrl counterpart of exp_geovals_with_lvls
- name: ctrl_geovals_with_lvls
type: GeovalSpace
data_file: ${data_input_path}/ncdiag.x0048v2-geovals.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
levels: &ctrl_levels 33,60
instrument_name: amsua_n19
variables: &ctrl_vars_with_lvls ['mole_fraction_of_carbon_dioxide_in_air']

# ctrl counterpart of exp_geovals
- name: ctrl_geovals
type: GeovalSpace
data_file: ${data_input_path}/ncdiag.x0048v2-geovals.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
instrument_name: amsua_n19
variables: &ctrl_vars ['vegetation_area_fraction', 'leaf_area_index']

# MetaData group of the ctrl observation file (lat/lon source for the ctrl collections)
- name: ctrl_latlon
type: IodaObsSpace
filenames:
- ${data_input_path}/ncdiag.x0048v2.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
groups:
- name: MetaData

# Transforms: reorder the ctrl geovals onto the exp lat/lon ordering, then difference them.
transforms:

# Reindex the ctrl_geovals variables using the nearest-lat/lon index between ctrl_latlon
# and exp_latlon; the result is stored as collection ctrl_geovals_matched_index
- transform: latlon_match
new_collection_name: ctrl_geovals_matched_index
base_latlon: ctrl_latlon
match_base_latlon_to: exp_latlon
base_collection: ctrl_geovals::amsua_n19::${variable}
for:
variable: *ctrl_vars

# Same matching for the collection that was read with a 'levels' entry
- transform: latlon_match
new_collection_name: ctrl_geovals_with_lvls_matched_index
base_latlon: ctrl_latlon
match_base_latlon_to: exp_latlon
base_collection: ctrl_geovals_with_lvls::amsua_n19::${variable}
for:
variable: *ctrl_vars_with_lvls

# exp minus matched ctrl, for each variable in exp_vars; stored in the exp_geovals collection
- transform: arithmetic
new name: exp_geovals::amsua_n19::exp_minus_ctrl_${variable}
equals: exp_geovals::amsua_n19::${variable}-ctrl_geovals_matched_index::amsua_n19::${variable}
for:
variable: *exp_vars

# Graphics: one map scatter figure per variable in exp_vars, showing the exp-minus-ctrl
# difference produced by the arithmetic transform.
graphics:

plotting_backend: Emcpy
figure_list:

- batch figure:
variables: *exp_vars
# vmin/vmax/cmap are derived from the difference variable and referenced below as
# ${dynamic_vmin}, ${dynamic_vmax} and ${dynamic_cmap}
dynamic options:
- type: vminvmaxcmap
data variable: exp_geovals::amsua_n19::exp_minus_ctrl_${variable}
figure:
figure size: [20,10]
layout: [1,1]
title: 'JEDI - GSI | AMSU-A NOAA-19 | Geoval | ${variable}'
output name: map_plots/geovals/amsua_n19/${variable}/observations_amsua_n19_${variable}.png
plots:
- mapping:
projection: plcarr
domain: global
add_map_features: ['coastline']
add_colorbar:
label: '${variable}'
layers:
# Points are placed at the exp lat/lon, i.e. the ordering the ctrl data was matched to
- type: MapScatter
longitude:
variable: exp_latlon::MetaData::longitude
latitude:
variable: exp_latlon::MetaData::latitude
data:
variable: exp_geovals::amsua_n19::exp_minus_ctrl_${variable}
markersize: 2
cmap: ${dynamic_cmap}
vmin: ${dynamic_vmin}
vmax: ${dynamic_vmax}
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/eva/tests/data/swell-hofx.amsua_n19.20211211T210000Z.nc4
Git LFS file not shown
88 changes: 88 additions & 0 deletions src/eva/transforms/latlon_match.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# (C) Copyright 2024 NOAA/NWS/EMC
#
# (C) Copyright 2024 United States Government as represented by the Administrator of the
# National Aeronautics and Space Administration. All Rights Reserved.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.

import numpy as np
from xarray import Dataset, DataArray
from eva.utilities.config import get
from eva.utilities.logger import Logger
from eva.transforms.transform_utils import parse_for_dict, split_collectiongroupvariable


def latlon_match(config, data_collections):

    """
    Applies lat/lon match transform to a given collection.

    For each location index, the position in the base lat/lon arrays that is closest (smallest
    sum of absolute latitude and longitude differences) to the corresponding target location is
    found. The requested variables of the base collection are reordered with those positions and
    stored, together with the rest of the base collection, under a new collection name.

    The base collection itself is left untouched: the reordering is done on deep copies.

    Args:
        config (dict): A configuration dictionary containing transformation parameters.
            Keys read here:
            base_collection: collection::group::variable to perform the latlon matching on.
            base_latlon: the collection with lat/lon coordinates corresponding to the base
                collection (read from its MetaData group).
            match_base_latlon_to: the collection with lat/lon coordinates corresponding to what
                you want to match the base latlon to (read from its MetaData group).
            new_collection_name: name of the collection that receives the matched data.
        data_collections (DataCollections): An instance of the DataCollections class containing
            input data.

    Returns:
        None
    """

    # Create a logger
    logger = Logger('LatLonMatchTransform')

    # Parse the for dictionary
    _, _, variables = parse_for_dict(config, logger)

    # Parse config for names
    base_collection = get(config, logger, 'base_collection')
    base_latlon_name = get(config, logger, 'base_latlon')
    match_latlon_name = get(config, logger, 'match_base_latlon_to')

    # Extract collection and group
    cgv = split_collectiongroupvariable(logger, base_collection)

    # Retrieve lat/lon arrays using collection names
    base_lat = data_collections.get_variable_data_array(base_latlon_name, 'MetaData',
                                                        'latitude').to_numpy()
    base_lon = data_collections.get_variable_data_array(base_latlon_name, 'MetaData',
                                                        'longitude').to_numpy()
    match_lat = data_collections.get_variable_data_array(match_latlon_name, 'MetaData',
                                                         'latitude').to_numpy()
    match_lon = data_collections.get_variable_data_array(match_latlon_name, 'MetaData',
                                                         'longitude').to_numpy()

    # Find matching index (this can be updated using dask)
    # NOTE(review): the loop runs over the base length while indexing the match arrays, so it
    # assumes both lat/lon collections hold the same number of locations -- confirm.
    matching_index = [
        (abs(base_lat - match_lat[i]) + abs(base_lon - match_lon[i])).argmin()
        for i in range(len(base_lat))
    ]

    # Work on a deep copy of the base collection. get_data_collection hands back the stored
    # dataset itself, so assigning into it would silently reorder the base collection as well.
    match_ds = data_collections.get_data_collection(cgv[0]).copy(deep=True)

    # Loop through the variables and replace each one by its reordered version
    for variable in variables:
        # Copy before assigning to .values so the array held by the base collection is not
        # modified in place
        var_array = data_collections.get_variable_data_array(cgv[0], cgv[1],
                                                             variable).copy(deep=True)

        # Index data array with matching_index and then save to new collection
        var_array.values = var_array.values[matching_index]
        match_ds[f'{cgv[1]}::{variable}'] = var_array

    # get new collection name
    new_collection_name = get(config, logger, 'new_collection_name')

    # add new collection to data collections
    data_collections.create_or_add_to_collection(new_collection_name, match_ds)
    match_ds.close()

0 comments on commit a40e651

Please sign in to comment.