Skip to content

Commit

Permalink
Multiple Timeseries (JCSDA-internal#209)
Browse files Browse the repository at this point in the history
## Description

Extending the time series utility to process multiple time series. This
allows for time series comparisons between different collections. Note
that the time series section must be a list.
  • Loading branch information
asewnath authored Oct 9, 2024
1 parent cab65a4 commit 767125f
Show file tree
Hide file tree
Showing 12 changed files with 272 additions and 122 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/yaml_coding_norms.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
name: Check YAML Coding Norms
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- id: yaml-lint
name: yaml-lint
uses: ibiqlik/action-yamllint@v3
Expand All @@ -27,7 +27,7 @@ jobs:
format: colored
config_file: .yamllint.yml

- uses: actions/upload-artifact@v2
- uses: actions/upload-artifact@v4
if: always()
with:
name: yamllint-logfile
Expand Down
2 changes: 1 addition & 1 deletion requirements-github.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ xarray>=2022.6.0
seaborn>=0.12.2
hvplot>=0.8.2
nbconvert>=6.5.4
bokeh<3.5.0,>=3.4.0
bokeh<3.6.0,>=3.5.0
geopandas>=0.13.2
geoviews>=1.10.0
nbsite
Expand Down
22 changes: 11 additions & 11 deletions src/eva/data/data_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,9 @@ def create_or_add_to_collection(self, collection_name, collection, concat_dimens
"""

# If time_series collection name must also be time_series
if self.time_series and collection_name != 'time_series':
self.logger.abort('In create_or_add_to_collection: time_series collection must ' +
'be \'time_series\'')
if self.time_series and 'time_series' not in collection_name:
self.logger.abort('In get_variable_data: time_series collection must ' +
'have name containing \'time_series\'')

# Collections should only be xarray datasets
if not isinstance(collection, Dataset):
Expand Down Expand Up @@ -158,9 +158,9 @@ def add_variable_to_collection(self, collection_name, group_name, variable_name,
"""

# If time_series collection name must also be time_series
if self.time_series and collection_name != 'time_series':
self.logger.abort('In add_variable_to_collection: time_series collection must ' +
'be \'time_series\'')
if self.time_series and 'time_series' not in collection_name:
self.logger.abort('In get_variable_data: time_series collection must ' +
'have name containing \'time_series\'')

# Assert that new variable is an xarray Dataarray
if not isinstance(variable, DataArray):
Expand Down Expand Up @@ -211,9 +211,9 @@ def get_variable_data_array(self, collection_name, group_name, variable_name,
"""

# If time_series collection name must also be time_series
if self.time_series and collection_name != 'time_series':
self.logger.abort('In get_variable_data_array: time_series collection must ' +
'be \'time_series\'')
if self.time_series and 'time_series' not in collection_name:
self.logger.abort('In get_variable_data: time_series collection must ' +
'have name containing \'time_series\'')

group_variable_name = group_name + '::' + variable_name
data_array = self._collections[collection_name][group_variable_name]
Expand Down Expand Up @@ -293,9 +293,9 @@ def get_variable_data(self, collection_name, group_name, variable_name,
"""

# If time_series collection name must also be time_series
if self.time_series and collection_name != 'time_series':
if self.time_series and 'time_series' not in collection_name:
self.logger.abort('In get_variable_data: time_series collection must ' +
'be \'time_series\'')
'have name containing \'time_series\'')

variable_array = self.get_variable_data_array(collection_name, group_name, variable_name,
channels, levels, datatypes)
Expand Down
173 changes: 96 additions & 77 deletions src/eva/eva_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from datetime import datetime
import argparse
import os
from collections import defaultdict

from eva.utilities.config import get
from eva.utilities.logger import Logger
Expand Down Expand Up @@ -94,85 +95,103 @@ def read_transform_time_series(logger, timing, eva_dict, data_collections):
None
"""

# Check for required keys
# -----------------------
required_keys = [
'begin_date',
'final_date',
'interval',
'collection',
'variables',
]
for key in required_keys:
logger.assert_abort(key in eva_dict['time_series'], 'If running Eva in time series ' +
f'mode the time series config must contain "{key}"')

# Write message that this is a time series run
logger.info('This instance of Eva is being used to accumulate a time series.')

# Optionally suppress the display of the collection
suppress_collection_display = get(eva_dict, logger, 'suppress_collection_display', False)

# Get the datasets configuration
time_series_config = eva_dict['time_series']

# Extract the dates of the time series
begin_date = time_series_config['begin_date']
final_date = time_series_config['final_date']
interval = time_series_config['interval']

# Convert begin and end dates from ISO strings to datetime objects
begin_date = datetime.fromisoformat(begin_date)
final_date = datetime.fromisoformat(final_date)

# Convert interval ISO string to timedelta object
interval = iso_duration_to_timedelta(logger, interval)

# Make list of dates from begin to end with interval
dates = []
date = begin_date
count = 0
while date <= final_date:
dates.append(date)
date += interval
count += 1
# Abort if count hits one million
logger.assert_abort(count < 1e6, 'You are planning to read more than one million ' +
'time steps. This is likely an error. Please check your configuration.')

# Get the datasets configuration
datasets_config = get(eva_dict, logger, 'datasets')

# Assert that datasets_config is the same length as dates
logger.assert_abort(len(datasets_config) == len(dates), 'When running in time series mode ' +
'the number of datasets must be the same as the number of dates.')

# Loop over datasets reading each one in turn, internally appending the data_collections
for ind, dataset_config in enumerate(datasets_config):

# Create a temporary collection for this time step
data_collections_tmp = DataCollections()

# Prepare diagnostic data
logger.info('Running data driver')
timing.start('DataDriverExecute')
data_driver(dataset_config, data_collections_tmp, timing, logger)
timing.stop('DataDriverExecute')

# Perform any transforms on the fly
# Iterate through list of time series dictionaries
for time_series_config in eva_dict['time_series']:

# Check for required keys
# -----------------------
required_keys = [
'begin_date',
'final_date',
'interval',
'collection',
'variables',
]
for key in required_keys:
logger.assert_abort(key in time_series_config, 'If running Eva in time series ' +
f'mode the time series config must contain "{key}"')

# Write message that this is a time series run
logger.info('This instance of Eva is being used to accumulate a time series.')

# Optionally suppress the display of the collection
suppress_collection_display = get(eva_dict, logger, 'suppress_collection_display', False)

# Extract the dates of the time series
begin_date = time_series_config['begin_date']
final_date = time_series_config['final_date']
interval = time_series_config['interval']

# Convert begin and end dates from ISO strings to datetime objects
begin_date = datetime.fromisoformat(begin_date)
final_date = datetime.fromisoformat(final_date)

# Convert interval ISO string to timedelta object
interval = iso_duration_to_timedelta(logger, interval)

# Make list of dates from begin to end with interval
dates = []
date = begin_date
count = 0
while date <= final_date:
dates.append(date)
date += interval
count += 1
# Abort if count hits one million
logger.assert_abort(count < 1e6, 'You are planning to read more than one million ' +
'time steps. This is likely an error. Please check your ' +
'configuration.')

# Get all datasets configuration
all_datasets = get(eva_dict, logger, 'datasets')

# Find all dataset_configs with collection name
datasets_config = []
for dataset in all_datasets:
if dataset['name'] == time_series_config["collection"]:
datasets_config.append(dataset)

# Save transforms to transform_dict based on collection
transform_dict = defaultdict(list)
if 'transforms' in eva_dict:
logger.info(f'Running transform driver')
timing.start('TransformDriverExecute')
transform_driver(eva_dict, data_collections_tmp, timing, logger)
timing.stop('TransformDriverExecute')
for transform in get(eva_dict, logger, 'transforms'):
# Get collection name
name = transform['new name'].split('::')[0]
if name == time_series_config['collection']:
transform_dict['transforms'].append(transform)

# Assert that datasets_config is the same length as dates
logger.assert_abort(len(datasets_config) == len(dates), 'When running in time ' +
'series mode the number of datasets must be the same as the ' +
'number of dates.')

# Loop over datasets reading each one in turn, internally appending the data_collections
for ind, dataset_config in enumerate(datasets_config):

# Create a temporary collection for this time step
data_collections_tmp = DataCollections()

# Prepare diagnostic data
logger.info('Running data driver')
timing.start('DataDriverExecute')
data_driver(dataset_config, data_collections_tmp, timing, logger)
timing.stop('DataDriverExecute')

# Perform any transforms on the fly
if transform_dict:
logger.info(f'Running transform driver')
timing.start('TransformDriverExecute')
transform_driver(transform_dict, data_collections_tmp, timing, logger)
timing.stop('TransformDriverExecute')

# Collapse data into time series
date = dates[ind]
collapse_collection_to_time_series(logger, ind, date, time_series_config,
data_collections, data_collections_tmp)

# Collapse data into time series
collapse_collection_to_time_series(logger, ind, dates, time_series_config, data_collections,
data_collections_tmp)

if not suppress_collection_display:
logger.info('Computing of Eva time series complete: status of collection:')
data_collections.display_collections()
if not suppress_collection_display:
logger.info('Computing of Eva time series complete: status of collection:')
data_collections.display_collections()


# --------------------------------------------------------------------------------------------------
Expand Down
119 changes: 119 additions & 0 deletions src/eva/tests/config/testIodaObsSpaceAmsuaN19_Multiple_TimeSeries.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
suppress_collection_display: false
datasets:
- name: control
type: IodaObsSpace
filenames:
- ${data_input_path}/ctrl_amsua_n19.20230726T030000Z.nc4
channels: 3,8
groups:
- name: ObsValue
variables:
- brightnessTemperature
- name: GsiHofXBc
- name: hofx0
- name: MetaData
- name: oman
- name: control
type: IodaObsSpace
filenames:
- ${data_input_path}/ctrl_amsua_n19.20230726T090000Z.nc4
channels: 3,8
groups:
- name: ObsValue
variables:
- brightnessTemperature
- name: GsiHofXBc
- name: hofx0
- name: MetaData
- name: oman
- name: experiment
type: IodaObsSpace
filenames:
- ${data_input_path}/exp_amsua_n19.20230726T030000Z.nc4
channels: 3,8
groups:
- name: ObsValue
variables:
- brightnessTemperature
- name: GsiHofXBc
- name: hofx0
- name: MetaData
- name: oman
- name: experiment
type: IodaObsSpace
filenames:
- ${data_input_path}/exp_amsua_n19.20230726T090000Z.nc4
channels: 3,8
groups:
- name: ObsValue
variables:
- brightnessTemperature
- name: GsiHofXBc
- name: hofx0
- name: MetaData
- name: oman
transforms:
- transform: arithmetic
new name: control::ObsValueMinusHofx::${variable}
equals: control::ObsValue::${variable}-control::hofx0::${variable}
for:
variable: &id001
- brightnessTemperature
- transform: arithmetic
new name: experiment::ObsValueMinusHofx::${variable}
equals: experiment::ObsValue::${variable}-experiment::hofx0::${variable}
for:
variable: *id001
time_series:
- begin_date: '2023-07-26T03:00:00'
final_date: '2023-07-26T09:00:00'
interval: PT6H
collection: experiment
variables:
- ObsValueMinusHofx::brightnessTemperature
aggregation_methods:
- mean
dimension: Location
- begin_date: '2023-07-26T03:00:00'
final_date: '2023-07-26T09:00:00'
interval: PT6H
collection: control
variables:
- ObsValueMinusHofx::brightnessTemperature
aggregation_methods:
- mean
dimension: Location
graphics:
plotting_backend: Emcpy
figure_list:
- figure:
layout:
- 1
- 1
title: Mean OmB | AMSU-A NOAA-19 | Ch 3 | ObsValueMinusHofx::brightnessTemperature
output name: time_series/amsua_n19/brightnessTemperature_mean/3/time_series_compare_omb.png
plots:
- add_xlabel: Datetime
add_ylabel: JEDI h(x)
add_grid: null
add_legend:
loc: upper left
layers:
- type: LinePlot
x:
variable: control_time_series::MetaData::Dates
y:
variable: control_time_series::ObsValueMinusHofx::brightnessTemperature_mean
channel: 3
markersize: 5
color: black
label: Observation minus h(x) - ctrl
- type: LinePlot
x:
variable: experiment_time_series::MetaData::Dates
y:
variable: experiment_time_series::ObsValueMinusHofx::brightnessTemperature_mean
channel: 3
markersize: 5
color: blue
label: Observation minus h(x) - exp
Loading

0 comments on commit 767125f

Please sign in to comment.