Merge pull request #25 from rileyhales/improve_validation

rileyhales authored Dec 1, 2021
2 parents fc050cb + 1fdcbfc commit 52b1497
Showing 12 changed files with 412 additions and 122 deletions.
31 changes: 29 additions & 2 deletions examples/colombia-magdalena/magdalena_example.py
@@ -1,10 +1,16 @@
import os

import numpy as np

import rbc


np.seterr(all="ignore")

workdir = '/Users/rchales/data/regional-bias-correction/colombia-magdalena'
drain_shape = os.path.join(workdir, 'gis_inputs', 'magdalena_dl_attrname_xy.json')
gauge_shape = os.path.join(workdir, 'gis_inputs', 'ideam_stations.json')
obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs')

# Only need to do this step 1x ever
# rbc.prep.scaffold_working_directory(workdir)
@@ -16,7 +22,7 @@
# assign_table = rbc.table.gen(workdir)
# rbc.table.cache(workdir, assign_table)
# Or read the existing table
assign_table = rbc.table.read(workdir)
# assign_table = rbc.table.read(workdir)

# Prepare the observation and simulation data
# Only need to do this step 1x ever
@@ -43,4 +49,25 @@
# rbc.gis.clip_by_unassigned(workdir, assign_table, drain_shape)

# Compute the corrected simulation data
rbc.calibrate_region(workdir, assign_table)
# assign_table = rbc.table.read(workdir)
# rbc.calibrate_region(workdir, assign_table)
# vtab = rbc.validate.gen_val_table(workdir)
rbc.gis.validation_maps(workdir, gauge_shape)
rbc.analysis.plot(workdir, obs_data_dir, 9007721)


# import pandas as pd
# path_to_your_pickle_file = '/Users/rchales/data/regional-bias-correction/colombia-magdalena/validation_runs/90/data_processed/subset_time_series.pickle'
# a = pd.read_pickle(path_to_your_pickle_file)
# a.index = a.index.tz_localize('UTC')
# a.to_pickle(path_to_your_pickle_file)


# import netCDF4 as nc
# import numpy as np
# path = '/Users/rchales/data/regional-bias-correction/colombia-magdalena/validation_runs/90/calibrated_simulated_flow.nc'
# a = nc.Dataset(path, 'r+')
# a.createVariable('corrected', 'i4', ('corrected',), zlib=True, shuffle=True, fill_value=-1)
# a['corrected'][:] = np.array((0, 1))
# a.sync()
# a.close()
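
The scratch blocks above patch older validation outputs in place: one localizes the pickled time series index to UTC, the other adds the 'corrected' coordinate variable to an existing netCDF. A minimal companion sketch for confirming the netCDF patch, assuming the dimension and variable names that rbc.calibrate_region writes; the path is a placeholder:

import netCDF4 as nc

# hedged check that the patch above took effect; the path is a placeholder
path = '/path/to/validation_runs/90/calibrated_simulated_flow.nc'
with nc.Dataset(path, 'r') as ds:
    print(list(ds.dimensions))  # expect 'time', 'model_id', 'corrected'
    print(ds['corrected'][:])   # expect [0 1] once the patch has run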
16 changes: 16 additions & 0 deletions examples/example_inputs.py
@@ -0,0 +1,16 @@
import os


# COLOMBIA
workdir = '/Users/rchales/data/regional-bias-correction/colombia-magdalena'
drain_shape = os.path.join(workdir, 'gis_inputs', 'magdalena_dl_attrname_xy.json')
gauge_shape = os.path.join(workdir, 'gis_inputs', 'ideam_stations.json')
obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs')
hist_sim_nc = os.path.join(workdir, 'data_inputs', 'south_america_era5_qout.nc')

# TEXAS
workdir = '/Users/rchales/data/regional-bias-correction/texas'
drain_shape = os.path.join(workdir, 'shapefiles', 'texas-dl.json')
gauge_shape = os.path.join(workdir, 'shapefiles', 'texas-gauges.shp')
obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs')
hist_sim_nc = os.path.join(workdir, 'data_inputs', 'Qout_era5_t640_24hr_19790101to20210630.nc')
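
Both blocks bind the same variable names, so whichever region appears last (here TEXAS) is the one in effect once the module finishes; comment out one block to switch regions. A hedged sketch of a driver script that consumes these inputs, assuming example_inputs.py is on the Python path:

# hypothetical driver: picks up whichever region's paths were defined last
from example_inputs import workdir

import rbc

# one-time preparation steps, as in the example scripts
rbc.prep.historical_simulation(workdir)
rbc.prep.observed_data(workdir)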
20 changes: 15 additions & 5 deletions examples/example_script.py
@@ -7,9 +7,11 @@

np.seterr(all="ignore")

workdir = '/Users/rchales/data/regional-bias-correction/colombia-magdalena'
drain_shape = os.path.join(workdir, 'gis_inputs', 'magdalena_dl_attrname_xy.json')
obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs')
workdir = ''
drain_shape = os.path.join(workdir, 'gis_inputs', '')
gauge_shape = ''
obs_data_dir = ''
hist_sim_nc = ''

# Prepare the working directory - only need to do this step 1x ever
# rbc.prep.scaffold_working_directory(workdir)
@@ -19,19 +21,22 @@
# Put the observed data csv files in the data_inputs/obs_csvs folder

# Prepare the observation and simulation data - Only need to do this step 1x ever
print('Preparing data')
rbc.prep.historical_simulation(workdir)
rbc.prep.observed_data(workdir)

# Generate the assignments table
print('Generate Assignment Table')
assign_table = rbc.table.gen(workdir)
rbc.table.cache(workdir, assign_table)

# Generate the clusters using the historical simulation data
print('Generate Clusters')
rbc.cluster.generate(workdir)
assign_table = rbc.cluster.summarize(workdir, assign_table)
rbc.table.cache(workdir, assign_table)

# Assign basins which are gauged and propagate those gauges
print('Making Assignments')
assign_table = rbc.assign.gauged(assign_table)
assign_table = rbc.assign.propagation(assign_table)
assign_table = rbc.assign.clusters_by_dist(assign_table)
@@ -40,13 +45,18 @@
rbc.table.cache(workdir, assign_table)

# Generate GIS files so you can go explore your progress graphically
print('Generate GIS files')
rbc.gis.clip_by_assignment(workdir, assign_table, drain_shape)
rbc.gis.clip_by_cluster(workdir, assign_table, drain_shape)
rbc.gis.clip_by_unassigned(workdir, assign_table, drain_shape)

# Compute the corrected simulation data
print('Starting Calibration')
rbc.calibrate_region(workdir, assign_table)

# run the validation study
rbc.validate.sample_gauges(workdir)
print('Performing Validation')
rbc.validate.sample_gauges(workdir, overwrite=True)
rbc.validate.run_series(workdir, drain_shape, obs_data_dir)
vtab = rbc.validate.gen_val_table(workdir)
rbc.gis.validation_maps(workdir, gauge_shape, vtab)
3 changes: 2 additions & 1 deletion rbc/__init__.py
@@ -8,11 +8,12 @@
import rbc.gis
import rbc.utils
import rbc.validate
import rbc.analysis


__all__ = ['prep_region', 'analyze_region',
'calibrate_stream', 'calibrate_region',
'table', 'prep', 'assign', 'gis', 'cluster', 'utils', 'validate']
'table', 'prep', 'assign', 'gis', 'cluster', 'utils', 'validate', 'analysis']
__author__ = 'Riley Hales'
__version__ = '0.1.0'
__license__ = 'BSD 3 Clause Clear'
122 changes: 101 additions & 21 deletions rbc/_calibrate.py
@@ -1,7 +1,9 @@
import datetime
import json
import os
import statistics
import warnings

import hydrostats as hs
import hydrostats.data as hd
import netCDF4 as nc
import numpy as np
import pandas as pd
@@ -11,6 +13,10 @@
from ._vocab import mid_col
from ._vocab import asgn_mid_col
from ._vocab import asgn_gid_col
from ._vocab import metric_list
from ._vocab import metric_nc_name_list
from ._vocab import sim_ts_pickle
from ._vocab import cal_nc_name


def calibrate_stream(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd.DataFrame,
@@ -133,68 +139,142 @@ def calibrate_stream(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flo
columns=('flow', 'scalars', 'percentile'))


def calibrate_region(workdir: str, assign_table: pd.DataFrame, obs_data_dir: str = None):
def calibrate_region(workdir: str, assign_table: pd.DataFrame,
gauge_table: pd.DataFrame = None, obs_data_dir: str = None) -> None:
"""
    Creates a netCDF of the bias corrected simulated flows and validation metrics for the region
Args:
workdir: path to project working directory
assign_table: the assign_table dataframe
        gauge_table: the gauge table dataframe (read from gis_inputs/gauge_table.csv when not provided)
        obs_data_dir: path to the directory of observed data csv files
Returns:
None
"""
if gauge_table is None:
gauge_table = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'gauge_table.csv'))
if obs_data_dir is None:
obs_data_dir = os.path.join(workdir, 'data_observed', 'obs_csvs')
bcs_nc_path = os.path.join(workdir, 'calibrated_simulated_flow.nc')
ts = pd.read_pickle(os.path.join(workdir, 'data_processed', 'subset_time_series.pickle'))
obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs')

t_size = ts.values.shape[0]
m_size = ts.values.shape[1]
bcs_nc_path = os.path.join(workdir, cal_nc_name)
ts = pd.read_pickle(os.path.join(workdir, 'data_processed', sim_ts_pickle))

# create the new netcdf
bcs_nc = nc.Dataset(bcs_nc_path, 'w')

# set up the dimensions
t_size = ts.values.shape[0]
m_size = ts.values.shape[1]
c_size = 2
bcs_nc.createDimension('time', t_size)
bcs_nc.createDimension('model_id', m_size)
bcs_nc.createDimension('corrected', c_size)

bcs_nc.createVariable('time', 'f4', ('time',), zlib=True, shuffle=True, fill_value=np.nan)
bcs_nc.createVariable('model_id', 'f4', ('model_id',), zlib=True, shuffle=True, fill_value=np.nan)
# coordinate variables
bcs_nc.createVariable('time', 'i', ('time',), zlib=True, shuffle=True, fill_value=-9999)
bcs_nc.createVariable('model_id', 'i', ('model_id',), zlib=True, shuffle=True, fill_value=-9999)
bcs_nc.createVariable('corrected', 'i', ('corrected',), zlib=True, shuffle=True, fill_value=-1)

# other variables
bcs_nc.createVariable('flow_sim', 'f4', ('time', 'model_id'), zlib=True, shuffle=True, fill_value=np.nan)
bcs_nc.createVariable('flow_bc', 'f4', ('time', 'model_id'), zlib=True, shuffle=True, fill_value=np.nan)
bcs_nc.createVariable('percentiles', 'f4', ('time', 'model_id'), zlib=True, shuffle=True, fill_value=np.nan)
bcs_nc.createVariable('scalars', 'f4', ('time', 'model_id'), zlib=True, shuffle=True, fill_value=np.nan)

bcs_nc['time'][:] = ts.index.values
bcs_nc['model_id'][:] = assign_table['model_id'].values.flatten()
for metric in metric_nc_name_list:
bcs_nc.createVariable(metric, 'f4', ('model_id', 'corrected'), zlib=True, shuffle=True, fill_value=np.nan)

# times from the datetime index
times = ts.index.values.astype(datetime.datetime)
# convert nanoseconds to milliseconds
times = times / 1000000
# convert to dates
times = nc.num2date(times, 'milliseconds since 1970-01-01 00:00:00', calendar='standard')
# convert to a simpler unit
times = nc.date2num(times, 'days since 1970-01-01 00:00:00', calendar='standard')
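    # worked example of the conversion above: 2021-01-01 is
    # 1_609_459_200_000_000_000 ns since the epoch -> divide by 1e6 for
    # 1_609_459_200_000 ms -> num2date -> datetime(2021, 1, 1)
    # -> date2num -> 18628.0 days since 1970-01-01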

# fill the values we already know
    bcs_nc['time'].units = 'days since 1970-01-01 00:00:00+00'
bcs_nc['time'][:] = times
bcs_nc['model_id'][:] = ts.columns.to_list()
bcs_nc['corrected'][:] = np.array((0, 1))
bcs_nc['flow_sim'][:] = ts.values

bcs_nc.sync()

# set up arrays to compute the corrected, percentile and scalar arrays
c_array = np.array([np.nan] * t_size * m_size).reshape((t_size, m_size))
p_array = np.array([np.nan] * t_size * m_size).reshape((t_size, m_size))
s_array = np.array([np.nan] * t_size * m_size).reshape((t_size, m_size))

computed_metrics = {}
for metric in metric_list:
computed_metrics[metric] = np.array([np.nan] * m_size * c_size).reshape(m_size, c_size)

    # error tallies: g1 = bad/missing assignment ids, g2 = failed observed-data
    # read, g3 = failed calibration, g4 = failed metric computation
    errors = {'g1': 0, 'g2': 0, 'g3': 0, 'g4': 0}
for idx, triple in enumerate(assign_table[[mid_col, asgn_mid_col, asgn_gid_col]].values):
try:
print(f'{idx}/{m_size}')
if idx % 25 == 0:
print(f'\n\t\t{idx + 1}/{m_size}')

model_id, asgn_mid, asgn_gid = triple
if np.isnan(asgn_gid) or np.isnan(asgn_mid):
warnings.warn(f'unable to correct model id: {model_id}')
continue
model_id = int(model_id)
asgn_mid = int(asgn_mid)
asgn_gid = int(asgn_gid)
except Exception as e:
errors['g1'] += 1
continue

        # todo: if asgn_mid == model_id, use the point calibration method from the geoglows package
try:
obs_df = pd.read_csv(os.path.join(obs_data_dir, f'{asgn_gid}.csv'), index_col=0, parse_dates=True)
obs_df = obs_df.dropna()
except Exception as e:
print('failed to read the observed data')
print(e)
errors['g2'] += 1
continue

calibrated_df = calibrate_stream(
ts[[asgn_mid]],
pd.read_csv(os.path.join(obs_data_dir, f'{asgn_gid}.csv'), index_col=0,
parse_dates=True),
ts[[model_id]],
)
try:
            calibrated_df = calibrate_stream(ts[[asgn_mid]], obs_df, ts[[model_id]])
c_array[:, idx] = calibrated_df['flow'].values
p_array[:, idx] = calibrated_df['percentile'].values
s_array[:, idx] = calibrated_df['scalars'].values
except Exception as e:
print('failed during the calibration step')
print(e)
errors['g3'] += 1
continue

try:
if model_id in gauge_table['model_id'].values:
correct_id = gauge_table.loc[gauge_table['model_id'] == model_id, 'gauge_id'].values[0]
obs_df = pd.read_csv(os.path.join(obs_data_dir, f'{correct_id}.csv'), index_col=0, parse_dates=True)
sim_obs_stats = hs.make_table(hd.merge_data(sim_df=ts[[model_id]], obs_df=obs_df), metric_list)
bcs_obs_stats = hs.make_table(hd.merge_data(sim_df=calibrated_df[['flow']], obs_df=obs_df), metric_list)
for metric in metric_list:
computed_metrics[metric][idx, :] = \
float(sim_obs_stats[metric].values[0]), float(bcs_obs_stats[metric].values[0])

except Exception as e:
print('failed during collecting stats')
print(e)
errors['g4'] += 1
continue

bcs_nc['flow_bc'][:] = c_array
bcs_nc['percentiles'][:] = p_array
bcs_nc['scalars'][:] = s_array
for idx, metric in enumerate(metric_list):
bcs_nc[metric_nc_name_list[idx]][:] = computed_metrics[metric]

bcs_nc.sync()
bcs_nc.close()

print(errors)
with open(os.path.join(workdir, 'calibration_errors.json'), 'w') as f:
        json.dump(errors, f)

return
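
A hedged sketch of reading the per-reach metrics back out of the file this function writes, assuming the variable names from metric_nc_name_list and the two-column corrected dimension (column 0 holds the uncorrected scores, column 1 the bias corrected scores, matching the assignment order above):

import netCDF4 as nc
import numpy as np

# placeholder path to the netCDF produced by calibrate_region
with nc.Dataset('/path/to/calibrated_simulated_flow.nc', 'r') as ds:
    kge = np.asarray(ds['KGE2012'][:])  # shape (model_id, corrected)
    n_reaches = ds.dimensions['model_id'].size
    # count reaches where the corrected KGE beats the uncorrected KGE
    improved = int(np.sum(kge[:, 1] > kge[:, 0]))
    print(f'{improved} of {n_reaches} reaches improved after correction')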
9 changes: 8 additions & 1 deletion rbc/_vocab.py
@@ -1,11 +1,18 @@
# assign table and gis_input file required column names
mid_col = 'model_id'
gid_col = 'gauge_id'
asgn_mid_col = 'assigned_model_id'
asgn_gid_col = 'assigned_gauge_id'
down_mid_col = 'downstream_model_id'

reason_col = 'reason'
area_col = 'drain_area'
order_col = 'stream_order'

# name of some of the files produced by the algorithm
cluster_count_file = 'best-fit-cluster-count.json'
cal_nc_name = 'calibrated_simulated_flow.nc'
sim_ts_pickle = 'sim_time_series.pickle'

# metrics computed on validation sets
metric_list = ["ME", 'MAE', "RMSE", "NRMSE (Mean)", "MAPE", "NSE", "KGE (2012)"]
metric_nc_name_list = ['ME', 'MAE', 'RMSE', 'NRMSE', 'MAPE', 'NSE', 'KGE2012']
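
The two lists are paired positionally: each hydrostats label in metric_list maps to the sanitized name at the same index in metric_nc_name_list, since netCDF variable names avoid spaces and parentheses. A small sketch of that pairing:

from rbc._vocab import metric_list, metric_nc_name_list

# positional pairing: hydrostats label -> netCDF variable name
for hs_label, nc_name in zip(metric_list, metric_nc_name_list):
    print(f'{hs_label!r} is stored as {nc_name!r}')
# e.g. 'KGE (2012)' is stored as 'KGE2012'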
11 changes: 6 additions & 5 deletions rbc/_workflow.py
@@ -1,4 +1,6 @@
from .prep import historical_simulation, hist_sim_table, observed_data
import pandas as pd

from .prep import historical_simulation

from .table import gen, cache
from .cluster import generate, summarize
@@ -9,12 +11,11 @@

def prep_region(workdir: str) -> None:
historical_simulation(workdir)
hist_sim_table(workdir)
observed_data(workdir)
# observed_data(workdir)
return


def analyze_region(workdir: str, drain_shape: str, obs_data_dir: str = None) -> None:
def analyze_region(workdir: str, drain_shape: str, gauge_table: pd.DataFrame = None, obs_data_dir: str = None) -> None:
# Generate the assignments table
print("gen assign_table")
assign_table = gen(workdir)
@@ -44,5 +45,5 @@ def analyze_region(workdir: str, drain_shape: str, obs_data_dir: str = None) ->

# Compute the corrected simulation data
print("calibrating region netcdf")
calibrate_region(workdir, assign_table, obs_data_dir)
calibrate_region(workdir, assign_table, gauge_table, obs_data_dir)
return
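
A hedged end-to-end sketch using the two workflow entry points, assuming they are re-exported at the package level as __all__ in rbc/__init__.py suggests; paths are placeholders, and gauge_table and obs_data_dir fall back to the defaults inside calibrate_region when omitted:

import rbc

workdir = '/path/to/working-directory'  # placeholder
drain_shape = '/path/to/drainageline.json'  # placeholder

rbc.prep_region(workdir)  # prepare the historical simulation data
rbc.analyze_region(workdir, drain_shape)  # assign, cluster, make gis files, calibrate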