Merge pull request #25 from rileyhales/improve_validation

rileyhales authored Dec 1, 2021
2 parents fc050cb + 1fdcbfc commit 52b1497
Showing 12 changed files with 412 additions and 122 deletions.
31 changes: 29 additions & 2 deletions examples/colombia-magdalena/magdalena_example.py
@@ -1,10 +1,16 @@
import os

import numpy as np

import rbc


np.seterr(all="ignore")

workdir = '/Users/rchales/data/regional-bias-correction/colombia-magdalena'
drain_shape = os.path.join(workdir, 'gis_inputs', 'magdalena_dl_attrname_xy.json')
gauge_shape = os.path.join(workdir, 'gis_inputs', 'ideam_stations.json')
obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs')

# Only need to do this step 1x ever
# rbc.prep.scaffold_working_directory(workdir)
@@ -16,7 +22,7 @@
# assign_table = rbc.table.gen(workdir)
# rbc.table.cache(workdir, assign_table)
# Or read the existing table
assign_table = rbc.table.read(workdir)
# assign_table = rbc.table.read(workdir)

# Prepare the observation and simulation data
# Only need to do this step 1x ever
@@ -43,4 +49,25 @@
# rbc.gis.clip_by_unassigned(workdir, assign_table, drain_shape)

# Compute the corrected simulation data
rbc.calibrate_region(workdir, assign_table)
# assign_table = rbc.table.read(workdir)
# rbc.calibrate_region(workdir, assign_table)
# vtab = rbc.validate.gen_val_table(workdir)
rbc.gis.validation_maps(workdir, gauge_shape)
rbc.analysis.plot(workdir, obs_data_dir, 9007721)


# import pandas as pd
# path_to_your_pickle_file = '/Users/rchales/data/regional-bias-correction/colombia-magdalena/validation_runs/90/data_processed/subset_time_series.pickle'
# a = pd.read_pickle(path_to_your_pickle_file)
# a.index = a.index.tz_localize('UTC')
# a.to_pickle(path_to_your_pickle_file)


# import netCDF4 as nc
# import numpy as np
# path = '/Users/rchales/data/regional-bias-correction/colombia-magdalena/validation_runs/90/calibrated_simulated_flow.nc'
# a = nc.Dataset(path, 'r+')
# a.createVariable('corrected', 'i4', ('corrected',), zlib=True, shuffle=True, fill_value=-1)
# a['corrected'][:] = np.array((0, 1))
# a.sync()
# a.close()
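
The scratch blocks above patch older validation outputs in place: one localizes the pickled time series index to UTC, the other adds the 'corrected' coordinate variable to an existing netCDF. A minimal companion sketch for confirming the netCDF patch, assuming the dimension and variable names that rbc.calibrate_region writes; the path is a placeholder:

import netCDF4 as nc

# hedged check that the patch above took effect; the path is a placeholder
path = '/path/to/validation_runs/90/calibrated_simulated_flow.nc'
with nc.Dataset(path, 'r') as ds:
    print(list(ds.dimensions))  # expect 'time', 'model_id', 'corrected'
    print(ds['corrected'][:])   # expect [0 1] once the patch has run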
16 changes: 16 additions & 0 deletions examples/example_inputs.py
@@ -0,0 +1,16 @@
import os


# COLOMBIA
workdir = '/Users/rchales/data/regional-bias-correction/colombia-magdalena'
drain_shape = os.path.join(workdir, 'gis_inputs', 'magdalena_dl_attrname_xy.json')
gauge_shape = os.path.join(workdir, 'gis_inputs', 'ideam_stations.json')
obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs')
hist_sim_nc = os.path.join(workdir, 'data_inputs', 'south_america_era5_qout.nc')

# TEXAS
workdir = '/Users/rchales/data/regional-bias-correction/texas'
drain_shape = os.path.join(workdir, 'shapefiles', 'texas-dl.json')
gauge_shape = os.path.join(workdir, 'shapefiles', 'texas-gauges.shp')
obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs')
hist_sim_nc = os.path.join(workdir, 'data_inputs', 'Qout_era5_t640_24hr_19790101to20210630.nc')
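
Both blocks bind the same variable names, so whichever region appears last (here TEXAS) is the one in effect once the module finishes; comment out one block to switch regions. A hedged sketch of a driver script that consumes these inputs, assuming example_inputs.py is on the Python path:

# hypothetical driver: picks up whichever region's paths were defined last
from example_inputs import workdir

import rbc

# one-time preparation steps, as in the example scripts
rbc.prep.historical_simulation(workdir)
rbc.prep.observed_data(workdir)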
20 changes: 15 additions & 5 deletions examples/example_script.py
@@ -7,9 +7,11 @@

np.seterr(all="ignore")

workdir = '/Users/rchales/data/regional-bias-correction/colombia-magdalena'
drain_shape = os.path.join(workdir, 'gis_inputs', 'magdalena_dl_attrname_xy.json')
obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs')
workdir = ''
drain_shape = os.path.join(workdir, 'gis_inputs', '')
gauge_shape = ''
obs_data_dir = ''
hist_sim_nc = ''

# Prepare the working directory - only need to do this step 1x ever
# rbc.prep.scaffold_working_directory(workdir)
@@ -19,19 +21,22 @@
# Put the observed data csv files in the data_inputs/obs_csvs folder

# Prepare the observation and simulation data - Only need to do this step 1x ever
print('Preparing data')
rbc.prep.historical_simulation(workdir)
rbc.prep.observed_data(workdir)

# Generate the assignments table
print('Generate Assignment Table')
assign_table = rbc.table.gen(workdir)
rbc.table.cache(workdir, assign_table)

# Generate the clusters using the historical simulation data
print('Generate Clusters')
rbc.cluster.generate(workdir)
assign_table = rbc.cluster.summarize(workdir, assign_table)
rbc.table.cache(workdir, assign_table)

# Assign basins which are gauged and propagate those gauges
print('Making Assignments')
assign_table = rbc.assign.gauged(assign_table)
assign_table = rbc.assign.propagation(assign_table)
assign_table = rbc.assign.clusters_by_dist(assign_table)
@@ -40,13 +45,18 @@
rbc.table.cache(workdir, assign_table)

# Generate GIS files so you can go explore your progress graphically
print('Generate GIS files')
rbc.gis.clip_by_assignment(workdir, assign_table, drain_shape)
rbc.gis.clip_by_cluster(workdir, assign_table, drain_shape)
rbc.gis.clip_by_unassigned(workdir, assign_table, drain_shape)

# Compute the corrected simulation data
print('Starting Calibration')
rbc.calibrate_region(workdir, assign_table)

# run the validation study
rbc.validate.sample_gauges(workdir)
print('Performing Validation')
rbc.validate.sample_gauges(workdir, overwrite=True)
rbc.validate.run_series(workdir, drain_shape, obs_data_dir)
vtab = rbc.validate.gen_val_table(workdir)
rbc.gis.validation_maps(workdir, gauge_shape, vtab)
3 changes: 2 additions & 1 deletion rbc/__init__.py
@@ -8,11 +8,12 @@
import rbc.gis
import rbc.utils
import rbc.validate
import rbc.analysis


__all__ = ['prep_region', 'analyze_region',
'calibrate_stream', 'calibrate_region',
'table', 'prep', 'assign', 'gis', 'cluster', 'utils', 'validate']
'table', 'prep', 'assign', 'gis', 'cluster', 'utils', 'validate', 'analysis']
__author__ = 'Riley Hales'
__version__ = '0.1.0'
__license__ = 'BSD 3 Clause Clear'
122 changes: 101 additions & 21 deletions rbc/_calibrate.py
@@ -1,7 +1,9 @@
import datetime
import json
import os
import statistics
import warnings

import hydrostats as hs
import hydrostats.data as hd
import netCDF4 as nc
import numpy as np
import pandas as pd
@@ -11,6 +13,10 @@
from ._vocab import mid_col
from ._vocab import asgn_mid_col
from ._vocab import asgn_gid_col
from ._vocab import metric_list
from ._vocab import metric_nc_name_list
from ._vocab import sim_ts_pickle
from ._vocab import cal_nc_name


def calibrate_stream(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd.DataFrame,
@@ -133,68 +139,142 @@ def calibrate_stream(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flo
columns=('flow', 'scalars', 'percentile'))


def calibrate_region(workdir: str, assign_table: pd.DataFrame, obs_data_dir: str = None):
def calibrate_region(workdir: str, assign_table: pd.DataFrame,
gauge_table: pd.DataFrame = None, obs_data_dir: str = None) -> None:
"""
    Creates a netCDF of the bias corrected simulated flows and validation metrics for the region
Args:
workdir: path to project working directory
assign_table: the assign_table dataframe
        gauge_table: the gauge table dataframe (read from gis_inputs/gauge_table.csv when not provided)
        obs_data_dir: path to the directory of observed data csv files
Returns:
None
"""
if gauge_table is None:
gauge_table = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'gauge_table.csv'))
if obs_data_dir is None:
obs_data_dir = os.path.join(workdir, 'data_observed', 'obs_csvs')
bcs_nc_path = os.path.join(workdir, 'calibrated_simulated_flow.nc')
ts = pd.read_pickle(os.path.join(workdir, 'data_processed', 'subset_time_series.pickle'))
obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs')

t_size = ts.values.shape[0]
m_size = ts.values.shape[1]
bcs_nc_path = os.path.join(workdir, cal_nc_name)
ts = pd.read_pickle(os.path.join(workdir, 'data_processed', sim_ts_pickle))

# create the new netcdf
bcs_nc = nc.Dataset(bcs_nc_path, 'w')

# set up the dimensions
t_size = ts.values.shape[0]
m_size = ts.values.shape[1]
c_size = 2
bcs_nc.createDimension('time', t_size)
bcs_nc.createDimension('model_id', m_size)
bcs_nc.createDimension('corrected', c_size)

bcs_nc.createVariable('time', 'f4', ('time',), zlib=True, shuffle=True, fill_value=np.nan)
bcs_nc.createVariable('model_id', 'f4', ('model_id',), zlib=True, shuffle=True, fill_value=np.nan)
# coordinate variables
bcs_nc.createVariable('time', 'i', ('time',), zlib=True, shuffle=True, fill_value=-9999)
bcs_nc.createVariable('model_id', 'i', ('model_id',), zlib=True, shuffle=True, fill_value=-9999)
bcs_nc.createVariable('corrected', 'i', ('corrected',), zlib=True, shuffle=True, fill_value=-1)

# other variables
bcs_nc.createVariable('flow_sim', 'f4', ('time', 'model_id'), zlib=True, shuffle=True, fill_value=np.nan)
bcs_nc.createVariable('flow_bc', 'f4', ('time', 'model_id'), zlib=True, shuffle=True, fill_value=np.nan)
bcs_nc.createVariable('percentiles', 'f4', ('time', 'model_id'), zlib=True, shuffle=True, fill_value=np.nan)
bcs_nc.createVariable('scalars', 'f4', ('time', 'model_id'), zlib=True, shuffle=True, fill_value=np.nan)

bcs_nc['time'][:] = ts.index.values
bcs_nc['model_id'][:] = assign_table['model_id'].values.flatten()
for metric in metric_nc_name_list:
bcs_nc.createVariable(metric, 'f4', ('model_id', 'corrected'), zlib=True, shuffle=True, fill_value=np.nan)

# times from the datetime index
times = ts.index.values.astype(datetime.datetime)
# convert nanoseconds to milliseconds
times = times / 1000000
# convert to dates
times = nc.num2date(times, 'milliseconds since 1970-01-01 00:00:00', calendar='standard')
# convert to a simpler unit
times = nc.date2num(times, 'days since 1970-01-01 00:00:00', calendar='standard')
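    # worked example of the conversion above: 2021-01-01 is
    # 1_609_459_200_000_000_000 ns since the epoch -> divide by 1e6 for
    # 1_609_459_200_000 ms -> num2date -> datetime(2021, 1, 1)
    # -> date2num -> 18628.0 days since 1970-01-01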

# fill the values we already know
    bcs_nc['time'].units = 'days since 1970-01-01 00:00:00+00'
bcs_nc['time'][:] = times
bcs_nc['model_id'][:] = ts.columns.to_list()
bcs_nc['corrected'][:] = np.array((0, 1))
bcs_nc['flow_sim'][:] = ts.values

bcs_nc.sync()

# set up arrays to compute the corrected, percentile and scalar arrays
c_array = np.array([np.nan] * t_size * m_size).reshape((t_size, m_size))
p_array = np.array([np.nan] * t_size * m_size).reshape((t_size, m_size))
s_array = np.array([np.nan] * t_size * m_size).reshape((t_size, m_size))

computed_metrics = {}
for metric in metric_list:
computed_metrics[metric] = np.array([np.nan] * m_size * c_size).reshape(m_size, c_size)

    # error tallies: g1 = bad/missing assignment ids, g2 = failed observed-data
    # read, g3 = failed calibration, g4 = failed metric computation
    errors = {'g1': 0, 'g2': 0, 'g3': 0, 'g4': 0}
for idx, triple in enumerate(assign_table[[mid_col, asgn_mid_col, asgn_gid_col]].values):
try:
print(f'{idx}/{m_size}')
if idx % 25 == 0:
print(f'\n\t\t{idx + 1}/{m_size}')

model_id, asgn_mid, asgn_gid = triple
if np.isnan(asgn_gid) or np.isnan(asgn_mid):
warnings.warn(f'unable to correct model id: {model_id}')
continue
model_id = int(model_id)
asgn_mid = int(asgn_mid)
asgn_gid = int(asgn_gid)
except Exception as e:
errors['g1'] += 1
continue

        # todo: if asgn_mid == model_id, use the point calibration method from the geoglows package
try:
obs_df = pd.read_csv(os.path.join(obs_data_dir, f'{asgn_gid}.csv'), index_col=0, parse_dates=True)
obs_df = obs_df.dropna()
except Exception as e:
print('failed to read the observed data')
print(e)
errors['g2'] += 1
continue

calibrated_df = calibrate_stream(
ts[[asgn_mid]],
pd.read_csv(os.path.join(obs_data_dir, f'{asgn_gid}.csv'), index_col=0,
parse_dates=True),
ts[[model_id]],
)
try:
            calibrated_df = calibrate_stream(ts[[asgn_mid]], obs_df, ts[[model_id]])
c_array[:, idx] = calibrated_df['flow'].values
p_array[:, idx] = calibrated_df['percentile'].values
s_array[:, idx] = calibrated_df['scalars'].values
except Exception as e:
print('failed during the calibration step')
print(e)
errors['g3'] += 1
continue

try:
if model_id in gauge_table['model_id'].values:
correct_id = gauge_table.loc[gauge_table['model_id'] == model_id, 'gauge_id'].values[0]
obs_df = pd.read_csv(os.path.join(obs_data_dir, f'{correct_id}.csv'), index_col=0, parse_dates=True)
sim_obs_stats = hs.make_table(hd.merge_data(sim_df=ts[[model_id]], obs_df=obs_df), metric_list)
bcs_obs_stats = hs.make_table(hd.merge_data(sim_df=calibrated_df[['flow']], obs_df=obs_df), metric_list)
for metric in metric_list:
computed_metrics[metric][idx, :] = \
float(sim_obs_stats[metric].values[0]), float(bcs_obs_stats[metric].values[0])

except Exception as e:
print('failed during collecting stats')
print(e)
errors['g4'] += 1
continue

bcs_nc['flow_bc'][:] = c_array
bcs_nc['percentiles'][:] = p_array
bcs_nc['scalars'][:] = s_array
for idx, metric in enumerate(metric_list):
bcs_nc[metric_nc_name_list[idx]][:] = computed_metrics[metric]

bcs_nc.sync()
bcs_nc.close()

print(errors)
with open(os.path.join(workdir, 'calibration_errors.json'), 'w') as f:
        json.dump(errors, f)

return
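
A hedged sketch of reading the per-reach metrics back out of the file this function writes, assuming the variable names from metric_nc_name_list and the two-column corrected dimension (column 0 holds the uncorrected scores, column 1 the bias corrected scores, matching the assignment order above):

import netCDF4 as nc
import numpy as np

# placeholder path to the netCDF produced by calibrate_region
with nc.Dataset('/path/to/calibrated_simulated_flow.nc', 'r') as ds:
    kge = np.asarray(ds['KGE2012'][:])  # shape (model_id, corrected)
    n_reaches = ds.dimensions['model_id'].size
    # count reaches where the corrected KGE beats the uncorrected KGE
    improved = int(np.sum(kge[:, 1] > kge[:, 0]))
    print(f'{improved} of {n_reaches} reaches improved after correction')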
9 changes: 8 additions & 1 deletion rbc/_vocab.py
@@ -1,11 +1,18 @@
# assign table and gis_input file required column names
mid_col = 'model_id'
gid_col = 'gauge_id'
asgn_mid_col = 'assigned_model_id'
asgn_gid_col = 'assigned_gauge_id'
down_mid_col = 'downstream_model_id'

reason_col = 'reason'
area_col = 'drain_area'
order_col = 'stream_order'

# name of some of the files produced by the algorithm
cluster_count_file = 'best-fit-cluster-count.json'
cal_nc_name = 'calibrated_simulated_flow.nc'
sim_ts_pickle = 'sim_time_series.pickle'

# metrics computed on validation sets
metric_list = ["ME", 'MAE', "RMSE", "NRMSE (Mean)", "MAPE", "NSE", "KGE (2012)"]
metric_nc_name_list = ['ME', 'MAE', 'RMSE', 'NRMSE', 'MAPE', 'NSE', 'KGE2012']
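
The two lists are paired positionally: each hydrostats label in metric_list maps to the sanitized name at the same index in metric_nc_name_list, since netCDF variable names avoid spaces and parentheses. A small sketch of that pairing:

from rbc._vocab import metric_list, metric_nc_name_list

# positional pairing: hydrostats label -> netCDF variable name
for hs_label, nc_name in zip(metric_list, metric_nc_name_list):
    print(f'{hs_label!r} is stored as {nc_name!r}')
# e.g. 'KGE (2012)' is stored as 'KGE2012'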
11 changes: 6 additions & 5 deletions rbc/_workflow.py
@@ -1,4 +1,6 @@
from .prep import historical_simulation, hist_sim_table, observed_data
import pandas as pd

from .prep import historical_simulation

from .table import gen, cache
from .cluster import generate, summarize
@@ -9,12 +11,11 @@

def prep_region(workdir: str) -> None:
historical_simulation(workdir)
hist_sim_table(workdir)
observed_data(workdir)
# observed_data(workdir)
return


def analyze_region(workdir: str, drain_shape: str, obs_data_dir: str = None) -> None:
def analyze_region(workdir: str, drain_shape: str, gauge_table: pd.DataFrame = None, obs_data_dir: str = None) -> None:
# Generate the assignments table
print("gen assign_table")
assign_table = gen(workdir)
@@ -44,5 +45,5 @@ def analyze_region(workdir: str, drain_shape: str, obs_data_dir: str = None) ->

# Compute the corrected simulation data
print("calibrating region netcdf")
calibrate_region(workdir, assign_table, obs_data_dir)
calibrate_region(workdir, assign_table, gauge_table, obs_data_dir)
return
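
A hedged end-to-end sketch using the two workflow entry points, assuming they are re-exported at the package level as __all__ in rbc/__init__.py suggests; paths are placeholders, and gauge_table and obs_data_dir fall back to the defaults inside calibrate_region when omitted:

import rbc

workdir = '/path/to/working-directory'  # placeholder
drain_shape = '/path/to/drainageline.json'  # placeholder

rbc.prep_region(workdir)  # prepare the historical simulation data
rbc.analyze_region(workdir, drain_shape)  # assign, cluster, make gis files, calibrate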