From 175092f6ba2de5939f72165addbe2ad079b0d5fe Mon Sep 17 00:00:00 2001 From: hjabbot <42661345+hjabbot@users.noreply.github.com> Date: Tue, 3 Sep 2024 10:04:56 +0100 Subject: [PATCH 1/8] Generalising splitting conditions between curl and dmag, deprecating Reynolds --- .../dataloaders/vector/abstract_vector.py | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/meshiphi/dataloaders/vector/abstract_vector.py b/meshiphi/dataloaders/vector/abstract_vector.py index ed89a642..29c3becd 100644 --- a/meshiphi/dataloaders/vector/abstract_vector.py +++ b/meshiphi/dataloaders/vector/abstract_vector.py @@ -558,20 +558,27 @@ def get_hom_condition(self, bounds, splitting_conds, agg_type='MEAN', data=None) # To allow multiple modes of splitting, chuck them in the splitting conditions # Split if magnitude of curl(data) is larger than threshold if 'curl' in splitting_conds: - curl = self.calc_curl(bounds) - if np.abs(curl) > splitting_conds['curl']: - hom_type = 'HET' + flow = self.calc_curl(bounds, collapse=False) + sc = splitting_conds['curl'] # Split if max magnitude(any_vector - ave_vector) is larger than threshold - if 'dmag' in splitting_conds: - dmag = self.calc_dmag(bounds) - if np.abs(dmag) > splitting_conds['dmag']: - hom_type = 'HET' + elif 'dmag' in splitting_conds: + flow = self.calc_dmag(bounds, collapse=False) + sc = splitting_conds['dmag'] + + if isinstance(flow, type(np.nan)) and np.isnan(flow): + return "CLR" + num_over_threshold = (flow > sc['threshold']).sum() + frac_over_threshold = num_over_threshold / flow.size + + if frac_over_threshold <= sc['lower_bound']: + hom_type = "CLR" + elif frac_over_threshold >= sc['upper_bound']: + if splitting_conds['split_lock'] == True: + hom_type = "HOM" + else: + hom_type = "CLR" + else: hom_type = "HET" - # Split if Reynolds number is larger than threshold - if 'reynolds' in splitting_conds: - reynolds = self.calc_reynolds_number(bounds) - if reynolds > splitting_conds['reynolds']: - hom_type = 'HET' logging.debug(f"\thom_condition for attribute: '{self.data_name}' in bounds:'{bounds}' returned '{hom_type}'") From 278c73e83d4dfcb29b8151c08e3104f2c1e7012c Mon Sep 17 00:00:00 2001 From: hjabbot <42661345+hjabbot@users.noreply.github.com> Date: Tue, 3 Sep 2024 10:06:44 +0100 Subject: [PATCH 2/8] Fixing crash on aggregation when nan returned --- meshiphi/dataloaders/vector/abstract_vector.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/meshiphi/dataloaders/vector/abstract_vector.py b/meshiphi/dataloaders/vector/abstract_vector.py index 29c3becd..c0707478 100644 --- a/meshiphi/dataloaders/vector/abstract_vector.py +++ b/meshiphi/dataloaders/vector/abstract_vector.py @@ -422,10 +422,16 @@ def get_value_from_df(dps, variable_names, bounds, agg_type, skipna): values = [data_count, data_count] elif agg_type == 'MIN': index = dps['_magnitude'].idxmin(skipna=skipna) - values = [dps[name][index] for name in variable_names] + if ~np.isnan(index): + values = [dps[name][index] for name in variable_names] + else: + values = [np.nan for name in variable_names] elif agg_type == 'MAX': index = dps['_magnitude'].idxmax(skipna=skipna) - values = [dps[name][index] for name in variable_names] + if ~np.isnan(index): + values = [dps[name][index] for name in variable_names] + else: + values = [np.nan for name in variable_names] elif agg_type == 'MEAN': values = [dps[name].mean(skipna=skipna) for name in variable_names] elif agg_type == 'STD': From 6ef536f6f5c22b04ae22658be3a559a89c4aebf1 Mon Sep 17 00:00:00 2001 From: hjabbot <42661345+hjabbot@users.noreply.github.com> Date: Thu, 5 Sep 2024 13:00:33 +0100 Subject: [PATCH 3/8] Increment version number --- meshiphi/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meshiphi/__init__.py b/meshiphi/__init__.py index de93938d..13c67c61 100644 --- a/meshiphi/__init__.py +++ b/meshiphi/__init__.py @@ -1,4 +1,4 @@ -__version__ = "2.1.12" +__version__ = "2.1.13" __description__ = "MeshiPhi: Earth's digital twin mapped on a non-uniform mesh" __license__ = "MIT" __author__ = "Autonomous Marine Operations Planning (AMOP) Team, AI Lab, British Antarctic Survey" From 57f8f8f35481a270ce2e5f661ec976bdaa2f3873 Mon Sep 17 00:00:00 2001 From: hjabbot <42661345+hjabbot@users.noreply.github.com> Date: Thu, 5 Sep 2024 13:01:20 +0100 Subject: [PATCH 4/8] Fixed bug where NaN values were being included in the calculation of frac_over_threshold --- meshiphi/dataloaders/scalar/abstract_scalar.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/meshiphi/dataloaders/scalar/abstract_scalar.py b/meshiphi/dataloaders/scalar/abstract_scalar.py index 6f51f5ed..57f065c7 100644 --- a/meshiphi/dataloaders/scalar/abstract_scalar.py +++ b/meshiphi/dataloaders/scalar/abstract_scalar.py @@ -560,7 +560,11 @@ def get_hom_condition_from_xr(dps, splitting_conds): else: # Determine fraction of datapoints over threshold value num_over_threshold = np.count_nonzero(dps > splitting_conds['threshold']) - frac_over_threshold = num_over_threshold/dps.size + num_non_nan = np.count_nonzero(~np.isnan(dps)) + if num_non_nan > 0: + frac_over_threshold = num_over_threshold/num_non_nan + else: + frac_over_threshold = 0 # Return homogeneity condition if frac_over_threshold <= splitting_conds['lower_bound']: hom_type = "CLR" elif frac_over_threshold >= splitting_conds['upper_bound']: From 8b95b1a4d3b4249c379f46b7f04c3b00c14e654d Mon Sep 17 00:00:00 2001 From: hjabbot <42661345+hjabbot@users.noreply.github.com> Date: Fri, 6 Sep 2024 09:32:57 +0100 Subject: [PATCH 5/8] Fixed dataframe version of bug where NaNs counted in frac_over_threshold --- meshiphi/dataloaders/scalar/abstract_scalar.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/meshiphi/dataloaders/scalar/abstract_scalar.py b/meshiphi/dataloaders/scalar/abstract_scalar.py index 57f065c7..36cb389c 100644 --- a/meshiphi/dataloaders/scalar/abstract_scalar.py +++ b/meshiphi/dataloaders/scalar/abstract_scalar.py @@ -524,7 +524,11 @@ def get_hom_condition_from_df(dps, splitting_conds): else: # Determine fraction of datapoints over threshold value num_over_threshold = dps[dps > splitting_conds['threshold']] - frac_over_threshold = num_over_threshold.shape[0]/dps.shape[0] + num_non_nan = np.count_nonzero(~np.isnan(dps)) + if num_non_nan > 0: + frac_over_threshold = num_over_threshold.shape[0]/num_non_nan + else: + frac_over_threshold = 0 # Return homogeneity condition if frac_over_threshold <= splitting_conds['lower_bound']: hom_type = "CLR" From 7cb91a6fb72633cf6d6f6a55af90d19fb9e0c6a6 Mon Sep 17 00:00:00 2001 From: hjabbot <42661345+hjabbot@users.noreply.github.com> Date: Thu, 19 Sep 2024 11:43:48 +0100 Subject: [PATCH 6/8] Avoiding counting NaN values in frac_over_threshold for VectorDataLoader --- meshiphi/dataloaders/vector/abstract_vector.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/meshiphi/dataloaders/vector/abstract_vector.py b/meshiphi/dataloaders/vector/abstract_vector.py index c0707478..b90260fe 100644 --- a/meshiphi/dataloaders/vector/abstract_vector.py +++ b/meshiphi/dataloaders/vector/abstract_vector.py @@ -574,7 +574,13 @@ def get_hom_condition(self, bounds, splitting_conds, agg_type='MEAN', data=None) if isinstance(flow, type(np.nan)) and np.isnan(flow): return "CLR" num_over_threshold = (flow > sc['threshold']).sum() - frac_over_threshold = num_over_threshold / flow.size + + num_non_nan = np.count_nonzero(~np.isnan(flow)) + if num_non_nan > 0: + frac_over_threshold = num_over_threshold/num_non_nan + else: + frac_over_threshold = 0 + if frac_over_threshold <= sc['lower_bound']: hom_type = "CLR" From f1c85e8a4eed998e333ab0b73c4c53d596b22c2d Mon Sep 17 00:00:00 2001 From: hjabbot <42661345+hjabbot@users.noreply.github.com> Date: Thu, 19 Sep 2024 11:59:47 +0100 Subject: [PATCH 7/8] Fixing split lock key error in vector dataloader --- meshiphi/dataloaders/vector/abstract_vector.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/meshiphi/dataloaders/vector/abstract_vector.py b/meshiphi/dataloaders/vector/abstract_vector.py index b90260fe..53dbf434 100644 --- a/meshiphi/dataloaders/vector/abstract_vector.py +++ b/meshiphi/dataloaders/vector/abstract_vector.py @@ -571,6 +571,9 @@ def get_hom_condition(self, bounds, splitting_conds, agg_type='MEAN', data=None) flow = self.calc_dmag(bounds, collapse=False) sc = splitting_conds['dmag'] + if 'split_lock' not in sc: + sc['split_lock'] = False + if isinstance(flow, type(np.nan)) and np.isnan(flow): return "CLR" num_over_threshold = (flow > sc['threshold']).sum() @@ -585,7 +588,7 @@ def get_hom_condition(self, bounds, splitting_conds, agg_type='MEAN', data=None) if frac_over_threshold <= sc['lower_bound']: hom_type = "CLR" elif frac_over_threshold >= sc['upper_bound']: - if splitting_conds['split_lock'] == True: + if sc['split_lock'] == True: hom_type = "HOM" else: hom_type = "CLR" From f8982e2072f7343888e38b1099b78172dc2ff465 Mon Sep 17 00:00:00 2001 From: hjabbot <42661345+hjabbot@users.noreply.github.com> Date: Thu, 19 Sep 2024 12:44:05 +0100 Subject: [PATCH 8/8] Deprecating Reynolds and divergence based splitting --- .../dataloaders/vector/abstract_vector.py | 88 +------------------ 1 file changed, 2 insertions(+), 86 deletions(-) diff --git a/meshiphi/dataloaders/vector/abstract_vector.py b/meshiphi/dataloaders/vector/abstract_vector.py index 53dbf434..26cc2220 100644 --- a/meshiphi/dataloaders/vector/abstract_vector.py +++ b/meshiphi/dataloaders/vector/abstract_vector.py @@ -925,89 +925,6 @@ def set_data_col_name_list(self, new_names): self.data_name_list = new_names return self.set_data_col_name(new_data_name) - def calc_reynolds_number(self, bounds): - ''' - Calculates an approximate Reynolds number from the mean vector velocity - and cellbox size. - - CURRENTLY ASSUMES DENSITY AND VISCOSITY OF SEAWATER AT 4°C! - WILL NEED MINOR REWORKING TO INCLUDE DIFFERENT FLUIDS - - Args: - bounds (Boundary): - Cellbox boundary to calculate characteristic length from - - Returns: - float: - Reynolds number of cellbox - ''' - # Extract the speed - velocity = self.get_value(bounds, agg_type='MEAN') - speed = np.linalg.norm(list(velocity.values())) # Calculates magnitude - # Extract the characteristic length - length = bounds.calc_size() - # Calculate the reynolds number and return - logging.warning("\tReynold number used for splitting, this function assumes properties of ocean water!") - return 1028 * 0.00167 * speed * length - - def calc_divergence(self, bounds, data=None, collapse=True, agg_type='MAX'): - ''' - Calculates the divergence of vectors in a cellbox - - Args: - bounds (Boundary): - Cellbox boundary in which all relevant vectors are contained - data (pd.DataFrame or xr.Dataset): - Dataset with 'lat' and 'long' columns/dimensions with vectors - collapes (bool): - Flag determining whether to return an aggregated value, or a - vector field (values for each individual vector). - agg_type (str): - Method of aggregation if collapsing value. - Accepts 'MAX' or 'MEAN' - - Returns: - float or pd.DataFrame: - float value of aggregated div if collapse=True, or - pd.DataFrame of div vector field if collapse=False - - Raises: - ValueError: If agg_type is not 'MAX' or 'MEAN' - ''' - if data is None: dps = self.trim_datapoints(bounds, data=data) - else: dps = data - - # Create a meshgrid of vectors from the data - vector_field = self._create_vector_meshgrid(dps, self.data_name_list) - - # Get component values for each vector - fx, fy = vector_field[:, :, 0], vector_field[:, :, 1] - # If not enough datapoints to compute gradient - if 1 in fx.shape or 1 in fy.shape: - logging.debug('\tUnable to compute gradient across cell for divergence calculation') - div = np.nan - else: - # Compute partial derivatives - dfx_dy = np.gradient(fx, axis=1) - dfy_dx = np.gradient(fy, axis=0) - # Compute curl - div = dfy_dx + dfx_dy - - # If div is nan - if np.isnan(div).all(): - logging.debug('\tAll NaN cellbox encountered') - return np.nan - # If want to collapse to max mag value, return scalar - elif collapse: - if agg_type == 'MAX': return max(np.nanmax(div), np.nanmin(div), key=abs) - elif agg_type == 'MEAN': return np.nanmean(div) - else: - raise ValueError(f"agg_type '{agg_type}' not understood! Requires 'MAX' or 'MEAN'") - # Else return field - else: - return div - - def calc_curl(self, bounds, data=None, collapse=True, agg_type='MAX'): ''' Calculates the curl of vectors in a cellbox @@ -1049,7 +966,7 @@ def calc_curl(self, bounds, data=None, collapse=True, agg_type='MAX'): # Compute curl curl = dfy_dx - dfx_dy - # If div is nan + # If curl is nan if np.isnan(curl).all(): logging.debug('\tAll NaN cellbox encountered') return np.nan @@ -1103,7 +1020,7 @@ def calc_dmag(self, bounds, data=None, collapse=True, agg_type='MEAN'): if len(d_mag) == 0: logging.debug('\tEmpty cellbox encountered') return np.nan - # If div is nan + # If d_mag is nan elif np.isnan(d_mag).all(): logging.debug('\tAll NaN cellbox encountered') return np.nan @@ -1116,7 +1033,6 @@ def calc_dmag(self, bounds, data=None, collapse=True, agg_type='MEAN'): # Else return field else: return d_mag - @staticmethod def _create_vector_meshgrid(data, data_name_list): '''