Skip to content

Commit

Permalink
Merge pull request #318 from rsagroup/small_sample
Browse files Browse the repository at this point in the history
update to include small sample two factor bootstrap correction
  • Loading branch information
JasperVanDenBosch authored May 11, 2023
2 parents e43546c + ddc65bd commit ed75983
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 36 deletions.
26 changes: 18 additions & 8 deletions src/rsatoolbox/inference/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,8 @@ def eval_dual_bootstrap(
/ (matrix.shape[0] - 1)
result = Result(models, evaluations, method=method,
cv_method=cv_method, noise_ceiling=noise_ceil,
variances=variances, dof=dof)
variances=variances, dof=dof, n_rdm=data.n_rdm,
n_pattern=data.n_cond)
return result


Expand Down Expand Up @@ -201,15 +202,17 @@ def eval_fixed(models, data, theta=None, method='cosine'):
noise_ceil = boot_noise_ceiling(
data, method=method, rdm_descriptor='index')
if data.n_rdm > 1:
variances = np.cov(evaluations[0], ddof=1) \
variances = np.cov(evaluations[0], ddof=0) \
/ evaluations.shape[-1]
dof = evaluations.shape[-1] - 1
else:
variances = None
dof = 0
result = Result(models, evaluations, method=method,
cv_method='fixed', noise_ceiling=noise_ceil,
variances=variances, dof=dof)
variances=variances, dof=dof, n_rdm=data.n_rdm,
n_pattern=None)
result.n_pattern = data.n_cond
return result


Expand Down Expand Up @@ -269,7 +272,8 @@ def eval_bootstrap(models, data, theta=None, method='cosine', N=1000,
dof = min(data.n_rdm, data.n_cond) - 1
result = Result(models, evaluations, method=method,
cv_method='bootstrap', noise_ceiling=noise_ceil,
variances=variances, dof=dof)
variances=variances, dof=dof, n_rdm=data.n_rdm,
n_pattern=data.n_cond)
return result


Expand Down Expand Up @@ -329,7 +333,9 @@ def eval_bootstrap_pattern(models, data, theta=None, method='cosine', N=1000,
dof = data.n_cond - 1
result = Result(models, evaluations, method=method,
cv_method='bootstrap_pattern', noise_ceiling=noise_ceil,
variances=variances, dof=dof)
variances=variances, dof=dof, n_rdm=None,
n_pattern=data.n_cond)
result.n_rdm = data.n_rdm
return result


Expand Down Expand Up @@ -378,7 +384,9 @@ def eval_bootstrap_rdm(models, data, theta=None, method='cosine', N=1000,
variances = np.cov(evaluations.T)
result = Result(models, evaluations, method=method,
cv_method='bootstrap_rdm', noise_ceiling=noise_ceil,
variances=variances, dof=dof)
variances=variances, dof=dof, n_rdm=data.n_rdm,
n_pattern=None)
result.n_pattern = data.n_cond
return result


Expand Down Expand Up @@ -590,7 +598,8 @@ def bootstrap_crossval(models, data, method='cosine', fitter=None,
variances = np.cov(np.concatenate([evals_nonan.T, noise_ceil_nonan]))
result = Result(models, evaluations, method=method,
cv_method=cv_method, noise_ceiling=noise_ceil,
variances=variances, dof=dof)
variances=variances, dof=dof, n_rdm=data.n_rdm,
n_pattern=data.n_cond)
return result


Expand Down Expand Up @@ -735,7 +744,8 @@ def eval_dual_bootstrap_random(
variances = np.cov(np.concatenate([evals_nonan.T, noise_ceil_nonan]))
result = Result(models, evaluations, method=method,
cv_method=cv_method, noise_ceiling=noise_ceil,
variances=variances, dof=dof)
variances=variances, dof=dof, n_rdm=data.n_rdm,
n_pattern=data.n_cond)
return result


Expand Down
6 changes: 4 additions & 2 deletions src/rsatoolbox/inference/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class Result:
"""

def __init__(self, models, evaluations, method, cv_method, noise_ceiling,
variances=None, dof=1, fitter=None):
variances=None, dof=1, fitter=None, n_rdm=None, n_pattern=None):
if isinstance(models, rsatoolbox.model.Model):
models = [models]
assert len(models) == evaluations.shape[1], 'evaluations shape does' \
Expand All @@ -55,6 +55,8 @@ def __init__(self, models, evaluations, method, cv_method, noise_ceiling,
self.dof = dof
self.fitter = fitter
self.n_bootstraps = evaluations.shape[0]
self.n_rdm = n_rdm
self.n_pattern = n_pattern
if variances is not None:
# if the variances only refer to the models this should have the
# same number of entries as the models list.
Expand All @@ -63,7 +65,7 @@ def __init__(self, models, evaluations, method, cv_method, noise_ceiling,
else:
nc_included = variances.shape[-1] != len(models)
self.model_var, self.diff_var, self.noise_ceil_var = \
extract_variances(variances, nc_included)
extract_variances(variances, nc_included, n_rdm, n_pattern)
else:
self.model_var = None
self.diff_var = None
Expand Down
115 changes: 89 additions & 26 deletions src/rsatoolbox/util/inference_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
"""
Inference module utilities
"""

from __future__ import annotations
from collections.abc import Iterable
from typing import TYPE_CHECKING, Optional
import warnings
import numpy as np
from scipy import stats
from scipy.stats import rankdata, wilcoxon
Expand All @@ -13,6 +15,8 @@
from rsatoolbox.rdm import RDMs
from .matrix import pairwise_contrast
from .rdm_utils import batch_to_matrices
if TYPE_CHECKING:
from numpy.typing import NDArray


def input_check_model(models, theta=None, fitter=None, N=1):
Expand Down Expand Up @@ -68,7 +72,7 @@ def input_check_model(models, theta=None, fitter=None, N=1):
return models, evaluations, theta, fitter


def pool_rdm(rdms, method='cosine'):
def pool_rdm(rdms, method: str = 'cosine'):
"""pools multiple RDMs into the one with maximal performance under a given
evaluation metric
rdm_descriptors of the generated rdms are empty
Expand Down Expand Up @@ -114,11 +118,11 @@ def pool_rdm(rdms, method='cosine'):
rdm_vec = np.array([_nan_rank_data(v) for v in rdm_vec])
rdm_vec = _nan_mean(rdm_vec)
elif method in ('kendall', 'tau-b'):
Warning('Noise ceiling for tau based on averaged ranks!')
warnings.warn('Noise ceiling for tau based on averaged ranks!')
rdm_vec = np.array([_nan_rank_data(v) for v in rdm_vec])
rdm_vec = _nan_mean(rdm_vec)
elif method == 'tau-a':
Warning('Noise ceiling for tau based on averaged ranks!')
warnings.warn('Noise ceiling for tau based on averaged ranks!')
rdm_vec = np.array([_nan_rank_data(v) for v in rdm_vec])
rdm_vec = _nan_mean(rdm_vec)
else:
Expand All @@ -130,7 +134,7 @@ def pool_rdm(rdms, method='cosine'):
pattern_descriptors=rdms.pattern_descriptors)


def _nan_mean(rdm_vector):
def _nan_mean(rdm_vector: NDArray) -> NDArray:
""" takes the average over a rdm_vector with nans for masked entries
without a warning
Expand All @@ -149,7 +153,7 @@ def _nan_mean(rdm_vector):
return rdm_mean


def _nan_rank_data(rdm_vector):
def _nan_rank_data(rdm_vector: NDArray) -> NDArray:
""" rank_data for vectors with nan entries
Args:
Expand All @@ -166,9 +170,14 @@ def _nan_rank_data(rdm_vector):
return ranks


def all_tests(evaluations, noise_ceil, test_type='t-test',
model_var=None, diff_var=None, noise_ceil_var=None,
dof=1):
def all_tests(
evaluations: NDArray,
noise_ceil: NDArray,
test_type: str = 't-test',
model_var: Optional[NDArray] = None,
diff_var: Optional[NDArray] = None,
noise_ceil_var: Optional[NDArray] = None,
dof: int = 1):
"""wrapper running all tests necessary for the model plot
-> pairwise tests, tests against 0 and against noise ceiling
Expand Down Expand Up @@ -218,7 +227,11 @@ def all_tests(evaluations, noise_ceil, test_type='t-test',
return p_pairwise, p_zero, p_noise


def pair_tests(evaluations, test_type='t-test', diff_var=None, dof=1):
def pair_tests(
evaluations: NDArray,
test_type: str = 't-test',
diff_var: Optional[NDArray] = None,
dof: int = 1):
"""wrapper running pair tests
Args:
Expand Down Expand Up @@ -393,7 +406,7 @@ def bootstrap_pair_tests(evaluations):
proportions = np.zeros((evaluations.shape[1], evaluations.shape[1]))
while len(evaluations.shape) > 2:
evaluations = np.mean(evaluations, axis=-1)
for i_model in range(evaluations.shape[1]-1):
for i_model in range(evaluations.shape[1] - 1):
for j_model in range(i_model + 1, evaluations.shape[1]):
proportions[i_model, j_model] = np.sum(
evaluations[:, i_model] < evaluations[:, j_model]) \
Expand Down Expand Up @@ -499,7 +512,11 @@ def t_test_nc(evaluations, variances, noise_ceil, dof=1):
return p


def extract_variances(variance, nc_included=True):
def extract_variances(
variance,
nc_included: bool = True,
n_rdm: Optional[int] = None,
n_pattern: Optional[int] = None):
""" extracts the variances for the individual model evaluations,
differences between model evaluations and for the comparison to
the noise ceiling
Expand All @@ -516,6 +533,12 @@ def extract_variances(variance, nc_included=True):
to the noise ceiling results
nc_included=False assumes that the noise ceiling is fixed instead.
To get the more accurate estimates that take into account
the number of subjects and/or the numbers of stimuli
can be passed as n_rdm and n_pattern respectively.
This function corrects for all ns that are passed. If you bootstrapped
only one factor only pass the N for that factor!
"""
if variance.ndim == 0:
variance = np.array([variance])
Expand All @@ -532,6 +555,9 @@ def extract_variances(variance, nc_included=True):
model_variances = variance
nc_variances = np.array([variance, variance]).T
diff_variances = np.diag(C @ np.diag(variance) @ C.T)
model_variances = _correct_1d(model_variances, n_pattern, n_rdm)
nc_variances = _correct_1d(nc_variances, n_pattern, n_rdm)
diff_variances = _correct_1d(diff_variances, n_pattern, n_rdm)
elif variance.ndim == 2:
# a single covariance matrix
if nc_included:
Expand All @@ -546,6 +572,9 @@ def extract_variances(variance, nc_included=True):
model_variances = np.diag(variance)
nc_variances = np.array([model_variances, model_variances]).T
diff_variances = np.diag(C @ variance @ C.T)
model_variances = _correct_1d(model_variances, n_pattern, n_rdm)
nc_variances = _correct_1d(nc_variances, n_pattern, n_rdm)
diff_variances = _correct_1d(diff_variances, n_pattern, n_rdm)
elif variance.ndim == 3:
# general transform for multiple covariance matrices
if nc_included:
Expand All @@ -565,12 +594,30 @@ def extract_variances(variance, nc_included=True):
).transpose(1, 2, 0)
diff_variances = np.einsum('ij,kjl,il->ki', C, variance, C)
# dual bootstrap variance estimate from 3 covariance matrices
model_variances = _dual_bootstrap(model_variances)
nc_variances = _dual_bootstrap(nc_variances)
diff_variances = _dual_bootstrap(diff_variances)
model_variances = _dual_bootstrap(model_variances, n_rdm, n_pattern)
nc_variances = _dual_bootstrap(nc_variances, n_rdm, n_pattern)
diff_variances = _dual_bootstrap(diff_variances, n_rdm, n_pattern)
return model_variances, diff_variances, nc_variances


def _correct_1d(
variance: NDArray,
n_pattern: Optional[int] = None,
n_rdm: Optional[int] = None):
if (n_pattern is not None) and (n_rdm is not None):
# uncorrected dual bootstrap?
n = min(n_rdm, n_pattern)
elif n_pattern is not None:
n = n_pattern
elif n_rdm is not None:
n = n_rdm
else:
n = None
if n is not None:
variance = (n / (n - 1)) * variance
return variance


def get_errorbars(model_var, evaluations, dof, error_bars='sem',
test_type='t-test'):
""" computes errorbars for the model-evaluations from a results object
Expand Down Expand Up @@ -617,31 +664,47 @@ def get_errorbars(model_var, evaluations, dof, error_bars='sem',
errorbar_high = std_eval \
* tdist.ppf(prop_cut, dof)
else:
raise Exception('computing errorbars: Argument ' +
'error_bars is incorrectly defined as '
+ str(error_bars) + '.')
raise ValueError('computing errorbars: Argument ' +
'error_bars is incorrectly defined as '
+ str(error_bars) + '.')
limits = np.stack((errorbar_low, errorbar_high))
if np.isnan(limits).any() or (abs(limits) == np.inf).any():
raise Exception(
raise ValueError(
'computing errorbars: Too few bootstrap samples for the ' +
'requested confidence interval: ' + error_bars + '.')
return limits


def _dual_bootstrap(variances):
def _dual_bootstrap(variances, n_rdm=None, n_pattern=None):
""" helper function to perform the dual bootstrap
Takes a 3x... array of variances and computes the corrections assuming:
variances[0] are the variances in the double bootstrap
variances[1] are the variances in the rdm bootstrap
variances[2] are the variances in the pattern bootstrap
If both n_rdm and n_pattern are given this uses
the more accurate small sample formula.
"""
variance = 2 * (variances[1] + variances[2]) \
- variances[0]
variance = np.maximum(np.maximum(
variance, variances[1]), variances[2])
variance = np.minimum(
variance, variances[0])
if n_rdm is None or n_pattern is None:
variance = 2 * (variances[1] + variances[2]) \
- variances[0]
variance = np.maximum(np.maximum(
variance, variances[1]), variances[2])
variance = np.minimum(
variance, variances[0])
else:
variance = (
(n_rdm / (n_rdm - 1)) * variances[1]
+ (n_pattern / (n_pattern - 1)) * variances[2]
- ((n_pattern * n_rdm / (n_pattern - 1) / (n_rdm - 1))
* (variances[0] - variances[1] - variances[2])))
variance = np.maximum(np.maximum(
variance,
(n_rdm / (n_rdm - 1)) * variances[1]),
(n_pattern / (n_pattern - 1)) * variances[2])
variance = np.minimum(
variance, variances[0])
return variance


Expand Down

0 comments on commit ed75983

Please sign in to comment.