Skip to content

Commit

Permalink
add tests for USHD
Browse files Browse the repository at this point in the history
  • Loading branch information
ADucellierIHME committed Oct 28, 2024
1 parent d8347d7 commit 6b4b9cf
Show file tree
Hide file tree
Showing 9 changed files with 8,151 additions and 19 deletions.
166 changes: 165 additions & 1 deletion src/raking/formatting_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ def format_data_3D(
df_margins_3 : pd.DataFrame
Margins data (sums over the third variable)
var_names : list of 3 strings
Names of the two variables over which we rake (e.g. cause, race, county)
Names of the three variables over which we rake (e.g. cause, race, county)
weights : string
Name of the column containing the raking weights
lower : string
Expand Down Expand Up @@ -815,3 +815,167 @@ def format_data_3D(
else:
h = None
return (y, s1, s2, s3, I, J, K, q, l, h)


def format_data_USHD(
df_obs: pd.DataFrame,
df_margins: pd.DataFrame,
weights: str = None,
lower: str = None,
upper: str = None,
) -> tuple[
np.ndarray,
np.ndarray,
int,
int,
int,
np.ndarray | None,
np.ndarray | None,
np.ndarray | None,
]:
"""Read the data and create the inputs of the raking functions (USHD problem).
Parameters
----------
df_obs : pd.DataFrame
Observations data
df_margins : pd.DataFrame
Margins data (GBD)
weights : string
Name of the column containing the raking weights
lower : string
Name of the column containing the lower boundaries (for logit raking)
upper : string
Name of the column containing the upper boundaries (for logit raking)
Returns
-------
y : np.ndarray
Vector of observations
s : np.ndarray
Total number of deaths (all causes, and each cause)
I : int
Number of possible values for cause
J : int
Number of possible values for race
K : int
Number of possible values for county
q : np.ndarray
Vector of weights
l : np.ndarray
Lower bounds for the observations
h : np.ndarray
Upper bounds for the observations
"""
assert isinstance(
df_obs, pd.DataFrame
), "The observations should be a pandas data frame."
assert (
len(df_obs) >= 18
), "There should be at least 18 data points for the observations."

assert isinstance(
df_margins, pd.DataFrame
), "The margins should be a pandas data frame."
assert (
len(df_margins) >= 3
), "There should be at least 3 data points for the margins."

for var_name in ['value', 'cause', 'race' 'county']:
assert var_name in df_obs.columns.tolist(), (
"The column for the categorical variable "
+ var_name
+ " is missing from the observations data frame."
)

assert 'cause' in df_margins.columns.tolist(), (
"The cause column is missing from the margins data frame."
)
assert "value_agg_over_race_county" in df_margins.columns.tolist(), (
"The column for the aggregated value over races and counties is missing from the margins data frame."
)

if weights is not None:
assert isinstance(
weigths, str
), "The name of the column containing the weights should be a string."
assert (
weights in df_obs.columns.tolist()
), "The column containing the weights is missing from the data frame."
if lower is not None:
assert isinstance(
lower, str
), "The name of the column containing the lower boundaries should be a string."
assert (
lower in df_obs.columns.tolist()
), "The column containing the lower boundaries is missing from the data frame."
if upper is not None:
assert isinstance(
upper, str
), "The name of the column containing the upper boundaries should be a string."
assert (
upper in df_obs.columns.tolist()
), "The column containing the upper_boundaries is missing from the data frame."

# Check the observations data
for var_name in ['value', 'cause', 'race' 'county']:
assert df_obs[var_name].isna().sum() == 0, (
"There are missing values in the "
+ var_name
+ " column of the observations."
)
assert (
len(df_obs[df_obs.duplicated(['cause', 'race' 'county'])]) == 0
), "There are duplicated rows in the observations."
count_obs = df_obs[['cause', 'race' 'county']].value_counts()
assert (len(count_obs.unique()) == 1) and (count_obs.unique()[0] == 1), (
"There are missing combinations of cause, race and county in the observations."
)

# Check the margins data
assert df_margins['cause'].isna().sum() == 0, (
"There are missing values in the cause column of the margins."
)
assert df_margins["value_agg_over_race_county"].isna().sum() == 0, (
"There are missing values in the value_agg_over_race_county column of the margins."
)
assert (
len(df_margins[df_margins.duplicated(['cause'])]) == 0
), "There are duplicated rows in the margins data frame."

# Check consistency between observations and margins
assert len(df_obs['cause'].unique()) == len(
df_margins['cause'].unique()
), (
"The number of categories for cause should be the same in the observations and margins data frames."
)
assert set(df_obs['cause'].unique().tolist()) == set(
df_margins['cause'].unique().tolist()
), (
"The names of the categories for cause should be the same in the observations and margins data frames."
)

# Create input variables for the raking functions
df_obs.sort_values(
by=['county', 'race', 'cause'], inplace=True
)
df_margins.sort_values(by=['cause'], inplace=True)
I = len(df_obs['cause'].unique()) - 1
J = len(df_obs['race'].unique()) - 1
K = len(df_obs['county'].unique())
y = df_obs.value.to_numpy()
s = df_margins["value_agg_over_race-county"].to_numpy()
if weights is not None:
q = df_obs[weights].to_numpy()
else:
q = None
if lower is not None:
l = df_obs[lower].to_numpy()
else:
l = None
if upper is not None:
h = df_obs[upper].to_numpy()
else:
h = None
return (y, s, I, J, K, q, l, h)

112 changes: 94 additions & 18 deletions src/raking/run_raking.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
constraints_1D,
constraints_2D,
constraints_3D,
constraints_USHD,
)
from raking.compute_covariance import compute_covariance_obs
from raking.compute_covariance import (
Expand All @@ -24,6 +25,7 @@
format_data_1D,
format_data_2D,
format_data_3D,
format_data_USHD,
)
from raking.raking_methods import (
raking_chi2,
Expand All @@ -37,10 +39,10 @@


def run_raking(
dim: int,
dim: int | str,
df_obs: pd.DataFrame,
df_margins: list,
var_names: list,
var_names: list | None,
draws: str = "draws",
cov_mat: bool = True,
sigma_yy: np.ndarray = None,
Expand All @@ -61,14 +63,14 @@ def run_raking(
Parameters
----------
dim : integer
Dimension of the raking problem (1, 2, 3)
dim : integer or string
Dimension of the raking problem (1, 2, 3) or special case (USHD)
df_obs : pd.DataFrame
Observations data
df_margins : list of pd.DataFrame
list of data frames containing the margins data
var_names : list of strings
Names of the variables over which we rake (e.g. cause, race, county)
Names of the variables over which we rake (e.g. cause, race, county). None if using special case.
draws: string
Name of the column that contains the samples.
cov_mat : boolean
Expand Down Expand Up @@ -107,29 +109,34 @@ def run_raking(
df_obs : pd.DataFrame
The initial observations data frame with an additional column for the raked values
"""
assert isinstance(
dim, int
), "The dimension of the raking problem must be an integer."
assert isinstance(dim, int) or isinstance(dim, str)
), "The dimension of the raking problem must be an integer or string."
assert dim in [
1,
2,
3,
], "The dimension of the raking problem must be 1, 2 or 3."
"USHD"
], "The dimension of the raking problem must be 1, 2, 3 or USHD."
assert isinstance(
cov_mat, bool
), "cov_mat indicates whether we compute the covariance matrix, must be True or False."
assert isinstance(
var_names, list
), "The variables over which we rake must be entered as a list."
assert (
dim == len(var_names)
), "The number of variables over which we rake must be equal to the dimension of the problem."
if dim in [1, 2, 3]:
assert isinstance(
var_names, list
), "The variables over which we rake must be entered as a list."
assert (
dim == len(var_names)
), "The number of variables over which we rake must be equal to the dimension of the problem."
assert isinstance(
df_margins, list
), "The margins data frames must be entered as a list."
assert (
dim == len(df_margins)
), "The number of margins data frames must be equal to the dimension of the problem."
if dim in [1, 2, 3]:
assert (
dim == len(df_margins)
), "The number of margins data frames must be equal to the dimension of the problem."
else:
assert len(df_margins) == 1, \
"There should be only one margins data frame in the list."
assert isinstance(
method, str
), "The name of the distance function used for the raking must be a string."
Expand Down Expand Up @@ -172,6 +179,8 @@ def run_raking(
sigma_ss,
sigma_ys,
)
elif dim == "USHD":
pass
else:
pass
# Check if matrix is definite positive
Expand All @@ -198,6 +207,10 @@ def run_raking(
(y, s, q, l, h, A) = run_raking_3D(
df_obs, df_margins, var_names, weights, lower, upper, rtol, atol
)
elif dim == "USHD":
(y, s, q, l, h, A) = run_raking_USHD(
df_obs, df_margins, weights, lower, upper, rtol, atol
)
else:
pass

Expand Down Expand Up @@ -435,6 +448,69 @@ def run_raking_3D(
return (y, s, q, l, h, A)


def run_raking_USHD(
    df_obs: pd.DataFrame,
    df_margins: list,
    weights: str = None,
    lower: str = None,
    upper: str = None,
    rtol: float = 1e-05,
    atol: float = 1e-08,
) -> tuple[
    np.ndarray,
    np.ndarray,
    np.ndarray | None,
    np.ndarray | None,
    np.ndarray | None,
    np.ndarray,
]:
    """
    Prepare the variables needed to solve the raking problem in the USHD case.
    Parameters
    ----------
    df_obs : pd.DataFrame
        Observations data
    df_margins : list of pd.DataFrame
        List containing the single margins data frame (GBD margins by cause)
    weights : string
        Name of the column containing the raking weights
    lower : string
        Name of the column containing the lower boundaries (for logit raking)
    upper : string
        Name of the column containing the upper boundaries (for logit raking)
    rtol : float
        Relative tolerance to check whether the margins are consistent. See numpy.allclose documentation for details.
    atol : float
        Absolute tolerance to check whether the margins are consistent. See numpy.allclose documentation for details.
    Returns
    -------
    y : np.ndarray
        Vector of observations
    s : np.ndarray
        Margins vector
    q : np.ndarray
        Vector of weights
    l : np.ndarray
        Lower bounds for the observations
    h : np.ndarray
        Upper bounds for the observations
    A : np.ndarray
        Constraints matrix
    """
    # The USHD case takes a single margins data frame, passed inside a list
    # for consistency with the 1D/2D/3D entry points.
    df_margins_cause = df_margins[0]
    y, s_cause, I, J, K, q, l, h = format_data_USHD(
        df_obs, df_margins_cause, weights, lower, upper
    )
    # Build the constraints matrix and the (possibly adjusted) margins vector.
    A, s = constraints_USHD(s_cause, I, J, K, rtol, atol)
    return (y, s, q, l, h, A)


def compute_covariance_1D(
df_obs: pd.DataFrame,
df_margins: pd.DataFrame,
Expand Down
Loading

0 comments on commit 6b4b9cf

Please sign in to comment.