Skip to content

Commit

Permalink
add tests for USHD
Browse files Browse the repository at this point in the history
  • Loading branch information
ADucellierIHME committed Oct 28, 2024
1 parent d8347d7 commit 6b4b9cf
Show file tree
Hide file tree
Showing 9 changed files with 8,151 additions and 19 deletions.
166 changes: 165 additions & 1 deletion src/raking/formatting_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ def format_data_3D(
df_margins_3 : pd.DataFrame
Margins data (sums over the third variable)
var_names : list of 3 strings
Names of the two variables over which we rake (e.g. cause, race, county)
Names of the three variables over which we rake (e.g. cause, race, county)
weights : string
Name of the column containing the raking weights
lower : string
Expand Down Expand Up @@ -815,3 +815,167 @@ def format_data_3D(
else:
h = None
return (y, s1, s2, s3, I, J, K, q, l, h)


def format_data_USHD(
df_obs: pd.DataFrame,
df_margins: pd.DataFrame,
weights: str = None,
lower: str = None,
upper: str = None,
) -> tuple[
np.ndarray,
np.ndarray,
int,
int,
int,
np.ndarray | None,
np.ndarray | None,
np.ndarray | None,
]:
"""Read the data and create the inputs of the raking functions (USHD problem).
Parameters
----------
df_obs : pd.DataFrame
Observations data
df_margins : pd.DataFrame
Margins data (GBD)
weights : string
Name of the column containing the raking weights
lower : string
Name of the column containing the lower boundaries (for logit raking)
upper : string
Name of the column containing the upper boundaries (for logit raking)
Returns
-------
y : np.ndarray
Vector of observations
s : np.ndarray
Total number of deaths (all causes, and each cause)
I : int
Number of possible values for cause
J : int
Number of possible values for race
K : int
Number of possible values for county
q : np.ndarray
Vector of weights
l : np.ndarray
Lower bounds for the observations
h : np.ndarray
Upper bounds for the observations
"""
assert isinstance(
df_obs, pd.DataFrame
), "The observations should be a pandas data frame."
assert (
len(df_obs) >= 18
), "There should be at least 18 data points for the observations."

assert isinstance(
df_margins, pd.DataFrame
), "The margins should be a pandas data frame."
assert (
len(df_margins) >= 3
), "There should be at least 3 data points for the margins."

for var_name in ['value', 'cause', 'race' 'county']:
assert var_name in df_obs.columns.tolist(), (
"The column for the categorical variable "
+ var_name
+ " is missing from the observations data frame."
)

assert 'cause' in df_margins.columns.tolist(), (
"The cause column is missing from the margins data frame."
)
assert "value_agg_over_race_county" in df_margins.columns.tolist(), (
"The column for the aggregated value over races and counties is missing from the margins data frame."
)

if weights is not None:
assert isinstance(
weigths, str
), "The name of the column containing the weights should be a string."
assert (
weights in df_obs.columns.tolist()
), "The column containing the weights is missing from the data frame."
if lower is not None:
assert isinstance(
lower, str
), "The name of the column containing the lower boundaries should be a string."
assert (
lower in df_obs.columns.tolist()
), "The column containing the lower boundaries is missing from the data frame."
if upper is not None:
assert isinstance(
upper, str
), "The name of the column containing the upper boundaries should be a string."
assert (
upper in df_obs.columns.tolist()
), "The column containing the upper_boundaries is missing from the data frame."

# Check the observations data
for var_name in ['value', 'cause', 'race' 'county']:
assert df_obs[var_name].isna().sum() == 0, (
"There are missing values in the "
+ var_name
+ " column of the observations."
)
assert (
len(df_obs[df_obs.duplicated(['cause', 'race' 'county'])]) == 0
), "There are duplicated rows in the observations."
count_obs = df_obs[['cause', 'race' 'county']].value_counts()
assert (len(count_obs.unique()) == 1) and (count_obs.unique()[0] == 1), (
"There are missing combinations of cause, race and county in the observations."
)

# Check the margins data
assert df_margins['cause'].isna().sum() == 0, (
"There are missing values in the cause column of the margins."
)
assert df_margins["value_agg_over_race_county"].isna().sum() == 0, (
"There are missing values in the value_agg_over_race_county column of the margins."
)
assert (
len(df_margins[df_margins.duplicated(['cause'])]) == 0
), "There are duplicated rows in the margins data frame."

# Check consistency between observations and margins
assert len(df_obs['cause'].unique()) == len(
df_margins['cause'].unique()
), (
"The number of categories for cause should be the same in the observations and margins data frames."
)
assert set(df_obs['cause'].unique().tolist()) == set(
df_margins['cause'].unique().tolist()
), (
"The names of the categories for cause should be the same in the observations and margins data frames."
)

# Create input variables for the raking functions
df_obs.sort_values(
by=['county', 'race', 'cause'], inplace=True
)
df_margins.sort_values(by=['cause'], inplace=True)
I = len(df_obs['cause'].unique()) - 1
J = len(df_obs['race'].unique()) - 1
K = len(df_obs['county'].unique())
y = df_obs.value.to_numpy()
s = df_margins["value_agg_over_race-county"].to_numpy()
if weights is not None:
q = df_obs[weights].to_numpy()
else:
q = None
if lower is not None:
l = df_obs[lower].to_numpy()
else:
l = None
if upper is not None:
h = df_obs[upper].to_numpy()
else:
h = None
return (y, s, I, J, K, q, l, h)

112 changes: 94 additions & 18 deletions src/raking/run_raking.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
constraints_1D,
constraints_2D,
constraints_3D,
constraints_USHD,
)
from raking.compute_covariance import compute_covariance_obs
from raking.compute_covariance import (
Expand All @@ -24,6 +25,7 @@
format_data_1D,
format_data_2D,
format_data_3D,
format_data_USHD,
)
from raking.raking_methods import (
raking_chi2,
Expand All @@ -37,10 +39,10 @@


def run_raking(
dim: int,
dim: int | str,
df_obs: pd.DataFrame,
df_margins: list,
var_names: list,
var_names: list | None,
draws: str = "draws",
cov_mat: bool = True,
sigma_yy: np.ndarray = None,
Expand All @@ -61,14 +63,14 @@ def run_raking(
Parameters
----------
dim : integer
Dimension of the raking problem (1, 2, 3)
dim : integer or string
Dimension of the raking problem (1, 2, 3) or special case (USHD)
df_obs : pd.DataFrame
Observations data
df_margins : list of pd.DataFrame
list of data frames containing the margins data
var_names : list of strings
Names of the variables over which we rake (e.g. cause, race, county)
Names of the variables over which we rake (e.g. cause, race, county). None if using special case.
draws: string
Name of the column that contains the samples.
cov_mat : boolean
Expand Down Expand Up @@ -107,29 +109,34 @@ def run_raking(
df_obs : pd.DataFrame
The initial observations data frame with an additional column for the raked values
"""
assert isinstance(
dim, int
), "The dimension of the raking problem must be an integer."
assert isinstance(dim, int) or isinstance(dim, str)
), "The dimension of the raking problem must be an integer or string."
assert dim in [
1,
2,
3,
], "The dimension of the raking problem must be 1, 2 or 3."
"USHD"
], "The dimension of the raking problem must be 1, 2, 3 or USHD."
assert isinstance(
cov_mat, bool
), "cov_mat indicates whether we compute the covariance matrix, must be True or False."
assert isinstance(
var_names, list
), "The variables over which we rake must be entered as a list."
assert (
dim == len(var_names)
), "The number of variables over which we rake must be equal to the dimension of the problem."
if dim in [1, 2, 3]:
assert isinstance(
var_names, list
), "The variables over which we rake must be entered as a list."
assert (
dim == len(var_names)
), "The number of variables over which we rake must be equal to the dimension of the problem."
assert isinstance(
df_margins, list
), "The margins data frames must be entered as a list."
assert (
dim == len(df_margins)
), "The number of margins data frames must be equal to the dimension of the problem."
if dim in [1, 2, 3]:
assert (
dim == len(df_margins)
), "The number of margins data frames must be equal to the dimension of the problem."
else:
assert len(df_margins) == 1, \
"There should be only one margins data frame in the list."
assert isinstance(
method, str
), "The name of the distance function used for the raking must be a string."
Expand Down Expand Up @@ -172,6 +179,8 @@ def run_raking(
sigma_ss,
sigma_ys,
)
elif dim == "USHD":
pass
else:
pass
# Check if matrix is definite positive
Expand All @@ -198,6 +207,10 @@ def run_raking(
(y, s, q, l, h, A) = run_raking_3D(
df_obs, df_margins, var_names, weights, lower, upper, rtol, atol
)
elif dim == "USHD":
(y, s, q, l, h, A) = run_raking_USHD(
df_obs, df_margins, weights, lower, upper, rtol, atol
)
else:
pass

Expand Down Expand Up @@ -435,6 +448,69 @@ def run_raking_3D(
return (y, s, q, l, h, A)


def run_raking_USHD(
    df_obs: pd.DataFrame,
    df_margins: list,
    weights: str = None,
    lower: str = None,
    upper: str = None,
    rtol: float = 1e-05,
    atol: float = 1e-08,
) -> tuple[
    np.ndarray,
    np.ndarray,
    np.ndarray | None,
    np.ndarray | None,
    np.ndarray | None,
    np.ndarray,
]:
    """
    Prepare the variables needed to solve the raking problem in the USHD case.
    Parameters
    ----------
    df_obs : pd.DataFrame
        Observations data
    df_margins : list of pd.DataFrame
        List containing the single margins data frame (GBD margins by cause)
    weights : string
        Name of the column containing the raking weights
    lower : string
        Name of the column containing the lower boundaries (for logit raking)
    upper : string
        Name of the column containing the upper boundaries (for logit raking)
    rtol : float
        Relative tolerance to check whether the margins are consistent. See numpy.allclose documentation for details.
    atol : float
        Absolute tolerance to check whether the margins are consistent. See numpy.allclose documentation for details.
    Returns
    -------
    y : np.ndarray
        Vector of observations
    s : np.ndarray
        Margins vector
    q : np.ndarray
        Vector of weights
    l : np.ndarray
        Lower bounds for the observations
    h : np.ndarray
        Upper bounds for the observations
    A : np.ndarray
        Constraints matrix
    """
    # The USHD case takes a single margins data frame, passed inside a list
    # for consistency with the 1D/2D/3D entry points.
    df_margins_cause = df_margins[0]
    y, s_cause, I, J, K, q, l, h = format_data_USHD(
        df_obs, df_margins_cause, weights, lower, upper
    )
    # Build the constraints matrix and the (possibly adjusted) margins vector.
    A, s = constraints_USHD(s_cause, I, J, K, rtol, atol)
    return (y, s, q, l, h, A)


def compute_covariance_1D(
df_obs: pd.DataFrame,
df_margins: pd.DataFrame,
Expand Down
Loading

0 comments on commit 6b4b9cf

Please sign in to comment.