Merge branch 'v0.1.1' into as/allow_arbitrary_cols

xKDR · Apr 10, 2023 · 5b2cc46 · 5b2cc46
2 parents e552250 + 9d69475
commit 5b2cc46
Show file tree

Hide file tree

Showing 6 changed files with 272 additions and 7 deletions.
diff --git a/docs/src/api.md b/docs/src/api.md
@@ -14,6 +14,8 @@ SurveyDesign
 ReplicateDesign
 load_data
 bootweights
+jackknifeweights
+jackknife_variance
 mean
 total
 quantile

diff --git a/src/Survey.jl b/src/Survey.jl
@@ -25,6 +25,7 @@ include("boxplot.jl")
 include("show.jl")
 include("ratio.jl")
 include("by.jl")
+include("jackknife.jl")
 
 export load_data
 export AbstractSurveyDesign, SurveyDesign, ReplicateDesign
@@ -35,5 +36,6 @@ export hist, sturges, freedman_diaconis
 export boxplot
 export bootweights
 export ratio
+export jackknifeweights, jackknife_variance
 
 end
diff --git a/src/jackknife.jl b/src/jackknife.jl
@@ -0,0 +1,159 @@
+"""
+    jackknifeweights(design::SurveyDesign)
+
+Delete-1 Jackknife algorithm for replication weights from sampling weights. The replicate weights are calculated using the following formula.
+
+```math
+w_{i(hj)} =
+\\begin{cases}
+    w_i\\quad\\quad &\\text{if observation unit }i\\text{ is not in stratum }h\\\\
+    0\\quad\\quad &\\text{if observation unit }i\\text{ is in psu }j\\text{of stratum }h\\\\
+    \\dfrac{n_h}{n_h - 1}w_i \\quad\\quad &\\text{if observation unit }i\\text{ is in stratum }h\\text{ but not in psu }j\\\\
+\\end{cases}
+```
+In the above formula, ``w_i`` represent the original weights, ``w_{i(hj)}`` represent the replicate weights when the ``j``th PSU from cluster ``h`` is removed, and ``n_h`` represents the number of unique PSUs within cluster ``h``. Replicate weights are added as columns to `design.data`, and these columns have names of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weight columns.
+
+# Examples
+```jldoctest
+julia> using Survey;
+
+julia> apistrat = load_data("apistrat");
+
+julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
+
+julia> rstrat = jackknifeweights(dstrat)
+ReplicateDesign:
+data: 200×244 DataFrame
+strata: stype
+    [E, E, E  …  M]
+cluster: none
+popsize: [4420.9999, 4420.9999, 4420.9999  …  1018.0]
+sampsize: [100, 100, 100  …  50]
+weights: [44.21, 44.21, 44.21  …  20.36]
+allprobs: [0.0226, 0.0226, 0.0226  …  0.0491]
+type: jackknife
+replicates: 200
+
+```
+
+# Reference
+pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
+"""
+function jackknifeweights(design::SurveyDesign)
+    sort!(design.data, [design.strata, design.cluster])
+    df = design.data
+
+    stratified_gdf = groupby(df, design.strata)
+    replicate_index = 0
+    for (key, subgroup) in pairs(stratified_gdf)
+        stratum = key[design.strata]    # current stratum
+        psus_in_stratum = unique(subgroup[!, design.cluster])
+        nh = length(psus_in_stratum)
+
+        for psu in psus_in_stratum
+            replicate_index += 1
+            colname = "replicate_"*string(replicate_index)
+
+            # Initialise replicate_i with original sampling weights
+            df[!, colname] = Vector(df[!, design.weights])
+
+            # getting indexes
+            same_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .== psu)
+            different_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .!== psu)
+
+            # scaling weights appropriately
+            df[same_psu, colname] .*= 0
+            df[different_psu, colname] .*= nh/(nh - 1)
+        end
+    end
+
+    return ReplicateDesign(
+        df,
+        design.cluster,
+        design.popsize,
+        design.sampsize,
+        design.strata,
+        design.weights,
+        design.allprobs,
+        design.pps,
+        "jackknife",
+        UInt(replicate_index),
+        [Symbol("replicate_"*string(index)) for index in 1:replicate_index]
+    )
+end
+
+"""
+    jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
+
+Compute variance of column `x` for the given `func` using the Jackknife method. The formula to compute this variance is the following.
+
+```math
+\\hat{V}_{\\text{JK}}(\\hat{\\theta}) = \\sum_{h = 1}^H \\dfrac{n_h - 1}{n_h}\\sum_{j = 1}^{n_h}(\\hat{\\theta}_{(hj)} - \\hat{\\theta})^2
+```
+
+Above, ``\\hat{\\theta}`` represents the estimator computed using the original weights, and ``\\hat{\\theta_{(hj)}}`` represents the estimator computed from the replicate weights obtained when PSU ``j`` from cluster ``h`` is removed.
+
+# Examples
+```jldoctest
+julia> using Survey, StatsBase
+
+julia> apistrat = load_data("apistrat");
+
+julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
+
+julia> rstrat = jackknifeweights(dstrat)
+ReplicateDesign:
+data: 200×244 DataFrame
+strata: stype
+    [E, E, E  …  M]
+cluster: none
+popsize: [4420.9999, 4420.9999, 4420.9999  …  1018.0]
+sampsize: [100, 100, 100  …  50]
+weights: [44.21, 44.21, 44.21  …  20.36]
+allprobs: [0.0226, 0.0226, 0.0226  …  0.0491]
+type: jackknife
+replicates: 200
+
+julia> weightedmean(x, y) = mean(x, weights(y));
+
+julia> jackknife_variance(:api00, weightedmean, rstrat)
+1×2 DataFrame
+ Row │ estimator  SE
+     │ Float64    Float64
+─────┼────────────────────
+   1 │   662.287  9.53613
+
+```
+# Reference
+pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
+"""
+function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
+    df = design.data
+    # sort!(df, [design.strata, design.cluster])
+    stratified_gdf = groupby(df, design.strata)
+
+    # estimator from original weights
+    θ = func(df[!, x], df[!, design.weights])
+
+    variance = 0
+    replicate_index = 1
+    for subgroup in stratified_gdf
+        psus_in_stratum = unique(subgroup[!, design.cluster])
+        nh = length(psus_in_stratum)
+        cluster_variance = 0
+        for psu in psus_in_stratum
+            # get replicate weights corresponding to current stratum and psu
+            rep_weights = df[!, "replicate_"*string(replicate_index)]
+
+            # estimator from replicate weights
+            θhj = func(df[!, x], rep_weights)
+
+            cluster_variance += ((nh - 1)/nh)*(θhj - θ)^2
+            replicate_index += 1
+        end
+        variance += cluster_variance
+    end
+
+    return DataFrame(estimator = θ, SE = sqrt(variance))
+end
+
diff --git a/src/mean.jl b/src/mean.jl
@@ -46,13 +46,19 @@ julia> mean(:api00, bclus1)
 ```
 """
 function mean(x::Symbol, design::ReplicateDesign)
-    X = mean(design.data[!, x], weights(design.data[!, design.weights]))
-    Xt = [
-        mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for
-        i = 1:design.replicates
-    ]
-    variance = sum((Xt .- X) .^ 2) / design.replicates
-    DataFrame(mean = X, SE = sqrt(variance))
+    if design.type == "bootstrap"
+        θ̂ = mean(design.data[!, x], weights(design.data[!, design.weights]))
+        θ̂t = [
+            mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for
+            i = 1:design.replicates
+        ]
+        variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates
+        return DataFrame(mean = θ̂, SE = sqrt(variance))
+    # Jackknife integration
+    elseif design.type == "jackknife"
+        weightedmean(x, y) = mean(x, weights(y))
+        return jackknife_variance(x, weightedmean, design)
+    end
 end
 
 """

diff --git a/test/jackknife.jl b/test/jackknife.jl
@@ -0,0 +1,88 @@
+@testset "jackknife" begin
+    # Create Jackknife replicate designs
+    bsrs_jk = srs |> jackknifeweights # SRS
+    dstrat_jk = dstrat |> jackknifeweights # Stratified
+    dclus2_jk = dclus2 |> jackknifeweights # Two stage cluster
+    dnhanes_jk = dnhanes |> jackknifeweights # multistage stratified
+
+    mean_srs_jk = mean([:api00,:api99], bsrs_jk)
+    @test mean_srs_jk.SE[1] ≈ 9.4028 atol = 1e-3
+    @test mean_srs_jk.SE[2] ≈ 9.6575 atol = 1e-3
+
+    mean_strat_jk = mean([:api00,:api99],dstrat_jk)
+    @test mean_strat_jk.SE[1] ≈ 9.5361 atol = 1e-3
+    @test mean_strat_jk.SE[2] ≈ 10.097 atol = 1e-3
+
+    mean_clus2_jk = mean([:api00,:api99],dclus2_jk)
+    @test mean_clus2_jk.SE[1] ≈ 34.9388 atol = 1e-3 # R gives 34.928
+    @test mean_clus2_jk.SE[2] ≈ 34.6565 atol = 1e-3 # R gives 34.645
+
+    # Tests using for NHANES
+    mean_nhanes_jk = mean([:seq1, :seq2],dnhanes_jk)
+    @test mean_nhanes_jk.estimator[1] ≈ 21393.96 atol = 1e-3
+    @test mean_nhanes_jk.SE[1] ≈ 143.371 atol = 1e-3 # R is slightly diff in 2nd decimal place
+    @test mean_nhanes_jk.estimator[2] ≈ 38508.328 atol = 1e-3
+    @test mean_nhanes_jk.SE[2] ≈ 258.068 atol = 1e-3 # R is slightly diff in 2nd decimal place
+end
+
+# # R code for correctness above
+# library(survey) 
+# data(api)
+# apiclus1$pw = rep(757/15,nrow(apiclus1))
+# No corrections needed for apiclus2, it has correct weights by default
+
+# #############
+# ###### 23.03.22 PR#260
+
+# ## srs with fpc (doesnt match Julia)
+# srs <- svydesign(id=~1,data=apisrs, weights=~pw, fpc=~fpc)
+# bsrs_jk <- as.svrepdesign(srs, type="JK1", compress=FALSE)
+# svymean(~api00,bsrs_jk)
+# #         mean     SE
+# # api00 656.58 9.2497
+
+# ## srs without fpc (matches Julia)
+# srs <- svydesign(id=~1,data=apisrs, weights=~pw)#, fpc=~fpc)
+# bsrs_jk <- as.svrepdesign(srs, type="JK1", compress=FALSE)
+# svymean(~api00+api99,bsrs_jk)
+# #         mean     SE
+# # api00 656.58 9.4028
+# # api99 624.68 9.6575
+
+# # strat with fpc (doesnt match Julia)
+# dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype,fpc=~fpc)
+# dstrat_jk <- as.svrepdesign(dstrat, type="JKn", compress=FALSE)
+# svymean(~api99,dstrat_jk)
+# #         mean     SE
+# # api99 629.39 9.9639
+
+# # strat without fpc (matches Julia)
+# dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype)#,fpc=~fpc)
+# dstrat_jk <- as.svrepdesign(dstrat, type="JKn", compress=FALSE)
+# svymean(~api00+api99,dstrat_jk)
+# #         mean     SE
+# # api00 662.29  9.5361
+# # api99 629.39 10.0970
+
+#### apiclus2
+# > # clus2 without fpc (doesnt match Julia)
+# > dclus2 <- svydesign(id=~dnum+snum, weights=~pw, data=apiclus2)
+# > dclus2_jk <- as.svrepdesign(dclus2, type="JK1", compress=FALSE)
+# > svymean(~api00+api99,dclus2)
+#         mean     SE
+# api00 670.81 30.712
+# api99 645.03 30.308
+# > svymean(~api00+api99,dclus2_jk)
+#         mean     SE
+# api00 670.81 34.928
+# api99 645.03 34.645
+
+# NHANES test
+# > data("nhanes")
+# > nhanes$seq1 = seq(1.0, 5*8591.0, by = 5)
+# > nhanes$seq2 = seq(1.0, 9*8591.0, by = 9)#,  length.out=8591)
+# > dnhanes <- svydesign(id=~SDMVPSU, strata=~SDMVSTRA, weights=~WTMEC2YR, nest=TRUE, data=nhanes)
+# > svymean(~seq1+seq2,dnhanes)
+#       mean     SE
+# seq1 21394 143.34
+# seq2 38508 258.01
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -41,6 +41,13 @@ apiclus2 = load_data("apiclus2") # Load API dataset
 dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign
 dclus2_boot = dclus2 |> bootweights # Create replicate design
 
+# NHANES
+nhanes = load_data("nhanes")
+nhanes.seq1 = collect(1.0:5.0:42955.0)
+nhanes.seq2 = collect(1.0:9.0:77319.0) # [9k for k in 0:8590.0]
+dnhanes = SurveyDesign(nhanes; clusters = :SDMVPSU, strata = :SDMVSTRA, weights = :WTMEC2YR)
+dnhanes_boot = dnhanes |> bootweights
+
 @testset "Survey.jl" begin
     @test size(load_data("apiclus1")) == (183, 40)
     @test size(load_data("apiclus2")) == (126, 41)
@@ -56,3 +63,4 @@ include("hist.jl")
 include("boxplot.jl")
 include("ratio.jl")
 include("show.jl")
+include("jackknife.jl")