diff --git a/docs/src/api.md b/docs/src/api.md index 341dcbba..e3dd4f08 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -14,6 +14,8 @@ SurveyDesign ReplicateDesign load_data bootweights +jackknifeweights +jackknife_variance mean total quantile diff --git a/src/Survey.jl b/src/Survey.jl index 62f263a2..6960cfb5 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -25,6 +25,7 @@ include("boxplot.jl") include("show.jl") include("ratio.jl") include("by.jl") +include("jackknife.jl") export load_data export AbstractSurveyDesign, SurveyDesign, ReplicateDesign @@ -35,5 +36,6 @@ export hist, sturges, freedman_diaconis export boxplot export bootweights export ratio +export jackknifeweights, jackknife_variance end diff --git a/src/jackknife.jl b/src/jackknife.jl new file mode 100644 index 00000000..315de348 --- /dev/null +++ b/src/jackknife.jl @@ -0,0 +1,159 @@ +""" + jackknifeweights(design::SurveyDesign) + +Delete-1 Jackknife algorithm for replication weights from sampling weights. The replicate weights are calculated using the following formula. + +```math +w_{i(hj)} = +\\begin{cases} + w_i\\quad\\quad &\\text{if observation unit }i\\text{ is not in stratum }h\\\\ + 0\\quad\\quad &\\text{if observation unit }i\\text{ is in psu }j\\text{of stratum }h\\\\ + \\dfrac{n_h}{n_h - 1}w_i \\quad\\quad &\\text{if observation unit }i\\text{ is in stratum }h\\text{ but not in psu }j\\\\ +\\end{cases} +``` +In the above formula, ``w_i`` represent the original weights, ``w_{i(hj)}`` represent the replicate weights when the ``j``th PSU from cluster ``h`` is removed, and ``n_h`` represents the number of unique PSUs within cluster ``h``. Replicate weights are added as columns to `design.data`, and these columns have names of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weight columns. + +# Examples +```jldoctest +julia> using Survey; + +julia> apistrat = load_data("apistrat"); + +julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); + +julia> rstrat = jackknifeweights(dstrat) +ReplicateDesign: +data: 200×244 DataFrame +strata: stype + [E, E, E … M] +cluster: none +popsize: [4420.9999, 4420.9999, 4420.9999 … 1018.0] +sampsize: [100, 100, 100 … 50] +weights: [44.21, 44.21, 44.21 … 20.36] +allprobs: [0.0226, 0.0226, 0.0226 … 0.0491] +type: jackknife +replicates: 200 + +``` + +# Reference +pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) +""" +function jackknifeweights(design::SurveyDesign) + sort!(design.data, [design.strata, design.cluster]) + df = design.data + + stratified_gdf = groupby(df, design.strata) + replicate_index = 0 + for (key, subgroup) in pairs(stratified_gdf) + stratum = key[design.strata] # current stratum + psus_in_stratum = unique(subgroup[!, design.cluster]) + nh = length(psus_in_stratum) + + for psu in psus_in_stratum + replicate_index += 1 + colname = "replicate_"*string(replicate_index) + + # Initialise replicate_i with original sampling weights + df[!, colname] = Vector(df[!, design.weights]) + + # getting indexes + same_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .== psu) + different_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .!== psu) + + # scaling weights appropriately + df[same_psu, colname] .*= 0 + df[different_psu, colname] .*= nh/(nh - 1) + end + end + + return ReplicateDesign( + df, + design.cluster, + design.popsize, + design.sampsize, + design.strata, + design.weights, + design.allprobs, + design.pps, + "jackknife", + UInt(replicate_index), + [Symbol("replicate_"*string(index)) for index in 1:replicate_index] + ) +end + +""" + jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign) + +Compute variance of column `x` for the given `func` using the Jackknife method. The formula to compute this variance is the following. + +```math +\\hat{V}_{\\text{JK}}(\\hat{\\theta}) = \\sum_{h = 1}^H \\dfrac{n_h - 1}{n_h}\\sum_{j = 1}^{n_h}(\\hat{\\theta}_{(hj)} - \\hat{\\theta})^2 +``` + +Above, ``\\hat{\\theta}`` represents the estimator computed using the original weights, and ``\\hat{\\theta_{(hj)}}`` represents the estimator computed from the replicate weights obtained when PSU ``j`` from cluster ``h`` is removed. + +# Examples +```jldoctest +julia> using Survey, StatsBase + +julia> apistrat = load_data("apistrat"); + +julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); + +julia> rstrat = jackknifeweights(dstrat) +ReplicateDesign: +data: 200×244 DataFrame +strata: stype + [E, E, E … M] +cluster: none +popsize: [4420.9999, 4420.9999, 4420.9999 … 1018.0] +sampsize: [100, 100, 100 … 50] +weights: [44.21, 44.21, 44.21 … 20.36] +allprobs: [0.0226, 0.0226, 0.0226 … 0.0491] +type: jackknife +replicates: 200 + +julia> weightedmean(x, y) = mean(x, weights(y)); + +julia> jackknife_variance(:api00, weightedmean, rstrat) +1×2 DataFrame + Row │ estimator SE + │ Float64 Float64 +─────┼──────────────────── + 1 │ 662.287 9.53613 + +``` +# Reference +pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) +""" +function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign) + df = design.data + # sort!(df, [design.strata, design.cluster]) + stratified_gdf = groupby(df, design.strata) + + # estimator from original weights + θ = func(df[!, x], df[!, design.weights]) + + variance = 0 + replicate_index = 1 + for subgroup in stratified_gdf + psus_in_stratum = unique(subgroup[!, design.cluster]) + nh = length(psus_in_stratum) + cluster_variance = 0 + for psu in psus_in_stratum + # get replicate weights corresponding to current stratum and psu + rep_weights = df[!, "replicate_"*string(replicate_index)] + + # estimator from replicate weights + θhj = func(df[!, x], rep_weights) + + cluster_variance += ((nh - 1)/nh)*(θhj - θ)^2 + replicate_index += 1 + end + variance += cluster_variance + end + + return DataFrame(estimator = θ, SE = sqrt(variance)) +end + diff --git a/src/mean.jl b/src/mean.jl index 75fef801..580c63f2 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -46,13 +46,19 @@ julia> mean(:api00, bclus1) ``` """ function mean(x::Symbol, design::ReplicateDesign) - X = mean(design.data[!, x], weights(design.data[!, design.weights])) - Xt = [ - mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for - i = 1:design.replicates - ] - variance = sum((Xt .- X) .^ 2) / design.replicates - DataFrame(mean = X, SE = sqrt(variance)) + if design.type == "bootstrap" + θ̂ = mean(design.data[!, x], weights(design.data[!, design.weights])) + θ̂t = [ + mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for + i = 1:design.replicates + ] + variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates + return DataFrame(mean = θ̂, SE = sqrt(variance)) + # Jackknife integration + elseif design.type == "jackknife" + weightedmean(x, y) = mean(x, weights(y)) + return jackknife_variance(x, weightedmean, design) + end end """ diff --git a/test/jackknife.jl b/test/jackknife.jl new file mode 100644 index 00000000..73ab5b71 --- /dev/null +++ b/test/jackknife.jl @@ -0,0 +1,88 @@ +@testset "jackknife" begin + # Create Jackknife replicate designs + bsrs_jk = srs |> jackknifeweights # SRS + dstrat_jk = dstrat |> jackknifeweights # Stratified + dclus2_jk = dclus2 |> jackknifeweights # Two stage cluster + dnhanes_jk = dnhanes |> jackknifeweights # multistage stratified + + mean_srs_jk = mean([:api00,:api99], bsrs_jk) + @test mean_srs_jk.SE[1] ≈ 9.4028 atol = 1e-3 + @test mean_srs_jk.SE[2] ≈ 9.6575 atol = 1e-3 + + mean_strat_jk = mean([:api00,:api99],dstrat_jk) + @test mean_strat_jk.SE[1] ≈ 9.5361 atol = 1e-3 + @test mean_strat_jk.SE[2] ≈ 10.097 atol = 1e-3 + + mean_clus2_jk = mean([:api00,:api99],dclus2_jk) + @test mean_clus2_jk.SE[1] ≈ 34.9388 atol = 1e-3 # R gives 34.928 + @test mean_clus2_jk.SE[2] ≈ 34.6565 atol = 1e-3 # R gives 34.645 + + # Tests using for NHANES + mean_nhanes_jk = mean([:seq1, :seq2],dnhanes_jk) + @test mean_nhanes_jk.estimator[1] ≈ 21393.96 atol = 1e-3 + @test mean_nhanes_jk.SE[1] ≈ 143.371 atol = 1e-3 # R is slightly diff in 2nd decimal place + @test mean_nhanes_jk.estimator[2] ≈ 38508.328 atol = 1e-3 + @test mean_nhanes_jk.SE[2] ≈ 258.068 atol = 1e-3 # R is slightly diff in 2nd decimal place +end + +# # R code for correctness above +# library(survey) +# data(api) +# apiclus1$pw = rep(757/15,nrow(apiclus1)) +# No corrections needed for apiclus2, it has correct weights by default + +# ############# +# ###### 23.03.22 PR#260 + +# ## srs with fpc (doesnt match Julia) +# srs <- svydesign(id=~1,data=apisrs, weights=~pw, fpc=~fpc) +# bsrs_jk <- as.svrepdesign(srs, type="JK1", compress=FALSE) +# svymean(~api00,bsrs_jk) +# # mean SE +# # api00 656.58 9.2497 + +# ## srs without fpc (matches Julia) +# srs <- svydesign(id=~1,data=apisrs, weights=~pw)#, fpc=~fpc) +# bsrs_jk <- as.svrepdesign(srs, type="JK1", compress=FALSE) +# svymean(~api00+api99,bsrs_jk) +# # mean SE +# # api00 656.58 9.4028 +# # api99 624.68 9.6575 + +# # strat with fpc (doesnt match Julia) +# dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype,fpc=~fpc) +# dstrat_jk <- as.svrepdesign(dstrat, type="JKn", compress=FALSE) +# svymean(~api99,dstrat_jk) +# # mean SE +# # api99 629.39 9.9639 + +# # strat without fpc (matches Julia) +# dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype)#,fpc=~fpc) +# dstrat_jk <- as.svrepdesign(dstrat, type="JKn", compress=FALSE) +# svymean(~api00+api99,dstrat_jk) +# # mean SE +# # api00 662.29 9.5361 +# # api99 629.39 10.0970 + +#### apiclus2 +# > # clus2 without fpc (doesnt match Julia) +# > dclus2 <- svydesign(id=~dnum+snum, weights=~pw, data=apiclus2) +# > dclus2_jk <- as.svrepdesign(dclus2, type="JK1", compress=FALSE) +# > svymean(~api00+api99,dclus2) +# mean SE +# api00 670.81 30.712 +# api99 645.03 30.308 +# > svymean(~api00+api99,dclus2_jk) +# mean SE +# api00 670.81 34.928 +# api99 645.03 34.645 + +# NHANES test +# > data("nhanes") +# > nhanes$seq1 = seq(1.0, 5*8591.0, by = 5) +# > nhanes$seq2 = seq(1.0, 9*8591.0, by = 9)#, length.out=8591) +# > dnhanes <- svydesign(id=~SDMVPSU, strata=~SDMVSTRA, weights=~WTMEC2YR, nest=TRUE, data=nhanes) +# > svymean(~seq1+seq2,dnhanes) +# mean SE +# seq1 21394 143.34 +# seq2 38508 258.01 \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 048f74bb..d709103c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -41,6 +41,13 @@ apiclus2 = load_data("apiclus2") # Load API dataset dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign dclus2_boot = dclus2 |> bootweights # Create replicate design +# NHANES +nhanes = load_data("nhanes") +nhanes.seq1 = collect(1.0:5.0:42955.0) +nhanes.seq2 = collect(1.0:9.0:77319.0) # [9k for k in 0:8590.0] +dnhanes = SurveyDesign(nhanes; clusters = :SDMVPSU, strata = :SDMVSTRA, weights = :WTMEC2YR) +dnhanes_boot = dnhanes |> bootweights + @testset "Survey.jl" begin @test size(load_data("apiclus1")) == (183, 40) @test size(load_data("apiclus2")) == (126, 41) @@ -56,3 +63,4 @@ include("hist.jl") include("boxplot.jl") include("ratio.jl") include("show.jl") +include("jackknife.jl") \ No newline at end of file