-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'v0.1.1' into as/allow_arbitrary_cols
- Loading branch information
Showing
6 changed files
with
272 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
""" | ||
jackknifeweights(design::SurveyDesign) | ||
Delete-1 Jackknife algorithm for replication weights from sampling weights. The replicate weights are calculated using the following formula. | ||
```math | ||
w_{i(hj)} = | ||
\\begin{cases} | ||
w_i\\quad\\quad &\\text{if observation unit }i\\text{ is not in stratum }h\\\\ | ||
0\\quad\\quad &\\text{if observation unit }i\\text{ is in psu }j\\text{of stratum }h\\\\ | ||
\\dfrac{n_h}{n_h - 1}w_i \\quad\\quad &\\text{if observation unit }i\\text{ is in stratum }h\\text{ but not in psu }j\\\\ | ||
\\end{cases} | ||
``` | ||
In the above formula, ``w_i`` represent the original weights, ``w_{i(hj)}`` represent the replicate weights when the ``j``th PSU from cluster ``h`` is removed, and ``n_h`` represents the number of unique PSUs within cluster ``h``. Replicate weights are added as columns to `design.data`, and these columns have names of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weight columns. | ||
# Examples | ||
```jldoctest | ||
julia> using Survey; | ||
julia> apistrat = load_data("apistrat"); | ||
julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); | ||
julia> rstrat = jackknifeweights(dstrat) | ||
ReplicateDesign: | ||
data: 200×244 DataFrame | ||
strata: stype | ||
[E, E, E … M] | ||
cluster: none | ||
popsize: [4420.9999, 4420.9999, 4420.9999 … 1018.0] | ||
sampsize: [100, 100, 100 … 50] | ||
weights: [44.21, 44.21, 44.21 … 20.36] | ||
allprobs: [0.0226, 0.0226, 0.0226 … 0.0491] | ||
type: jackknife | ||
replicates: 200 | ||
``` | ||
# Reference | ||
pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) | ||
""" | ||
function jackknifeweights(design::SurveyDesign) | ||
sort!(design.data, [design.strata, design.cluster]) | ||
df = design.data | ||
|
||
stratified_gdf = groupby(df, design.strata) | ||
replicate_index = 0 | ||
for (key, subgroup) in pairs(stratified_gdf) | ||
stratum = key[design.strata] # current stratum | ||
psus_in_stratum = unique(subgroup[!, design.cluster]) | ||
nh = length(psus_in_stratum) | ||
|
||
for psu in psus_in_stratum | ||
replicate_index += 1 | ||
colname = "replicate_"*string(replicate_index) | ||
|
||
# Initialise replicate_i with original sampling weights | ||
df[!, colname] = Vector(df[!, design.weights]) | ||
|
||
# getting indexes | ||
same_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .== psu) | ||
different_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .!== psu) | ||
|
||
# scaling weights appropriately | ||
df[same_psu, colname] .*= 0 | ||
df[different_psu, colname] .*= nh/(nh - 1) | ||
end | ||
end | ||
|
||
return ReplicateDesign( | ||
df, | ||
design.cluster, | ||
design.popsize, | ||
design.sampsize, | ||
design.strata, | ||
design.weights, | ||
design.allprobs, | ||
design.pps, | ||
"jackknife", | ||
UInt(replicate_index), | ||
[Symbol("replicate_"*string(index)) for index in 1:replicate_index] | ||
) | ||
end | ||
|
||
""" | ||
jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign) | ||
Compute variance of column `x` for the given `func` using the Jackknife method. The formula to compute this variance is the following. | ||
```math | ||
\\hat{V}_{\\text{JK}}(\\hat{\\theta}) = \\sum_{h = 1}^H \\dfrac{n_h - 1}{n_h}\\sum_{j = 1}^{n_h}(\\hat{\\theta}_{(hj)} - \\hat{\\theta})^2 | ||
``` | ||
Above, ``\\hat{\\theta}`` represents the estimator computed using the original weights, and ``\\hat{\\theta_{(hj)}}`` represents the estimator computed from the replicate weights obtained when PSU ``j`` from cluster ``h`` is removed. | ||
# Examples | ||
```jldoctest | ||
julia> using Survey, StatsBase | ||
julia> apistrat = load_data("apistrat"); | ||
julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); | ||
julia> rstrat = jackknifeweights(dstrat) | ||
ReplicateDesign: | ||
data: 200×244 DataFrame | ||
strata: stype | ||
[E, E, E … M] | ||
cluster: none | ||
popsize: [4420.9999, 4420.9999, 4420.9999 … 1018.0] | ||
sampsize: [100, 100, 100 … 50] | ||
weights: [44.21, 44.21, 44.21 … 20.36] | ||
allprobs: [0.0226, 0.0226, 0.0226 … 0.0491] | ||
type: jackknife | ||
replicates: 200 | ||
julia> weightedmean(x, y) = mean(x, weights(y)); | ||
julia> jackknife_variance(:api00, weightedmean, rstrat) | ||
1×2 DataFrame | ||
Row │ estimator SE | ||
│ Float64 Float64 | ||
─────┼──────────────────── | ||
1 │ 662.287 9.53613 | ||
``` | ||
# Reference | ||
pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) | ||
""" | ||
function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign) | ||
df = design.data | ||
# sort!(df, [design.strata, design.cluster]) | ||
stratified_gdf = groupby(df, design.strata) | ||
|
||
# estimator from original weights | ||
θ = func(df[!, x], df[!, design.weights]) | ||
|
||
variance = 0 | ||
replicate_index = 1 | ||
for subgroup in stratified_gdf | ||
psus_in_stratum = unique(subgroup[!, design.cluster]) | ||
nh = length(psus_in_stratum) | ||
cluster_variance = 0 | ||
for psu in psus_in_stratum | ||
# get replicate weights corresponding to current stratum and psu | ||
rep_weights = df[!, "replicate_"*string(replicate_index)] | ||
|
||
# estimator from replicate weights | ||
θhj = func(df[!, x], rep_weights) | ||
|
||
cluster_variance += ((nh - 1)/nh)*(θhj - θ)^2 | ||
replicate_index += 1 | ||
end | ||
variance += cluster_variance | ||
end | ||
|
||
return DataFrame(estimator = θ, SE = sqrt(variance)) | ||
end | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
@testset "jackknife" begin | ||
# Create Jackknife replicate designs | ||
bsrs_jk = srs |> jackknifeweights # SRS | ||
dstrat_jk = dstrat |> jackknifeweights # Stratified | ||
dclus2_jk = dclus2 |> jackknifeweights # Two stage cluster | ||
dnhanes_jk = dnhanes |> jackknifeweights # multistage stratified | ||
|
||
mean_srs_jk = mean([:api00,:api99], bsrs_jk) | ||
@test mean_srs_jk.SE[1] ≈ 9.4028 atol = 1e-3 | ||
@test mean_srs_jk.SE[2] ≈ 9.6575 atol = 1e-3 | ||
|
||
mean_strat_jk = mean([:api00,:api99],dstrat_jk) | ||
@test mean_strat_jk.SE[1] ≈ 9.5361 atol = 1e-3 | ||
@test mean_strat_jk.SE[2] ≈ 10.097 atol = 1e-3 | ||
|
||
mean_clus2_jk = mean([:api00,:api99],dclus2_jk) | ||
@test mean_clus2_jk.SE[1] ≈ 34.9388 atol = 1e-3 # R gives 34.928 | ||
@test mean_clus2_jk.SE[2] ≈ 34.6565 atol = 1e-3 # R gives 34.645 | ||
|
||
# Tests using for NHANES | ||
mean_nhanes_jk = mean([:seq1, :seq2],dnhanes_jk) | ||
@test mean_nhanes_jk.estimator[1] ≈ 21393.96 atol = 1e-3 | ||
@test mean_nhanes_jk.SE[1] ≈ 143.371 atol = 1e-3 # R is slightly diff in 2nd decimal place | ||
@test mean_nhanes_jk.estimator[2] ≈ 38508.328 atol = 1e-3 | ||
@test mean_nhanes_jk.SE[2] ≈ 258.068 atol = 1e-3 # R is slightly diff in 2nd decimal place | ||
end | ||
|
||
# # R code for correctness above | ||
# library(survey) | ||
# data(api) | ||
# apiclus1$pw = rep(757/15,nrow(apiclus1)) | ||
# No corrections needed for apiclus2, it has correct weights by default | ||
|
||
# ############# | ||
# ###### 23.03.22 PR#260 | ||
|
||
# ## srs with fpc (doesnt match Julia) | ||
# srs <- svydesign(id=~1,data=apisrs, weights=~pw, fpc=~fpc) | ||
# bsrs_jk <- as.svrepdesign(srs, type="JK1", compress=FALSE) | ||
# svymean(~api00,bsrs_jk) | ||
# # mean SE | ||
# # api00 656.58 9.2497 | ||
|
||
# ## srs without fpc (matches Julia) | ||
# srs <- svydesign(id=~1,data=apisrs, weights=~pw)#, fpc=~fpc) | ||
# bsrs_jk <- as.svrepdesign(srs, type="JK1", compress=FALSE) | ||
# svymean(~api00+api99,bsrs_jk) | ||
# # mean SE | ||
# # api00 656.58 9.4028 | ||
# # api99 624.68 9.6575 | ||
|
||
# # strat with fpc (doesnt match Julia) | ||
# dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype,fpc=~fpc) | ||
# dstrat_jk <- as.svrepdesign(dstrat, type="JKn", compress=FALSE) | ||
# svymean(~api99,dstrat_jk) | ||
# # mean SE | ||
# # api99 629.39 9.9639 | ||
|
||
# # strat without fpc (matches Julia) | ||
# dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype)#,fpc=~fpc) | ||
# dstrat_jk <- as.svrepdesign(dstrat, type="JKn", compress=FALSE) | ||
# svymean(~api00+api99,dstrat_jk) | ||
# # mean SE | ||
# # api00 662.29 9.5361 | ||
# # api99 629.39 10.0970 | ||
|
||
#### apiclus2 | ||
# > # clus2 without fpc (doesnt match Julia) | ||
# > dclus2 <- svydesign(id=~dnum+snum, weights=~pw, data=apiclus2) | ||
# > dclus2_jk <- as.svrepdesign(dclus2, type="JK1", compress=FALSE) | ||
# > svymean(~api00+api99,dclus2) | ||
# mean SE | ||
# api00 670.81 30.712 | ||
# api99 645.03 30.308 | ||
# > svymean(~api00+api99,dclus2_jk) | ||
# mean SE | ||
# api00 670.81 34.928 | ||
# api99 645.03 34.645 | ||
|
||
# NHANES test | ||
# > data("nhanes") | ||
# > nhanes$seq1 = seq(1.0, 5*8591.0, by = 5) | ||
# > nhanes$seq2 = seq(1.0, 9*8591.0, by = 9)#, length.out=8591) | ||
# > dnhanes <- svydesign(id=~SDMVPSU, strata=~SDMVSTRA, weights=~WTMEC2YR, nest=TRUE, data=nhanes) | ||
# > svymean(~seq1+seq2,dnhanes) | ||
# mean SE | ||
# seq1 21394 143.34 | ||
# seq2 38508 258.01 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters