Skip to content

Commit

Permalink
Merge branch 'v0.1.1' into as/allow_arbitrary_cols
Browse files Browse the repository at this point in the history
  • Loading branch information
smishr authored Apr 10, 2023
2 parents e552250 + 9d69475 commit 5b2cc46
Show file tree
Hide file tree
Showing 6 changed files with 272 additions and 7 deletions.
2 changes: 2 additions & 0 deletions docs/src/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ SurveyDesign
ReplicateDesign
load_data
bootweights
jackknifeweights
jackknife_variance
mean
total
quantile
Expand Down
2 changes: 2 additions & 0 deletions src/Survey.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ include("boxplot.jl")
include("show.jl")
include("ratio.jl")
include("by.jl")
include("jackknife.jl")

export load_data
export AbstractSurveyDesign, SurveyDesign, ReplicateDesign
Expand All @@ -35,5 +36,6 @@ export hist, sturges, freedman_diaconis
export boxplot
export bootweights
export ratio
export jackknifeweights, jackknife_variance

end
159 changes: 159 additions & 0 deletions src/jackknife.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
"""
jackknifeweights(design::SurveyDesign)
Delete-1 Jackknife algorithm for replication weights from sampling weights. The replicate weights are calculated using the following formula.
```math
w_{i(hj)} =
\\begin{cases}
w_i\\quad\\quad &\\text{if observation unit }i\\text{ is not in stratum }h\\\\
0\\quad\\quad &\\text{if observation unit }i\\text{ is in psu }j\\text{of stratum }h\\\\
\\dfrac{n_h}{n_h - 1}w_i \\quad\\quad &\\text{if observation unit }i\\text{ is in stratum }h\\text{ but not in psu }j\\\\
\\end{cases}
```
In the above formula, ``w_i`` represent the original weights, ``w_{i(hj)}`` represent the replicate weights when the ``j``th PSU from cluster ``h`` is removed, and ``n_h`` represents the number of unique PSUs within cluster ``h``. Replicate weights are added as columns to `design.data`, and these columns have names of the form `replicate_i`, where `i` ranges from 1 to the number of replicate weight columns.
# Examples
```jldoctest
julia> using Survey;
julia> apistrat = load_data("apistrat");
julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
julia> rstrat = jackknifeweights(dstrat)
ReplicateDesign:
data: 200×244 DataFrame
strata: stype
[E, E, E … M]
cluster: none
popsize: [4420.9999, 4420.9999, 4420.9999 … 1018.0]
sampsize: [100, 100, 100 … 50]
weights: [44.21, 44.21, 44.21 … 20.36]
allprobs: [0.0226, 0.0226, 0.0226 … 0.0491]
type: jackknife
replicates: 200
```
# Reference
pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
"""
function jackknifeweights(design::SurveyDesign)
sort!(design.data, [design.strata, design.cluster])
df = design.data

stratified_gdf = groupby(df, design.strata)
replicate_index = 0
for (key, subgroup) in pairs(stratified_gdf)
stratum = key[design.strata] # current stratum
psus_in_stratum = unique(subgroup[!, design.cluster])
nh = length(psus_in_stratum)

for psu in psus_in_stratum
replicate_index += 1
colname = "replicate_"*string(replicate_index)

# Initialise replicate_i with original sampling weights
df[!, colname] = Vector(df[!, design.weights])

# getting indexes
same_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .== psu)
different_psu = (df[!, design.strata] .== stratum) .&& (df[!, design.cluster] .!== psu)

# scaling weights appropriately
df[same_psu, colname] .*= 0
df[different_psu, colname] .*= nh/(nh - 1)
end
end

return ReplicateDesign(
df,
design.cluster,
design.popsize,
design.sampsize,
design.strata,
design.weights,
design.allprobs,
design.pps,
"jackknife",
UInt(replicate_index),
[Symbol("replicate_"*string(index)) for index in 1:replicate_index]
)
end

"""
jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
Compute variance of column `x` for the given `func` using the Jackknife method. The formula to compute this variance is the following.
```math
\\hat{V}_{\\text{JK}}(\\hat{\\theta}) = \\sum_{h = 1}^H \\dfrac{n_h - 1}{n_h}\\sum_{j = 1}^{n_h}(\\hat{\\theta}_{(hj)} - \\hat{\\theta})^2
```
Above, ``\\hat{\\theta}`` represents the estimator computed using the original weights, and ``\\hat{\\theta_{(hj)}}`` represents the estimator computed from the replicate weights obtained when PSU ``j`` from cluster ``h`` is removed.
# Examples
```jldoctest
julia> using Survey, StatsBase
julia> apistrat = load_data("apistrat");
julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
julia> rstrat = jackknifeweights(dstrat)
ReplicateDesign:
data: 200×244 DataFrame
strata: stype
[E, E, E … M]
cluster: none
popsize: [4420.9999, 4420.9999, 4420.9999 … 1018.0]
sampsize: [100, 100, 100 … 50]
weights: [44.21, 44.21, 44.21 … 20.36]
allprobs: [0.0226, 0.0226, 0.0226 … 0.0491]
type: jackknife
replicates: 200
julia> weightedmean(x, y) = mean(x, weights(y));
julia> jackknife_variance(:api00, weightedmean, rstrat)
1×2 DataFrame
Row │ estimator SE
│ Float64 Float64
─────┼────────────────────
1 │ 662.287 9.53613
```
# Reference
pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010)
"""
function jackknife_variance(x::Symbol, func::Function, design::ReplicateDesign)
df = design.data
# sort!(df, [design.strata, design.cluster])
stratified_gdf = groupby(df, design.strata)

# estimator from original weights
θ = func(df[!, x], df[!, design.weights])

variance = 0
replicate_index = 1
for subgroup in stratified_gdf
psus_in_stratum = unique(subgroup[!, design.cluster])
nh = length(psus_in_stratum)
cluster_variance = 0
for psu in psus_in_stratum
# get replicate weights corresponding to current stratum and psu
rep_weights = df[!, "replicate_"*string(replicate_index)]

# estimator from replicate weights
θhj = func(df[!, x], rep_weights)

cluster_variance += ((nh - 1)/nh)*(θhj - θ)^2
replicate_index += 1
end
variance += cluster_variance
end

return DataFrame(estimator = θ, SE = sqrt(variance))
end

20 changes: 13 additions & 7 deletions src/mean.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,19 @@ julia> mean(:api00, bclus1)
```
"""
function mean(x::Symbol, design::ReplicateDesign)
X = mean(design.data[!, x], weights(design.data[!, design.weights]))
Xt = [
mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for
i = 1:design.replicates
]
variance = sum((Xt .- X) .^ 2) / design.replicates
DataFrame(mean = X, SE = sqrt(variance))
if design.type == "bootstrap"
θ̂ = mean(design.data[!, x], weights(design.data[!, design.weights]))
θ̂t = [
mean(design.data[!, x], weights(design.data[!, "replicate_"*string(i)])) for
i = 1:design.replicates
]
variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates
return DataFrame(mean = θ̂, SE = sqrt(variance))
# Jackknife integration
elseif design.type == "jackknife"
weightedmean(x, y) = mean(x, weights(y))
return jackknife_variance(x, weightedmean, design)
end
end

"""
Expand Down
88 changes: 88 additions & 0 deletions test/jackknife.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
@testset "jackknife" begin
# Create Jackknife replicate designs
bsrs_jk = srs |> jackknifeweights # SRS
dstrat_jk = dstrat |> jackknifeweights # Stratified
dclus2_jk = dclus2 |> jackknifeweights # Two stage cluster
dnhanes_jk = dnhanes |> jackknifeweights # multistage stratified

mean_srs_jk = mean([:api00,:api99], bsrs_jk)
@test mean_srs_jk.SE[1] 9.4028 atol = 1e-3
@test mean_srs_jk.SE[2] 9.6575 atol = 1e-3

mean_strat_jk = mean([:api00,:api99],dstrat_jk)
@test mean_strat_jk.SE[1] 9.5361 atol = 1e-3
@test mean_strat_jk.SE[2] 10.097 atol = 1e-3

mean_clus2_jk = mean([:api00,:api99],dclus2_jk)
@test mean_clus2_jk.SE[1] 34.9388 atol = 1e-3 # R gives 34.928
@test mean_clus2_jk.SE[2] 34.6565 atol = 1e-3 # R gives 34.645

# Tests using for NHANES
mean_nhanes_jk = mean([:seq1, :seq2],dnhanes_jk)
@test mean_nhanes_jk.estimator[1] 21393.96 atol = 1e-3
@test mean_nhanes_jk.SE[1] 143.371 atol = 1e-3 # R is slightly diff in 2nd decimal place
@test mean_nhanes_jk.estimator[2] 38508.328 atol = 1e-3
@test mean_nhanes_jk.SE[2] 258.068 atol = 1e-3 # R is slightly diff in 2nd decimal place
end

# # R code for correctness above
# library(survey)
# data(api)
# apiclus1$pw = rep(757/15,nrow(apiclus1))
# No corrections needed for apiclus2, it has correct weights by default

# #############
# ###### 23.03.22 PR#260

# ## srs with fpc (doesnt match Julia)
# srs <- svydesign(id=~1,data=apisrs, weights=~pw, fpc=~fpc)
# bsrs_jk <- as.svrepdesign(srs, type="JK1", compress=FALSE)
# svymean(~api00,bsrs_jk)
# # mean SE
# # api00 656.58 9.2497

# ## srs without fpc (matches Julia)
# srs <- svydesign(id=~1,data=apisrs, weights=~pw)#, fpc=~fpc)
# bsrs_jk <- as.svrepdesign(srs, type="JK1", compress=FALSE)
# svymean(~api00+api99,bsrs_jk)
# # mean SE
# # api00 656.58 9.4028
# # api99 624.68 9.6575

# # strat with fpc (doesnt match Julia)
# dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype,fpc=~fpc)
# dstrat_jk <- as.svrepdesign(dstrat, type="JKn", compress=FALSE)
# svymean(~api99,dstrat_jk)
# # mean SE
# # api99 629.39 9.9639

# # strat without fpc (matches Julia)
# dstrat <- svydesign(data=apistrat, id=~1, weights=~pw, strata=~stype)#,fpc=~fpc)
# dstrat_jk <- as.svrepdesign(dstrat, type="JKn", compress=FALSE)
# svymean(~api00+api99,dstrat_jk)
# # mean SE
# # api00 662.29 9.5361
# # api99 629.39 10.0970

#### apiclus2
# > # clus2 without fpc (doesnt match Julia)
# > dclus2 <- svydesign(id=~dnum+snum, weights=~pw, data=apiclus2)
# > dclus2_jk <- as.svrepdesign(dclus2, type="JK1", compress=FALSE)
# > svymean(~api00+api99,dclus2)
# mean SE
# api00 670.81 30.712
# api99 645.03 30.308
# > svymean(~api00+api99,dclus2_jk)
# mean SE
# api00 670.81 34.928
# api99 645.03 34.645

# NHANES test
# > data("nhanes")
# > nhanes$seq1 = seq(1.0, 5*8591.0, by = 5)
# > nhanes$seq2 = seq(1.0, 9*8591.0, by = 9)#, length.out=8591)
# > dnhanes <- svydesign(id=~SDMVPSU, strata=~SDMVSTRA, weights=~WTMEC2YR, nest=TRUE, data=nhanes)
# > svymean(~seq1+seq2,dnhanes)
# mean SE
# seq1 21394 143.34
# seq2 38508 258.01
8 changes: 8 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ apiclus2 = load_data("apiclus2") # Load API dataset
dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign
dclus2_boot = dclus2 |> bootweights # Create replicate design

# NHANES
nhanes = load_data("nhanes")
nhanes.seq1 = collect(1.0:5.0:42955.0)
nhanes.seq2 = collect(1.0:9.0:77319.0) # [9k for k in 0:8590.0]
dnhanes = SurveyDesign(nhanes; clusters = :SDMVPSU, strata = :SDMVSTRA, weights = :WTMEC2YR)
dnhanes_boot = dnhanes |> bootweights

@testset "Survey.jl" begin
@test size(load_data("apiclus1")) == (183, 40)
@test size(load_data("apiclus2")) == (126, 41)
Expand All @@ -56,3 +63,4 @@ include("hist.jl")
include("boxplot.jl")
include("ratio.jl")
include("show.jl")
include("jackknife.jl")

0 comments on commit 5b2cc46

Please sign in to comment.