---
title: "Coral Heritability Meta-analysis"
subtitle: "Supplementary Code Documentation A: Pre-processing"
author: "Kevin R Bairos-Novak"
date: "Document last run on `r format(Sys.time(), '%Y-%m-%d')`"
output:
html_document:
df_print: kable
toc: yes
toc_float: false
highlight: tango
theme: cerulean
number_sections: no
fig_width: 10
fig_height: 8
fig_caption: yes
code_folding: hide
self_contained: TRUE
editor_options:
chunk_output_type: console
knit: (function(inputFile, encoding) { rmarkdown::render(inputFile, encoding = encoding, output_file = paste0(xfun::sans_ext(inputFile),"_",format(Sys.time(), '%Y-%m-%d'), ".html")) })
---
```{r}
knitr::opts_chunk$set(echo = TRUE, comment="",
message=F, warning=F)
patt <- "Supplementary_Code_Documentation_A_Pre-processing"
x <- paste0(patt,"_",format(Sys.time(), '%Y-%m-%d'), ".html")
y <- list.files(pattern = glob2rx(paste0(patt,"*.html")))
y <- y[!(y == x)]
unlink(y, recursive=T)
setwd("~/Documents/PhD Thesis/Heritability meta-analysis")
rm(list=ls());
```
<style>
.container { width: 1000px; }
h2 { color: #f8f8f8; background-color: #437FAA; }
h3 { color: #f8f8f8; background-color: #437FAA; text-align: center; }
<!-- Get highlight from Terminal: pandoc --print-highlight-style tango -->
</style>
---
```{r, class.source = 'fold-show'}
rm(list=ls())
## Install the required packages if not previously installed, e.g.:
# if(!require("readxl")) install.packages("readxl")
library(readxl)
library(tidyverse); theme_set(theme_light())
library(metafor)
library(grid)
library(gridExtra)
library(quantreg)
source('~/Documents/PhD Thesis/Heritability meta-analysis/Functions/useful_functions.R')
# using qlogis() for logit, plogis() for inverse logit
data <- read_excel("Data/heritability_estimates.xlsx", sheet="h2")
# create factor of different sampling variance types, relevel to have se on top
data <- data %>% mutate(sv.type = factor(case_when(
!is.na(se) ~ "se",
!is.na(bci.lwr) ~ "bci",
TRUE ~ "none"))) %>%
within({sv.type = relevel(sv.type, ref="se")}) %>%
filter(not.valid==0) # delete ones that don't have valid h2s
data$est.id <- factor(1:nrow(data)) # assign individual estimate numbers
```
# Different measures of sampling variance
Data transformation is necessary for analyses of proportions in order to avoid boundary issues when proportions are near 0 or 1. Transformation also helps produce data that are closer to Gaussian-distributed, allowing better estimation of parameters in random-effects meta-analysis.
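As a quick illustration of the boundary issue, the logit transform stretches proportions near 0 and 1 onto an unbounded scale (a minimal sketch using the base-R `qlogis()`/`plogis()` functions noted in the setup chunk; the proportions are illustrative only):
```{r, eval=FALSE}
# Mid-range proportions stay near zero on the logit scale, while
# proportions near the boundaries map to extreme values:
p <- c(0.001, 0.05, 0.5, 0.95, 0.999)
qlogis(p)          # -6.91 -2.94  0.00  2.94  6.91
plogis(qlogis(p))  # inverse logit recovers the original proportions
```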
The main issue with data transformation arises from the fact that we have three main types of sampling variance (herein *SV*, i.e. the square of the standard error of the mean); sampling variance is reported as either:
- standard error of the mean (herein *SE*)
- confidence or Bayesian credible intervals (*CI*)
- no estimate of sampling variance (*none*)
Using the equation for calculating 95% confidence intervals ($p \pm z^{*} \cdot SE(p)$), we can convert *SE* to *CI*, or vice versa, then conduct a transformation on only *SE*s or only *CI*s. These two classes of transformations both have important caveats.
Direct transformations of *SE* are more typical for meta-analyses of proportions (Wang 2018), and have the advantage that no *SE* is too large to transform to a new scale.
However, going from *CI* to *SE* sacrifices the asymmetry implicit in many Bayesian posteriors, which would otherwise be somewhat maintained if converting *CI*s directly to a different scale. Additionally, most *SE*-transforms for binomial data require a sample size *n*, which is not an observed quantity for proportions such as heritability, a ratio of two variances and thus an inherent proportion with no associated sample size.
Alternatively, we can change *SE*s to *CI*s and transform both the point estimate and the upper and lower bounds provided by the 95% *CI* to reach the new scale.
This allows us to keep the implicit asymmetry of many posterior estimates of $h^2$ calculated using MCMCglmm, but for some estimates where the *SE* is large, no transformation is possible, as the upper and/or lower bound falls outside the range [0, 1].
Additionally, if $p \pm z^{*} \cdot SE(p)$ happens to produce values very close to (but within) 0 or 1, these values tend to have exceedingly high logit-scale *SE*s upon transformation.
Thus, neither approach to transformation is perfect. We therefore compare a number of transformations computed using both *SE*- and *CI*-based approaches, and finish by comparing the main results from each model to assess their robustness.
# Transformations
## SE-transformations
Two different classes of transforms are possible for our data. The first converts directly from *SE*s to the transformed scale using the logit (a.k.a. log-odds, Delta-logit), logarithmic, arcsine square root, or Freeman-Tukey double arcsine transformations. These transformations are conducted automatically by the `metafor::escalc()` function; however, they require the calculation of a pseudo-sample size for each heritability estimate, when in reality no such sample size exists for an inherent proportion, or at least not in the way it does for binomial proportions. To bypass this, we can estimate a rough binomial *n* using a rearrangement of the following formula for the expected sampling variance of a proportion:
$$SE(p) = \sqrt{\frac{p(1-p)}{n}}$$
thus:
$$ n_{estimate} \approx \frac{p(1-p)}{SE(p)^2}$$
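For instance, a hypothetical estimate of $p = 0.3$ reported with $SE = 0.1$ implies a pseudo-sample size of roughly 21 (a minimal sketch with illustrative values, not drawn from our dataset):
```{r, eval=FALSE}
# Back-calculate a rough binomial n from a proportion and its SE
# by rearranging SE(p) = sqrt(p * (1 - p) / n):
p  <- 0.3  # hypothetical heritability estimate
se <- 0.1  # hypothetical reported standard error
n.pseudo <- p * (1 - p) / se^2
n.pseudo   # ~21
```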
We can then directly transform all point estimates and *SE*s to other scales. Here, we will only cover the formulae for two of the methods outlined in Wang (2018): the logit-Delta and double arcsine transforms.
### Logit-Delta transform
The logit point-estimate transform is simply the logit of the proportion $p$,
$p_{logit} = \ln\left( \frac{p}{1-p} \right)$, with the sampling variance:
$$ SE(p_{logit})^2 = \frac{1}{n_{est} \, p}+ \frac{1}{n_{est} \, (1-p)} $$
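As a sanity check, the logit-Delta values produced by `metafor::escalc()` can be reproduced by hand (a minimal sketch using the illustrative $p$ and pseudo-$n$ from above, not values from our data):
```{r, eval=FALSE}
library(metafor)
p <- 0.3; n <- 21                # illustrative proportion and pseudo-n
es <- escalc(measure = "PLO", xi = p * n, ni = n)
es$yi                            # logit point estimate
log(p / (1 - p))                 # same, by hand
es$vi                            # sampling variance
1 / (n * p) + 1 / (n * (1 - p))  # same, by hand
```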
This transform has the known caveat of variance instability, such that studies closer to 0.5 may have undue weight placed upon their estimates due to the narrowing of sampling variance (Barendregt et al. 2013; Hamza et al. 2008). Additionally, extreme estimates close to 0 or 1 become undefined upon logit transformation.
However, both of these problems can be solved using the variance-stabilizing double arcsine transform (Wang 2018).
### Double-arcsine transform
$$ p_{double-arcsine} = \frac{1}{2} \left( sin^{-1} \sqrt{\frac{np}{n+1}}+sin^{-1} \sqrt{\frac{np+1}{n+1}} \right) $$
And sampling variance:
$$ SE(p_{double-arcsine})^2 = \frac{1}{4n+2} $$
However, the double-arcsine transformation has no tractable back-transformation method (Wang 2018), and back-estimates have been criticized for producing nonsensical *SE* values on the original scale.
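That said, `metafor` does implement an approximate inverse for individual proportions (a minimal sketch continuing the illustrative $p$ and pseudo-$n$ from above; `transf.ipft()` is metafor's inverse Freeman-Tukey transform for a single proportion with known $n$):
```{r, eval=FALSE}
library(metafor)
p <- 0.3; n <- 21            # illustrative values
es <- escalc(measure = "PFT", xi = p * n, ni = n, add = 0)
es$yi                        # double-arcsine transformed proportion
es$vi                        # sampling variance: 1 / (4 * n + 2)
transf.ipft(es$yi, ni = n)   # approximate back-transform, ~0.3
```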
The following advice was provided by Lipsey and Wilson (2001) and Viechtbauer (2010) regarding the transforms: proportions between 0.2 and 0.8 can be analyzed without transformation; proportions lying outside the 0.2-0.8 range can be transformed via logit; and the double arcsine transform can be used when the sample size is small or when proportions are even more extreme.
## CI-transformations
SE-transformations were developed for binomial proportions, not for continuous quantities constrained to fall between 0 and 1, like heritability, and so they require the back-calculation of a pseudo-sample size. However, instead of converting all sampling variances to *SE*, we can instead go in the opposite direction and change *SE*s to 95% *CI*s, transforming these upper and lower interval boundaries directly. We used 95% CIs instead of other intervals because all papers using the `MCMCglmm` package and animal model ubiquitously reported 95% Bayesian credible intervals.
Then, once the *CI*s are on the transformed scale, we simply back-calculate *SE* from the original equation for 95% *CI*s using half the range:
$$ SE(p_{trans}) \simeq \frac{\frac{1}{2} range(95\% \, CI_{trans})}{t^*} $$
where $t^*=1.98$ for large sample sizes (e.g., for n=100, `qt(0.975, 100)` = 1.98). In this way, we can transform values to new scales using many different common transformations, such as the log+1, logit, arcsine, arcsine-square-root, and square-root+1 transformations. The log+1 and square-root+1 transforms add 1 because some of the lower confidence bounds fall below 0.
The transformation for each is applied directly to the point estimates and lower and upper *CI* bounds using the respective R functions: `log(x+1)`, `qlogis(x)`, `asin(x)`, `asin(sqrt(x))`, and `sqrt(x+1)`.
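For example, on the logit scale, the transformation and back-calculated *SE* look as follows (a minimal sketch with illustrative numbers, following the half-range formula above):
```{r, eval=FALSE}
# Transform a point estimate and its 95% CI to the logit scale, then
# back-calculate the transformed-scale SE as half the CI range / t*:
val    <- 0.40            # illustrative point estimate
ci     <- c(0.15, 0.70)   # illustrative 95% CI bounds
t.star <- qt(0.975, 100)  # ~1.98 for large n
val.logit <- qlogis(val)
se.logit  <- (qlogis(ci[2]) - qlogis(ci[1])) / (2 * t.star)
c(val.logit, se.logit)
```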
## Calculating each transformation
Next, we calculate each of the above-mentioned transforms, comparing only well-behaved proportions with upper and lower *CI* bounds between 0.001-0.999 exclusive.
```{r conduct transformations, warning=F}
#### Transformations on SE (+impute sample n) ####
# calculate SEs from BCIs, call them se.t
data <- data %>% mutate(
se.t = case_when(
sv.type == "bci" ~ (bci.upr - bci.lwr) / (2*qt(0.975, 100)),
sv.type == "se" ~ se,
TRUE ~ NA_real_),
n.pseudo = case_when( # n = p(1-p)/(se^2)
sv.type != "none" ~ (val*(1-val))/(se.t^2),
# sampling variance of a population's proportion is:
# SV = SE^2 = p*(1-p)/n; (Lipsey and Wilson 2001)
TRUE ~ NA_real_)) %>%
# transform using log transformed proportion
escalc(xi=val*n.pseudo, ni=n.pseudo, data=., measure="PLN") %>%
rename(val.logse = yi, vi.logse = vi) %>%
mutate(se.logse = sqrt(vi.logse)) %>%
# transform using logit-delta on SE
escalc(xi=val*n.pseudo, ni=n.pseudo, data=., measure="PLO") %>%
rename(val.logitse = yi, vi.logitse = vi) %>%
mutate(se.logitse = sqrt(vi.logitse)) %>%
# transform using arcsine-sqrt on SE
escalc(xi=val*n.pseudo, ni=n.pseudo, data=., measure="PAS") %>%
rename(val.arcsqrtse = yi, vi.arcsqrtse = vi) %>%
mutate(se.arcsqrtse = sqrt(vi.arcsqrtse)) %>%
# transform using Freeman-Tukey double arcsine on SE
escalc(xi=val*n.pseudo, ni=n.pseudo, data=., measure="PFT", add=0) %>%
rename(val.doubarcse = yi, vi.doubarcse = vi) %>%
mutate(se.doubarcse = sqrt(vi.doubarcse)) %>%
select(-c(vi.logse, vi.logitse, vi.arcsqrtse, vi.doubarcse))
#### Transformations on CI ####
## Four steps for all transformations:
# [1] Directly convert point estimates;
# [2] Transform SEs to 95% CIs;
# [3] if BCI/CIs not outside of 0.001-0.999 range (i.e. valid)...
# (extreme 95% CI values often happen by chance when converting SE to
# 95% CI, so exclude them and use half-range to avoid inflating SE)
# [4] Transform CIs, get SE on transformed scale as half the 95%CI range
# divided by 1.98 for large sample sizes (n=100)
#
dat <- data %>%
mutate(
val.logitci = qlogis(val), # [1]
val.logci = log1p(val),
val.arcci = asin(val),
val.arcsqrtci = asin(sqrt(val)),
val.sqrtci = sqrt(val+1),
ci.lwr.t = case_when( # [2]
sv.type == "bci" ~ bci.lwr,
sv.type == "se" ~ val-qt(0.975, 100)*se,
TRUE ~ NA_real_),
ci.upr.t = case_when( # [2]
sv.type == "bci" ~ bci.upr,
sv.type == "se" ~ val+qt(0.975, 100)*se,
TRUE ~ NA_real_),
ci.lwr.logit.valid = ifelse(sv.type != "none" & ci.lwr.t > 0.001, T, F),
ci.upr.logit.valid = ifelse(sv.type != "none" & ci.upr.t < 0.999, T, F)) %>%
filter(ci.lwr.logit.valid==TRUE & ci.upr.logit.valid==TRUE) %>%
mutate(
se.logci = (log1p(ci.upr.t) - log1p(ci.lwr.t)) / (2*qt(0.975, 100)),# [4] log+1
se.logitci = (qlogis(ci.upr.t) - qlogis(ci.lwr.t)) / (2*qt(0.975, 100)), # [4] logit
se.arcci = (asin(ci.upr.t) - asin(ci.lwr.t)) / (2*qt(0.975, 100)), # [4] arcsine
se.arcsqrtci = (asin(sqrt(ci.upr.t)) - asin(sqrt(ci.lwr.t))) / (2*qt(0.975, 100)), # [4] arcsine-sqrt
se.sqrtci = (sqrt(ci.upr.t + 1) - sqrt(ci.lwr.t + 1)) / (2*qt(0.975, 100)) # [4] sqrt+1
) %>%
select(-c( ci.lwr.logit.valid, ci.upr.logit.valid))
# assign("last.warning", NULL, envir = baseenv())
# create a new data frame to manipulate:
# (and avoid gamete contribution trait, which contains only a single h2)
data.t <- dat %>% filter(trait != "gamete contribution")
```
# Plotting the different transformations
Next, we compare each of the eight transformations via plotting. We produce three main diagnostic plots, namely:
1. SE of original vs. transformed SE (should scale roughly linearly)
2. QQ-plots from initial model fits (transformed_y ~ h2 type * trait type)
3. Coefficient plots comparing relative parameter estimates for initial model fits
## Original vs. transformed SE
```{r, echo=F, fig.width=15, fig.height = 15}
#### SE original vs. transformed ####
# create a new subset of data with intact CIs
se.names <- data.t %>% select(starts_with("se.")) %>% select(-se.t) %>% names
se.titles <- gsub("se.",x=se.names,"") %>%
gsub("se",x=.," on SE") %>% gsub("ci",x=.," on CI")
p <- list()
h <- list()
g <- list()
for (i in 1:length(se.names)) {
x <- data.t %>% rename(se.y = se.names[i]) %>%
filter(!is.na(se.y))
# plot the original vs. transformed SE values
p[[i]] <-
ggplot(x, aes(x=se.t, y=se.y)) +
labs(y="transformed SE", x="original SE", title=se.titles[i]) +
geom_smooth(method="lm", col="red", se=F) +
geom_smooth(method="loess", col="blue", se=F) +
geom_point()
# create a histogram of residuals
h[[i]] <- lm(se.y ~ se.t, data=x) %>% resid() %>% qplot(bins=10) +
labs(x="Residual values", y="Freq") +
geom_vline(xintercept=0, linetype="dashed") +
theme(panel.background = element_rect(fill = NULL),
plot.background=element_rect(fill=NULL,
color="black",size = 1)) +
# geom_density() +
scale_x_continuous(breaks=c(-2,-1,0),
minor_breaks=c(-1.5,-0.5,0.5,1.5))
# get plot x and y range to best lay inset plot
p.yr <- ggplot_build(p[[i]])$layout$panel_params[[1]]$y.range
p.xr <- ggplot_build(p[[i]])$layout$panel_params[[1]]$x.range
p.ydist <- p.yr[2]-p.yr[1]
p.xdist <- p.xr[2]-p.xr[1]
# add inset histogram to regular plot, save as grob object
g[[i]] <- ggplotGrob(p[[i]] + annotation_custom(ggplotGrob(h[[i]]), xmin=p.xr[1], xmax=p.xr[1]+p.xdist*1/2, ymin=p.yr[1]+3/5*p.ydist, ymax=p.yr[2]))
rm(x, p.yr, p.xr, p.ydist, p.xdist)
}
# grid.arrange(grobs=p, ncol=3) # plots only the charts with no histogram graph
grid.arrange(grobs=g, ncol=3) # causes crash if it's too small a plot area
```
We can see clearly that some transformations, such as the logarithmic-CI transform, produce the most linear and predictable scaling of points. However, note that two outliers are obscured by the inset residual histogram in the top left of that panel, so it isn't perfect either.
## QQ-plots from an initial mixed meta-model
```{r, echo=F, fig.width=10, fig.height = 10}
#### Q-Q norm plots ####
# qqnorm.rma() simply does a standard qqnorm plot based on studentized residuals
# e.g.:
# m1.logse %>% rstandard() %>% as.data.frame %>%
# ggplot(aes(sample=z)) + stat_qq() + stat_qq_line()
m.traitonly.notrans <- rma(yi=val ~ trait -1,
sei=se.t, data=data.t, method="REML", test="knha")
m.traitonly.notrans.mv <- rma.mv(yi=val ~ trait -1,
V=se.t^2, random = ~1|study/est.id, data=data.t, method="REML", tdist=T)
val.names = c("val.notrans.mv", gsub(pattern="se.", x=se.names, replacement="val."))
par(mfrow=c(3,3))
mod <- list(m.traitonly.notrans)
mod.mv <- list(m.traitonly.notrans.mv)
for (i in 2:(length(val.names))) {
x <- data.t %>% rename(val.y = val.names[i],
se.y = se.names[i-1] ) %>%
filter(!is.na(se.y))
mod[[i]] <- rma(yi=val.y ~ trait -1,
sei=se.y, data=x, method="REML", test="knha")
mod.mv[[i]] <- rma.mv(yi=val.y ~ trait -1,
V=se.y^2, data=x, random = ~1|study/est.id, method="REML", tdist=T)
if(i == length(val.names)) break
qqnorm(mod[[i]], main=se.titles[i-1])
}
par(mfrow=c(1,1))
```
(Note: the sqrt-on-CI transform line is sensitive to the parenthesis placement in the `se.sqrtci` calculation; the 1 must be added before taking the square root, as in the chunk above.)
Similarly, the log(X+1)-CI transformation (**log on CI** above) produces transformed SE values and associated residual values that are more Gaussian-distributed relative to other transformations.
## Coefficient plots
```{r, echo=F}
#### Coefficient plot ####
# fit models and extract parameter coefficent estimates and associated SEs
coefs <- mod.mv %>% map_dfr(~coef(.) %>% as.matrix() %>% t() %>% as.data.frame)
coefs.se <- mod.mv %>% map_dfr(~.$se %>% as.matrix() %>% t() %>% as.data.frame)
colnames(coefs.se) <- colnames(coefs)
coefs.se <- coefs.se %>%
mutate(transformation=c("untransformed", se.titles)) %>%
pivot_longer(-transformation, names_to="trait", values_to="se")
coefs <- coefs %>% mutate(transformation=c("untransformed", se.titles)) %>%
pivot_longer(-transformation, names_to="trait", values_to="estimate") %>%
left_join(coefs.se, by = c("transformation", "trait")) %>%
mutate(trait = gsub(pattern="trait",x=trait,replacement="")) %>%
group_by(transformation) %>%
mutate(est.max = max(abs(estimate)),
est.rel = abs(estimate/est.max),
se.rel = se/est.max) %>%
ungroup
# create coefficient plot
coefs %>%
ggplot(aes(x=est.rel, y=trait)) +
geom_linerange(aes(xmin=est.rel - se.rel, xmax=est.rel + se.rel, col=transformation),
position = position_dodge(0.9)) +
geom_point(aes(fill=transformation), shape=21, size=2, color="white", position = position_dodge(0.9)) +
geom_hline(yintercept=seq(0.5, length(unique(coefs$trait)) + 0.5, by=1)) +
geom_vline(xintercept=c(0,1), linetype="dashed", color="grey") +
scale_y_discrete(expand=expansion(add=0.5)) +
coord_cartesian(xlim=c(-0.1,1.1)) +
guides(color = guide_legend(reverse = TRUE), fill = guide_legend(reverse = TRUE)) +
labs(y=NULL, x="Relative coefficient estimate\n=|estimate / max(estimate)|")
```
Finally, the log-CI transformation produces model coefficients that are similar to those from other transformations, including a model using untransformed proportions and their associated SEs. The SE-transform models all have fairly large parameter SEs, and thus are not very precise relative to the CI-transform models.
We therefore decided to use a log(X+C) transformation of the point estimate and confidence intervals, where C is some positive constant.
---
# Comparing transformation outcomes on statistical analysis
Next, we compare trait-only (trait type), additive (trait + stage type), and interaction (trait × stage type) models of reported heritability across each transformation, to see whether the results differ according to the transformation.
```{r, echo=F, warning=FALSE}
# data.t <- data %>% filter(trait != "gamete contribution")
#### Examine differences in model outcomes ####
m.traitonly.logse <- rma.mv(yi=val.logse ~ trait,
V=se.logse^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.additive.logse <- rma.mv(yi=val.logse ~ trait + stage,
V=se.logse^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.interaction.logse <- rma.mv(yi=val.logse ~ trait * stage,
V=se.logse^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.traitonly.logitse <- rma.mv(yi=val.logitse ~ trait,
V=se.logitse^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.additive.logitse <- rma.mv(yi=val.logitse ~ trait + stage,
V=se.logitse^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.interaction.logitse <- rma.mv(yi=val.logitse ~ trait * stage,
V=se.logitse^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.traitonly.arcsqrtse <- rma.mv(yi=val.arcsqrtse ~ trait,
V=se.arcsqrtse^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.additive.arcsqrtse <- rma.mv(yi=val.arcsqrtse ~ trait + stage,
V=se.arcsqrtse^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.interaction.arcsqrtse <- rma.mv(yi=val.arcsqrtse ~ trait * stage,
V=se.arcsqrtse^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.traitonly.doubarcse <- rma.mv(yi=val.doubarcse ~ trait,
V=se.doubarcse^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.additive.doubarcse <- rma.mv(yi=val.doubarcse ~ trait + stage,
V=se.doubarcse^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.interaction.doubarcse <- rma.mv(yi=val.doubarcse ~ trait * stage,
V=se.doubarcse^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.traitonly.logci <- rma.mv(yi=val.logci ~ trait,
V=se.logci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.additive.logci <- rma.mv(yi=val.logci ~ trait + stage,
V=se.logci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.interaction.logci <- rma.mv(yi=val.logci ~ trait * stage,
V=se.logci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.traitonly.logitci <- rma.mv(yi=val.logitci ~ trait,
V=se.logitci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.additive.logitci <- rma.mv(yi=val.logitci ~ trait + stage,
V=se.logitci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.interaction.logitci <- rma.mv(yi=val.logitci ~ trait * stage,
V=se.logitci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.traitonly.arcci <- rma.mv(yi=val.arcci ~ trait,
V=se.arcci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.additive.arcci <- rma.mv(yi=val.arcci ~ trait + stage,
V=se.arcci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.interaction.arcci <- rma.mv(yi=val.arcci ~ trait * stage,
V=se.arcci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.traitonly.arcsqrtci <- rma.mv(yi=val.arcsqrtci ~ trait,
V=se.arcsqrtci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.additive.arcsqrtci <- rma.mv(yi=val.arcsqrtci ~ trait + stage,
V=se.arcsqrtci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.interaction.arcsqrtci <- rma.mv(yi=val.arcsqrtci ~ trait * stage,
V=se.arcsqrtci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.traitonly.sqrtci <- rma.mv(yi=val.sqrtci ~ trait,
V=se.sqrtci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.additive.sqrtci <- rma.mv(yi=val.sqrtci ~ trait + stage,
V=se.sqrtci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.interaction.sqrtci <- rma.mv(yi=val.sqrtci ~ trait * stage,
V=se.sqrtci^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.traitonly.notrans <- rma.mv(yi=val ~ trait,
V=se.t^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.additive.notrans <- rma.mv(yi=val ~ trait + stage,
V=se.t^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
m.interaction.notrans <- rma.mv(yi=val ~ trait * stage,
V=se.t^2, random = ~1|study/est.id, data=data.t, method="ML", tdist=T)
```
## Standard error-transform models
```{r, results="hold"}
## Standard error-transform model comparisons:
# Log-SE transform model:
myAIC(m.traitonly.logse, m.additive.logse, m.interaction.logse) # additive model favoured
myAIC(m.traitonly.logitse, m.additive.logitse, m.interaction.logitse) # trait only favoured
myAIC(m.traitonly.arcsqrtse, m.additive.arcsqrtse, m.interaction.arcsqrtse) # trait only favoured
myAIC(m.traitonly.doubarcse, m.additive.doubarcse, m.interaction.doubarcse) # trait only favoured
```
## Confidence interval-transform models
```{r, results="hold"}
## Confidence interval-transform models (all models similar):
# Log-CI model:
myAIC(m.traitonly.logci, m.additive.logci, m.interaction.logci) # trait only favoured
myAIC(m.traitonly.logitci, m.additive.logitci, m.interaction.logitci) # trait only favoured
myAIC(m.traitonly.arcci, m.additive.arcci, m.interaction.arcci) # trait only favoured
myAIC(m.traitonly.arcsqrtci, m.additive.arcsqrtci, m.interaction.arcsqrtci) # trait only favoured
```
## Untransformed model with no imputed SE
```{r, results="hold"}
## Untransformed model with no imputed SE:
myAIC(m.traitonly.notrans, m.additive.notrans, m.interaction.notrans) # trait only favoured
```
The CI-transform models and the untransformed model all favour the trait-only model, so transforming the CIs does not affect the results relative to no transformation; only the log-SE transform changes the outcome (selecting a more complicated, additive model).
```{r, echo=F}
rm(m.traitonly.logse, m.additive.logse, m.interaction.logse)
rm(m.traitonly.logitse, m.additive.logitse, m.interaction.logitse)
rm(m.traitonly.arcsqrtse, m.additive.arcsqrtse, m.interaction.arcsqrtse)
rm(m.traitonly.doubarcse, m.additive.doubarcse, m.interaction.doubarcse)
rm(m.traitonly.logci, m.additive.logci, m.interaction.logci)
rm(m.traitonly.logitci, m.additive.logitci, m.interaction.logitci)
rm(m.traitonly.arcci, m.additive.arcci, m.interaction.arcci)
rm(m.traitonly.arcsqrtci, m.additive.arcsqrtci, m.interaction.arcsqrtci)
rm(m.traitonly.notrans, m.additive.notrans, m.interaction.notrans)
```
We decided on the log(X+1) CI-transform in the end, due to its well-behaved properties seen previously as well as its versatility: it allows the transformation of heritability estimates with a lower confidence bound as low as -1 and any upper confidence bound. However, the amount added (currently '+1', but it could be any quantity '+C') is arbitrary, so we next optimize the constant being added, C.
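The feasibility constraint is simply that every lower CI bound must exceed $-C$ (a minimal sketch with illustrative bounds, using the C = 0.2 ultimately chosen below):
```{r, eval=FALSE}
# log(X + C) is only defined for X > -C, so lower CI bounds at or
# below -C cannot be transformed and must be excluded (or imputed):
C <- 0.2
log(-0.15 + C)  # lower bound of -0.15: transformable
log(-0.25 + C)  # lower bound of -0.25: NaN (with a warning), so excluded
```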
---
# Comparing log X+C transforms
In this section, we examine the effect of varying the added constant C on the resulting log-transformation, considering how many data points would have to be excluded due to their lower CI bound being less than -C (and thus not being log-transformable).
To do so, we plot the lowest rounded value of C and the resulting transformation as 1 to 6 of the lowest CI values are excluded.
```{r plot log(X+C) transforms, echo=F, fig.width = 10, fig.height = 17}
# boxplot(data$ci.lwr.t) # remove outliers on negative end to decide cut-off of inclusion (or + value)
# boxplot(data$ci.lwr.t[c(-4,-33, -5)])
# # same for upper limit:
# boxplot(data$ci.upr.t[c(-4,-33)])
# So basically, the estimates in row -4, -33 (and maybe -5) have too large of values.
# try removing these three values, one at a time to see effect on accuracy...
data <- data %>%
mutate(
val.logitci = qlogis(val), # [1]
val.logci = log1p(val),
val.arcci = asin(val),
val.arcsqrtci = asin(sqrt(val)),
val.sqrtci = sqrt(val+1),
ci.lwr.t = case_when( # [2]
sv.type == "bci" ~ bci.lwr,
sv.type == "se" ~ val-qt(0.975, 100)*se,
TRUE ~ NA_real_),
ci.upr.t = case_when( # [2]
sv.type == "bci" ~ bci.upr,
sv.type == "se" ~ val+qt(0.975, 100)*se,
TRUE ~ NA_real_))
Z <- data$ci.lwr.t[data$ci.lwr.t < 0] %>% sort
Z <- c(-unique(floor(Z*10)/10), 0) # candidate constants C: each negative lower bound rounded down to the nearest 0.1, then negated (plus 0)
# Below, we compute log(X + Z[i]) for each candidate constant Z[i]
p <- list()
h <- list()
g <- list()
for (i in seq_along(Z)) {
x <- data %>%
mutate(
val.logpZ = log(val+Z[i]),
ci.lwr.logpX.valid = case_when( # [3]
sv.type != "none" & ci.lwr.t > -Z[i] ~ TRUE,
TRUE ~ FALSE),
se.logpZ = case_when( # [4] log(X+Z) transform
ci.lwr.logpX.valid == TRUE ~
(log(ci.upr.t+Z[i]) - log(ci.lwr.t+Z[i])) / (2*qt(0.975, 100)),
# ci.lwr.logpX.valid == FALSE ~
# (log(ci.upr.t+Z[i]) - val.logpZ)/qt(0.975, 100),
TRUE ~ NA_real_)) %>%
select(se.t, ci.lwr.logpX.valid, sv.type, se.logpZ)
mod <- x %>% filter(ci.lwr.logpX.valid==T) %>%
lm(se.logpZ ~ se.t, data=.)
x <- x %>% mutate(se.y = case_when(
ci.lwr.logpX.valid == TRUE ~ se.logpZ,
ci.lwr.logpX.valid == FALSE & sv.type != "none" ~
coef(mod)[1] + coef(mod)[2]*se.t),
n.excl = case_when(
ci.lwr.logpX.valid == FALSE & sv.type != "none" ~ TRUE,
TRUE ~ FALSE
))
# plotting values:
# plot the original vs. transformed SE values
p[[i]] <-
x %>% filter(ci.lwr.logpX.valid==T) %>%
ggplot(aes(x=se.t, y=se.y)) +
labs(y="transformed SE", x="original SE", title=paste0("log(X+",round(Z[i],2),"); n.excluded = ",sum(x$n.excl))) +
geom_abline(intercept=coef(mod)[1], slope=coef(mod)[2], linetype=2, col="red") +
# geom_smooth(method="lm", col="red", se=F) +
# geom_smooth(method="loess", col="blue", se=F) +
geom_point()
# create a histogram of residuals
h[[i]] <- lm(se.y ~ se.t, data=x, na.action=na.omit) %>% resid() %>% qplot(bins=10) +
labs(x="Residual values", y="Freq") +
geom_vline(xintercept=0, linetype="dashed") +
theme(panel.background = element_rect(fill = NULL),
plot.background=element_rect(fill=NULL,
color="black",size = 1)) +
# geom_density() +
scale_x_continuous(breaks=c(-2,-1,0),
minor_breaks=c(-1.5,-0.5,0.5,1.5))
# get plot x and y range to best lay inset plot
p.yr <- ggplot_build(p[[i]])$layout$panel_params[[1]]$y.range
p.xr <- ggplot_build(p[[i]])$layout$panel_params[[1]]$x.range
p.ydist <- p.yr[2]-p.yr[1]
p.xdist <- p.xr[2]-p.xr[1]
# add inset histogram to regular plot, save as grob object
g[[i]] <- ggplotGrob(p[[i]] + annotation_custom(ggplotGrob(h[[i]]), xmin=p.xr[1], xmax=p.xr[1]+p.xdist*1/2, ymin=p.yr[1]+3/5*p.ydist, ymax=p.yr[2]))
rm(x, p.yr, p.xr, p.ydist, p.xdist)
}
grid.arrange(grobs=g, ncol=2)
# clearly log X+0.2 is good enough, no need for more transformation. Will exclude 3 points, however.
```
Given the limited number of exclusions and the comparatively well-behaved residuals, we opt for C = 0.2 and use a log(X+0.2)-CI transformation for the remainder of our analyses.
We thus calculate new point estimates and associated SEs on the log(X+0.2) scale by transforming the original estimate and calculated CIs, then calculating the SEs from the transformed CIs. The new variables 'val.log' (transformed estimate) and 'se.log' (transformed standard error) are created below. Note: SEs for estimates with a lower CI bound below -0.2 could not be computed under a log(X+0.2) transformation; these are set to NA and require imputation later on.
```{r compute new log(X+C) estimates and SE}
data <- data %>%
mutate(
val.log = log(val+0.2),
ci.lwr.logpX.valid =
ifelse(sv.type != "none" & ci.lwr.t > -0.2, TRUE, FALSE), # [3]
se.log = case_when( # [4] log(X+0.2) transform
ci.lwr.logpX.valid == TRUE ~
(log(ci.upr.t+0.2) - log(ci.lwr.t+0.2)) / (2*qt(0.975, 100)),
# ci.lwr.logpX.valid == FALSE ~
# (log(ci.upr.t+Z[i]) - val.logpZ)/qt(0.975, 100),
TRUE ~ NA_real_))
```
---
# Imputing missing SE values
```{r, echo=F, results="hide"}
data.ts <- data %>% filter(!is.na(data$se.log))
#### Predicting missing log SE values ####
trait.means <- data.ts %>%
group_by(trait) %>%
summarise(mean.val.log = mean(val.log),
min.val.log = min(val.log),
max.val.log = max(val.log),
se.val.log = mean(se.log)) %>% ungroup
h2.means <- data.ts %>%
group_by(heritability) %>%
summarise(mean.val.log = mean(val.log),
min.val.log = min(val.log),
max.val.log = max(val.log),
se.val.log = mean(se.log)) %>% ungroup
sv.type.means <- data.ts %>%
group_by(sv.type) %>%
summarise(mean.val.log = mean(val.log),
min.val.log = min(val.log),
max.val.log = max(val.log),
se.val.log = mean(se.log)) %>% ungroup
# Exploring some models to predict log SE:
data.ts %>% lm(se.log ~ val.log + n.genotypes + sv.type + heritability +
stage, data=., na.action=na.omit) %>% summary
data.ts %>% lm(se.log ~ val.log + sv.type + heritability, data=., na.action=na.omit) %>%
summary
# But can't include heritability type, as it is one of our fixed effects
```
### Best predictors of sampling variance
We next aim to predict the missing SE values. Three values were excluded by the log(X+0.2)-CI transformation, while four values never had any reported SE or BCI estimate.
```{r}
data %>% group_by(sv.type, is.na(se.log)) %>% count
# 32 valid SEs; 3 invalid SEs (lower interval < 0.2, thus excluded)
# 56 valid BCIs; and 4 estimates with no originally-reported sampling variance
```
To decide how to impute, we fit a number of linear models; the simplest one that avoids our meta-analysis fixed effects (e.g. heritability type) is:
transformed SE ~ transformed h2 × sampling variance type (SE or BCI).
```{r}
lm(se.log ~ val.log * sv.type, data=data.ts, na.action = na.omit) %>% summary
```
```{r, echo=F}
# Plot it:
mod <- lm(se.log ~ val.log * sv.type, data=data.ts, na.action = na.omit)
# create a new dataset and predict values for it using the linear model
new.data <- expand.grid(sv.type = c("se", "bci"),
val.log = seq(log(0.06+0.2), log(0.96+0.2), by=0.01))
# predict missing values based on linear function:
new.data$se.log <- new.data %>%
predict(mod, type="response", newdata=.) %>%
unlist %>% as.numeric
# plot the data
data.ts %>%
ggplot(aes(x=val.log, y=se.log, color=sv.type)) +
geom_point() +
geom_line(data=new.data, aes(x=val.log, y=se.log, color=sv.type)) +
# scale_y_continuous(limits = c(0,1)) +
scale_x_continuous(
breaks=log(c(0.01,0.04,0.10,0.25,0.5,0.75,0.90)+0.2),
labels = c(0.01,0.04,0.10,0.25,0.5,0.75,0.90)) +
labs(x="Heritability estimate", y=expression("Log(X+0.2)-scale SE (two outliers above are cut off)"))
```
Plotting this relationship, we see that the transformed SEs for studies that reported SE (often via the ANOVA method of calculating $H^2$/$h^2$) are generally higher than the transformed SEs from studies originally reporting BCIs (often via the Bayesian MCMCglmm method, which tends to be more precise).
Additionally, the overall tendency is for precision to increase at larger values of $H^2$/$h^2$.
### Quantile regression
Since most of the missing sampling variance values were never reported (rather than excluded), we don't know their original sampling variance type and so cannot estimate them from the model above. Additionally, they may have been omitted precisely because of their relatively high imprecision. **Thus, to be conservative and reduce the impact of imputed SEs, we impute the transformed SEs using a 95% quantile regression following the linear relationship log-SE ~ log-h2**, accounting for the decrease in transformed SE values with increasing point estimates. Doing so ensures that imputed points have relatively high SEs on the transformed scale, down-weighting their relative influence on the analysis.
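To illustrate the idea: a 95% quantile regression fits the line below which roughly 95% of points fall, i.e. an upper envelope rather than a mean trend (a minimal sketch on simulated data, not our dataset):
```{r, eval=FALSE}
library(quantreg)
set.seed(1)
toy <- data.frame(x = runif(100))
toy$y <- 1 - 0.5 * toy$x + rexp(100, rate = 10)    # noisy decreasing trend
coef(lm(y ~ x, data = toy))                        # mean trend
coef(quantreg::rq(y ~ x, tau = 0.95, data = toy))  # 95th-percentile envelope
```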
```{r impute SE values using quantile regression, echo=F}
qr.lm <- data.ts %>%
# filter(se.log<2) %>%
quantreg::rq(se.log ~ val.log, tau=0.95, data=.) %>%
coef()
data$se.log.orig <- data$se.log
# Impute new values:
data <- data %>% mutate(
imputed.se = case_when(
is.na(se.log.orig) ~ "yes",
TRUE ~ "no"
),
se.log = case_when(
is.na(se.log.orig) ~ qr.lm[1] + val.log*qr.lm[2],
!is.na(se.log.orig) ~ se.log.orig,
TRUE ~ NA_real_
))
# create a new dataset and predict values for it using the linear model
new.data <- expand.grid(sv.type = c("se", "bci"),
val.log = seq(log(0.06+0.2), log(0.96+0.2), by=0.01))
# predict missing values based on linear function:
new.data$se.log <- new.data %>%
predict(mod, type="response", newdata=.) %>%
unlist %>% as.numeric
#### Plot data with imputed se values (supplementary figures) ####
# plot the data
data %>%
ggplot(aes(x=val.log, y=se.log, color=sv.type)) +
geom_point() +
geom_line(data=new.data, aes(x=val.log, y=se.log, color=sv.type)) +
# scale_y_continuous(limits = c(0,1)) +
scale_x_continuous(
breaks=log(c(0.012,0.04,0.10,0.25,0.5,0.75,0.90)+0.2),
labels = c(0.012,0.04,0.10,0.25,0.5,0.75,0.90)) +
labs(x="Heritability estimate", y=expression("Log(X+0.2)-scale SE (two extreme outliers removed)")) +
geom_abline(intercept = qr.lm[1], slope = qr.lm[2], color="grey")
# last line is the line created by 95% quantile regression of all data
```
```{r}
qr.lm
```
The new imputed estimates (n=8) appear on the grey line determined by a 95% quantile-regression of the data, with the transformed heritability point estimate as a predictor.
We now save this final transformed dataset with imputed SEs to be used in subsequent analyses.
---
# Save final values
```{r reorganize and save final data, class.source = 'fold-show'}
# get new levels in order of each trait coefficient
final.model <- rma.mv(yi=val.log ~ trait-1, V=(se.log^2),
random = ~1|study/est.id, data=data, method="REML", tdist=T)
new.levels <- data.frame(trait = gsub("trait","", x=names(coef(final.model))),
est = coef(final.model), row.names=NULL) %>%
arrange(est) %>% pull(trait) %>% as.character()
# Factor variables, select specific columns, arrange by trait type and h2 est
d <- data %>%
mutate(trait = fct_relevel(trait, new.levels),
h2 = fct_relevel(heritability, "broad"),
stage = fct_relevel(stage, "larvae", "juvenile"),
growth.form = fct_relevel(growth.form, "corymbose"),
sv.log = se.log^2) %>%
select(study:region, stage:trait,
relatedness.info:model.type,
h2, val:bci.upr, sv.type, imputed.se,
val.log, sv.log, se.log) %>%
arrange(desc(trait), desc(val.log)) %>%
mutate(est.id = factor(row_number())) # set unique estimate ID for rma.mv()
# save new data file
data.full <- d
save(data.full, file="Data/heritability_estimates_processed.RData")
```
---
R session info:
```{r}
sessionInfo()
```