-
Notifications
You must be signed in to change notification settings - Fork 1
/
00-get-minpair-data.Rmd
159 lines (133 loc) · 4.51 KB
/
00-get-minpair-data.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
---
title: "Download data from the database"
output: rmarkdown::github_document
author: Tristan Mahr
date: "`r Sys.Date()`"
---
```{r setup, include = FALSE}
# Chunk defaults for the knitted document: echo the code, collapse source and
# output into one block with a "#>" output prefix, and write figure files
# under the "figs/00-" prefix.
knitr::opts_chunk$set(
  echo = TRUE,
  collapse = TRUE,
  comment = "#>",
  fig.path = "figs/00-",
  fig.width = 6,
  fig.asp = 0.62
)
```
This script downloads response-level data from the minimal pairs
experiment from timepoint 1 of the longitudinal study.
The data-set includes only trials from the most frequently used
item-pairs. (There was some variation in the item counts and item sets
in the initial versions of the task, so we elect to keep the most
frequent items.) We exclude a participant with too few items from their
administration of the experiment. (Perhaps a crashed or aborted
experiment.) We exclude a participant who apparently chose the wrong
item on every trial. We also include EVT-2, PPVT-4 and GFTA-2 scores, as
these are useful covariates.
Connect and download the scores
------------------------------------------------------------------------
Connect to the UMN database using the L2TDatabase package.
<https://github.com/LearningToTalk/L2TDatabase>
```{r}
# Download test scores from the database
library(L2TDatabase)
library(dplyr)
library(ggplot2)
# Connect to database. This assumes the L2T Database configuration file
# is in the user's home folder.
db_config_file <- path.expand("~/l2t_db.cnf")
l2t <- l2t_connect(db_config_file, "backend")
# Get the timepoint 1 minimal pair responses and vocabulary scores. (I'm
# using the backend database because I wrote the first version of this
# script using it.)
# NOTE: none of the joins below passes `by`, so each one matches on whatever
# column names the two tables have in common. The order of the joins
# therefore matters.
min_pair_raw <- tbl(l2t, "MinPair_Admin") %>%
left_join(tbl(l2t, "MinPair_Responses")) %>%
left_join(tbl(l2t, "ChildStudy")) %>%
left_join(tbl(l2t, "Child")) %>%
left_join(tbl(l2t, "Study")) %>%
left_join(tbl(l2t, "EVT")) %>%
left_join(tbl(l2t, "PPVT")) %>%
left_join(tbl(l2t, "GFTA")) %>%
# No practice trials
filter(Study == "TimePoint1", Running == "Test") %>%
# Keep the study name, the participant ID (ShortResearchID, renamed to
# ResearchID), the Female:CImplant and MinPair_Dialect:Correct column
# ranges, and the age and score columns for PPVT, EVT and GFTA.
select(
Study, ResearchID = ShortResearchID, Female:CImplant,
MinPair_Age, MinPair_Dialect:Correct,
PPVT_Age, PPVT_Raw:PPVT_GSV,
EVT_Age, EVT_Raw:EVT_GSV,
GFTA_Age, GFTA_RawCorrect:GFTA_Standard) %>%
# Drop the completion, notes, timestamp and response-ID columns that were
# picked up by the ranges above.
select(
-MinPair_Completion, -MinPair_Notes,
-MinPair_Admin_Timestamp, -ResponseID) %>%
# Run the query and bring all rows (no row limit) into a local data frame.
collect(n = Inf)
# Print the downloaded data
min_pair_raw
```
Screen the data
------------------------------------------------------------------------
```{r}
# Build an order-independent label for a pair of words: put the two words in
# sorted order and join them with a hyphen.
# make_word_pair("dog", "cat") -> "cat-dog"
make_word_pair <- function(x, y) {
  ordered_words <- sort(c(x, y))
  paste(ordered_words, collapse = "-")
}
# Label every row with its word pair. make_word_pair() handles one pair at a
# time, so it is applied row by row over Item1 and Item2.
min_pair_raw <- mutate(
  min_pair_raw,
  WordPair = purrr::map2_chr(Item1, Item2, make_word_pair)
)
# Keep only the most common word-pairs: those with more than 300 responses.
# (With ~190 children, these are the pairs used in every administration.)
frequent_items <- min_pair_raw %>%
  count(WordPair) %>%
  filter(n > 300) %>%
  # Show the retained pairs and their counts before dropping the count column
  print() %>%
  select(WordPair)
# Who is missing the common items? Count each child's trials on the frequent
# word-pairs and keep (and print) any child whose count is not 30.
too_few_trials <- min_pair_raw %>%
  # Join key stated explicitly: frequent_items only carries WordPair, and
  # naming it keeps the join from silently matching on other shared columns.
  inner_join(frequent_items, by = "WordPair") %>%
  count(ResearchID) %>%
  filter(n != 30) %>%
  print()
# Who is that child who responded with the opposite answer? Flag any child
# whose proportion correct, over all of their rows in min_pair_raw, is
# below .15.
impossibly_low <- min_pair_raw %>%
  group_by(ResearchID) %>%
  summarise(PropCorrect = mean(Correct)) %>%
  filter(PropCorrect < 0.15)

# For reference: the probability of 2 or fewer successes in 30 trials when
# each trial succeeds with probability .5.
pbinom(q = 2, size = 30, prob = 0.5)
# Responses to analyze: trials on the frequent word-pairs, minus the children
# flagged in too_few_trials, restricted to rows where CImplant is 0.
# Join keys are stated explicitly so that the joins cannot silently match on
# some other shared column name (too_few_trials also carries a count column
# `n`).
freq_items <- min_pair_raw %>%
  inner_join(frequent_items, by = "WordPair") %>%
  anti_join(too_few_trials, by = "ResearchID") %>%
  filter(CImplant == 0)

# Per-child proportion correct and number of trials on the retained items
prop_correct <- freq_items %>%
  group_by(ResearchID) %>%
  summarise(MeanCorrect = mean(Correct), n = n())
```
Make some plots
------------------------------------------------------------------------
```{r minp-raw-histogram-with-outlier}
# Histogram of each child's proportion correct, with the low-scoring child
# still included and called out by a label.
p1 <- ggplot(prop_correct, aes(x = MeanCorrect)) +
  # Bins are .05 wide and aligned so that one bin is centered on .525
  geom_histogram(binwidth = 0.05, center = 0.525, color = "white") +
  labs(
    x = "Observed proportion of items correct",
    y = "N participants"
  ) +
  theme_grey(base_size = 16) +
  annotate(
    "label",
    x = 0.15, y = 5, size = 4.5,
    label = "Chose opposite\nitem on purpose"
  )
p1
```
```{r min-raw-histogram}
# Same histogram as above, after removing the children in impossibly_low.
# The anti-join key is named explicitly so that the exclusion matches on
# ResearchID only, rather than on whatever columns the two tables share.
p2 <- ggplot(prop_correct %>% anti_join(impossibly_low, by = "ResearchID")) +
  aes(x = MeanCorrect) +
  # Bins are .05 wide and aligned so that one bin is centered on .525
  geom_histogram(binwidth = .05, center = .525, color = "white") +
  xlab("Observed proportion of items correct") +
  ylab("N participants") +
  theme_grey(base_size = 16)
p2
```
Save data (without outlier)
------------------------------------------------------------------------
```{r}
# Write the retained responses to disk, leaving out the children in
# impossibly_low. The anti-join key is named explicitly so that the exclusion
# matches on ResearchID only.
freq_items %>%
  anti_join(impossibly_low, by = "ResearchID") %>%
  readr::write_csv("./data/raw-minimal-pairs-responses.csv")
```