-
Notifications
You must be signed in to change notification settings - Fork 0
/
2016-2020-Arizona.R
164 lines (134 loc) · 6.41 KB
/
2016-2020-Arizona.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Analyses of Arizona iNat data re COVID19 (Temporal Autocorrelation LME)
# Kathleen L Prudic
# created 2020-07-09
# Remove wonton variables
rm(list = ls())
# Load additional packages
library(tidyverse)
library(lubridate)
library(nlme)
# Load data
arizona_spring <- read_csv(file = "data/2011-2020-iNat-Arizona.csv")
# Filter data to have months March, April, May only
arizona_spring <- arizona_spring %>%
filter(year(observed_on) %in% c(2016:2020))
# Count number of unique users and observations by county for each year
# Column names are county, year, users, observations
arizona_counts <- arizona_spring %>%
group_by(place_county_name, year(observed_on)) %>%
summarize(num_observations = n(),
num_users = length(unique(user_id)))
# Rename columns from place_county_name to county and year(observed_on) to year
arizona_counts <- arizona_counts %>%
rename(county = place_county_name,
year = "year(observed_on)")
# Log transform counts (base 10)
arizona_counts <- arizona_counts %>%
mutate(log_num_observations = log10(num_observations),
log_num_users = log10(num_users))
# Plot number of observations
obs_plot <- ggplot(data = arizona_counts, mapping = aes(x = year,
y = log_num_observations,
group = county,
color = county)) +
geom_line()
obs_plot
# Plot number of users
users_plot <- ggplot(data = arizona_counts, mapping = aes (x = year,
y = log_num_users,
group = county,
color = county)) +
geom_line()
# geom_point() +
# geom_smooth(method = "lm", se = FALSE)
users_plot
# Create column for pre and post COVID19 assignment
arizona_counts <- arizona_counts %>%
mutate(COVID19 = if_else(year > 2019, true = "post", false = "pre"))
# Relevel COVID19 column
arizona_counts$COVID19 <- factor(arizona_counts$COVID19,
levels = c("pre", "post"))
################################################################################
# Mixed effects models testing COVID on observations and unique users
# OBSERVATIONS
# Random intercept for county
obs_model_intercept <- lme(log_num_observations ~ COVID19,
data = arizona_counts,
corr = corARMA(form = ~1|county/year, q = 2),
random = ~1|county)
obs_model_intercept
summary(obs_model_intercept)
# Random intercept and slope for county
obs_model_slope <- lme(log_num_observations ~ COVID19,
data = arizona_counts,
corr = corARMA(form = ~1|county/year, q = 2),
random = ~1 + year|county)
obs_model_slope
summary(obs_model_slope)
# Compare intercept model to intercept + slope model
anova(obs_model_intercept, obs_model_slope)
# Model df AIC BIC logLik Test L.Ratio p-value
# obs_model_intercept 1 6 114.4906 128.1506 -51.24530
# obs_model_slope 2 8 118.4795 136.6928 -51.23974 1 vs 2 0.01110932 0.9945
# USERS
# Random intercept for county
users_model_intercept <- lme(log_num_users ~ COVID19,
data = arizona_counts,
corr = corARMA(form = ~1|county/year, q = 2),
random = ~1|county)
users_model_intercept
summary(users_model_intercept)
# Random intercept and slope for county
users_model_slope <- lme(log_num_users ~ COVID19,
data = arizona_counts,
corr = corARMA(form = ~1|county/year, q = 2),
random = ~1 + year|county,
control = lmeControl(msMaxIter = 200,
msMaxEval = 500))
users_model_slope
summary(users_model_slope)
# Compare intercept model to intercept + slope model
anova(users_model_intercept, users_model_slope)
# Model df AIC BIC logLik Test L.Ratio p-value
# users_model_intercept 1 6 89.83231 103.4923 -38.91615
# users_model_slope 2 8 93.82860 112.0419 -38.91430 1 vs 2 0.00370131 0.9982
################################################################################
# Not sure about temporal autocorrelation results. What happens when we compare
# slopes including and excluding 2020 data? If there is growth post-covid, then
# we expect model slope with 2020 data to be greater than model slope without
# 2020 data. We think
# Linear regression on all data using time (years, Mar-May) to predict number
# of observations (log 10) and number of unique users (log 10)(mixed effects
# model, county = random)
# Random intercept for county
obs_reality_intercept <- lme(log_num_observations ~ year,
data = arizona_counts,
random = ~1|county)
obs_reality_intercept
summary(obs_reality_intercept)
# Extract slope for effect of time
obs_reality_intercept$coefficients$fixed
# Create new dataframe with no 2020 data (Mar-May, 2016-2019)
arizona_counts_no2020 <- arizona_counts %>%
filter(year %in% c(2016:2019))
# Linear regression on no 2020 data using time to predict number of observations
# and number of users (both log 10)(mixed effects model, county = random)
obs_reality_no2020_intercept <- lme(log_num_observations ~ year,
data = arizona_counts_no2020,
random = ~1|county)
obs_reality_no2020_intercept
summary(obs_reality_no2020_intercept)
# Extract slope for effect of time
obs_reality_no2020_intercept$coefficients$fixed
# Extract slopes for two models for eyeball comparison and reality check
obs_reality_intercept$coefficients$fixed
obs_reality_no2020_intercept$coefficients$fixed
plot(x=arizona_counts$year, y=arizona_counts$log_num_observations)
abline(a = obs_reality_intercept$coefficients$fixed[1],
b = obs_reality_intercept$coefficients$fixed[2])
abline(a = obs_reality_no2020_intercept$coefficients$fixed[1],
b = obs_reality_no2020_intercept$coefficients$fixed[2],
lty = 2)
# Looks like slope is smaller when including 2020 data. Sigh. Not sure about
# temporal autocorrelation. Nope nope nope, not sure at all