-
Notifications
You must be signed in to change notification settings - Fork 0
/
2018-2019-iNat-NorthAmerica.R
137 lines (119 loc) · 5.47 KB
/
2018-2019-iNat-NorthAmerica.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Analyses of North America iNaturalist data re COVID19 (%change by state/province)
# Kathleen L Prudic
# created 2020-07-09
# DOI: https://doi.org/10.15468/dl.4prgpy
# Remove wonton variables
rm(list = ls())
# Load additional packages
library(tidyverse)
library(lubridate)
library(data.table)
# Load data large dataset have to use data.table, ugh
# Despite the file extension (.csv) it is a tab delimited file, GBIF!
global_obs <- fread(file = "data/2018-2020-iNaturalist-Global.csv")
# Select columns only related to analyses
cols_keep <- c("year", "month", "countryCode", "stateProvince",
"occurrenceID", "recordedBy")
# DT[, keep, with = FALSE]
global_obs <- global_obs[, cols_keep, with = FALSE]
# Convert to a tibble, not a tribble
global_obs_tbl <- as_tibble(global_obs)
# Filter to CAN, MEX, USA data only
# DT[V2 > 5]
noram_obs <- global_obs_tbl %>%
filter(countryCode %in% c("CA", "MX", "US"))
# Count number of unique users and observations by state/provience for 2018, 2019, 2020
# Column names are year, month, countryCode, stateProvince, occurrenceID, recordedBy
noram_counts <- noram_obs %>%
filter(nchar(stateProvince) > 0) %>%
group_by(stateProvince, year) %>%
summarize(num_observations = n(),
num_users = length(unique(recordedBy)))
# Plot number of observations
noram_obs_plot <- ggplot(data = noram_counts, mapping = aes(x = year,
y = num_observations,
group = stateProvince,
color = stateProvince)) +
geom_line() +
scale_y_log10() +
theme(legend.position = "none")
noram_obs_plot
# Calculate percent change in observations 2018-2019 and 2019-2020
# by spring (Mar-May) and state/province
# ((observations in year t/ observations in year t-1) -1) * 100
noram_obs_change <- noram_counts %>%
mutate(prev_obs = lag(num_observations)) %>%
filter(!is.na(prev_obs)) %>%
mutate(per_change_obs = (((num_observations/prev_obs)-1) * 100)) %>%
select(-prev_obs) %>%
mutate(comparison = if_else(year == 2019,
true = "2018-2019",
false = "2019-2020"))
# Plot percent change in observations
change_noram_obs_plot <- ggplot(data = noram_obs_change,
mapping = aes(x = comparison,
y = per_change_obs,
group = stateProvince,
color = stateProvince)) +
geom_line() +
theme(legend.position = "none")
change_noram_obs_plot
# Plot number of users
noram_users_plot <- ggplot(data = noram_counts, mapping = aes (x = year,
y = num_users,
group = stateProvince,
color = stateProvince)) +
geom_line() +
scale_y_log10() +
theme(legend.position = "none")
noram_users_plot
# Calculate percent change in users 2018-2019 and 2019-2020
# by spring (Mar-May) and nation
noram_users_change <- noram_counts %>%
mutate(prev_users = lag(num_users)) %>%
filter(!is.na(prev_users)) %>%
mutate(per_change_users = (((num_users/prev_users)-1) * 100)) %>%
select(-prev_users) %>%
mutate(comparison = if_else(year == 2019,
true = "2018-2019",
false = "2019-2020"))
# Plot percent change in users
change_noram_users_plot <- ggplot(data = noram_users_change,
mapping = aes(x = comparison,
y = per_change_users,
group = stateProvince,
color = stateProvince)) +
geom_line() +
theme(legend.position = "none")
change_noram_users_plot
# Create dataframe for observations t test
noram_obs_ttest <- noram_obs_change %>%
select(stateProvince, year, per_change_obs) %>%
pivot_wider(names_from = year, values_from = per_change_obs) %>%
rename(`2018-2019` = `2019`,
`2019-2020` = `2020`)
# t test on change in growth in unique observations
noram_obs_ttest_list <- t.test(x = noram_obs_ttest$`2018-2019`,
y = noram_obs_ttest$`2019-2020`,
alternative = "greater",
paired = TRUE)
noram_obs_ttest_list
cat(round(noram_obs_ttest_list$estimate, digits = 6), " (",
paste0(round(as.numeric(noram_obs_ttest_list$conf.int), digits = 4), collapse = ", "),
")", sep = "")
# Create dataframe for observers t test
noram_users_ttest <- noram_users_change %>%
select(stateProvince, year, per_change_users) %>%
pivot_wider(names_from = year, values_from = per_change_users) %>%
rename(`2018-2019` = `2019`,
`2019-2020` = `2020`)
# t test on change in growth in unique observers
noram_users_ttest_list <- t.test(x = noram_users_ttest$`2018-2019`,
y = noram_users_ttest$`2019-2020`,
alternative = "greater",
paired = TRUE)
noram_users_ttest_list
cat(round(noram_users_ttest_list$estimate, digits = 5), " (",
paste0(round(as.numeric(noram_users_ttest_list$conf.int), digits = 4), collapse = ", "),
")", sep = "")