-
Notifications
You must be signed in to change notification settings - Fork 0
/
2015-2020-eBird-USA-dataprocessing.R
88 lines (70 loc) · 2.29 KB
/
2015-2020-eBird-USA-dataprocessing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# USA eBird data processing script
# Kathleen L Prudic
# created 2020-08-25
# The workspace is deliberately left untouched: run this script in a fresh R
# session instead of clearing objects with rm(list = ls()), which would also
# destroy anything the caller had loaded before sourcing this file.

# Load additional packages
library(tidyverse)
library(lubridate)

# Raw eBird download; tab-separated, read below with read_tsv()
ebird_file <- "~/Downloads/eBird/Data/ebd_US_relJul-2020.txt"

# Fail early, with a clear message, if the download is not where we expect it
if (!file.exists(path.expand(ebird_file))) {
  stop("eBird input file not found: ", ebird_file, call. = FALSE)
}

# Establish output file names
ebird_outfile <- "data/2015-2020-ebird-USA.csv"

# Read the header row plus a single row of data so that readr can guess a
# type for every column (dates as dates, numbers as numbers). Only the column
# names are kept; they are re-applied to each headerless chunk read later on.
ebird_header <- read_tsv(file = ebird_file, n_max = 1, col_types = cols())
ebird_colnames <- colnames(ebird_header)

# Set to FALSE to write the selected columns without the month / year / state
# filters
do_filter <- TRUE
# Read the data 1000 lines at a time, keep the columns and rows of interest,
# and append each chunk to the output file.
#
# NOTE(review): a text file cannot be entered at an arbitrary line, so every
# read_tsv() call has to pass over all earlier lines again to honour `skip`;
# total work therefore grows quadratically with the number of chunks. The cap
# of 10000 chunks also means at most the first 10 million lines are
# processed. readr::read_tsv_chunked() streams a file in one pass and may be
# a better fit if the whole file is meant to be covered.
max_rows <- 1000
max_chunks <- 10000

# States to keep when filtering; built once rather than on every iteration
us_states <- c(datasets::state.name, "District of Columbia")

# Make sure the output directory exists before the first write
dir.create(dirname(ebird_outfile), showWarnings = FALSE, recursive = TRUE)

for (i in seq_len(max_chunks)) {
  # Line 1 of the file is the header, so the first chunk starts at line 2 and
  # is one row shorter; later chunks start right after a multiple of max_rows
  if (i == 1) {
    start <- 2
    n_rows <- max_rows - 1
  } else {
    start <- (i - 1) * max_rows + 1
    n_rows <- max_rows
  }

  # Use readr::read_tsv; the header is skipped, so names are assigned below
  ebird_rows <- read_tsv(file = ebird_file,
                         skip = start - 1,
                         n_max = n_rows,
                         col_names = FALSE,
                         col_types = cols())

  # Stop once the file is exhausted: an empty read has no columns to rename,
  # so assigning the column names would fail
  if (nrow(ebird_rows) == 0) {
    break
  }

  # Assign column names
  colnames(ebird_rows) <- ebird_colnames

  # Progress report every tenth chunk
  if (i %% 10 == 0) {
    cat("iteration ", i, " start: ", start, "\n")
  }

  # Select columns of interest
  ebird_data <- ebird_rows %>%
    select(`GLOBAL UNIQUE IDENTIFIER`, `OBSERVATION DATE`, `OBSERVER ID`,
           `SCIENTIFIC NAME`, `COUNTRY CODE`, STATE, LATITUDE, LONGITUDE)

  if (do_filter) {
    # Keep March-June observations from the years of interest made in a US
    # state or the District of Columbia.
    # NOTE(review): the output file name says 2015-2020 but the year filter
    # keeps 2012-2020 - confirm which range is intended.
    ebird_data <- ebird_data %>%
      filter(month(`OBSERVATION DATE`) %in% 3:6,
             year(`OBSERVATION DATE`) %in% 2012:2020,
             STATE %in% us_states)
  }

  # Write to file: the first chunk creates the file (with a header row), all
  # later chunks are appended. The file is passed by position because the
  # argument was renamed from `path` to `file` between readr releases.
  write_csv(ebird_data, ebird_outfile, append = i > 1)
}
# Exploratory probe: jump far into the file to look for rows dated after 2015.
# Reads `max_rows` lines after skipping the first 2e8, then labels the result
# with the column names captured from the header row.
ebird_rows <- read_tsv(
  file = ebird_file,
  skip = 2e8,
  n_max = max_rows,
  col_names = FALSE,
  col_types = cols()
) %>%
  setNames(ebird_colnames)