-
Notifications
You must be signed in to change notification settings - Fork 1
/
lighting.R
217 lines (171 loc) · 9.52 KB
/
lighting.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# How does Roadway Lighting Condition Affect Safety?
# 2 May 2020
# Author: Mark Egge ([email protected])
# DATA TRANSFORMATION -------------------------------------------------------------------
# Read in PA Crash CSVs for 2012 – 2016
# Downloaded from https://crashinfo.penndot.gov/PCIT/welcome.html
library(data.table) # fast data manipulation
crashes <- list()
for(year in as.character(2012:2016)) {
file_name <- paste0("data/Statewide_", year, "/CRASH_", year, "_Statewide.csv")
crashes[[year]] <- data.table::fread(file_name)
}
crashes <- rbindlist(crashes) # join the five files into one
# Filter to only crashes with "2 - Dark - No Street Lights" or
# "3 - Dark - Street Lights"
# See doc/PA_Crashes_Data_Dictionary.pdf for details
crashes <- crashes[ILLUMINATION %in% c(2, 3)]
crashes[ILLUMINATION == 2, crash_lighting := "unlit"]
crashes[ILLUMINATION == 3, crash_lighting := "lit"]
# GIS --------------------------------------------------------------------------
# read in shapefile of Pennsylvania freeway road segments based on HPMS 2018
# Source: https://geo.dot.gov/server/rest/services/Hosted/Pennsylvania_2018_PR/FeatureServer/0
library(sf) # spatial data manipulation
segments <- sf::st_read("shp/traffic/traffic.shp")
segments$id <- 1:nrow(segments) # assign a unique identifier for each segment
#' SPATIAL JOIN TO CALCULATE CRASH COUNTS AND RATES ----------------------------
#' Spatially join crashes to road segments, count the crashes, and join back
# Create a sf spatial object from crashes using the the crash data lat/lon fields.
crashes <- st_as_sf(crashes, coords = c("DEC_LONG", "DEC_LAT"), crs = 4326, na.fail = FALSE)
# Reproject from Web Mercator lat/lon coordinates to
# PA State Plane EPSG:2272 NAD83 / Pennsylvania South (ft)
crashes <- st_transform(crashes, 2272) # reproject data to EPSG:2272
# Buffer the road segments by 50 feet
segment_buffer <- st_buffer(segments, dist = 50, endCapStyle = "FLAT")
# Spatially join the crashes to the buffered segments
joined <- st_join(segment_buffer, crashes, left = TRUE)
# TABULATE CRASHES AND IMPUTE LIGHTING
#' Our roadway segments dataset does not include information about overhead lighting.
#' We can use the data in the crash dataset (which describes the lighting condition)
#' to infer if a roadway has overhead illumination.
#'
#' The section below uses crash data to impute a "lighting" attribute for our
#' road segments.
#'
#' Where a roadway has multiple crashes, the lighting attribute is assigned
#' based on predominant value reported for associated crashes.
# Convert sf spatial object to a data.table for aggregation
segment_crashes <- as.data.table(st_drop_geometry(joined))
# count total crashes by segment id and illumination condition
counts <- segment_crashes[, .(crashes = .N), by = .(id, crash_lighting)]
# Reshape long to wide, filling missing values with zero
counts <- dcast(counts, id ~ crash_lighting, value.var = "crashes", fill = 0)
# Count annual crashes by segment
counts[, total_crashes := lit + unlit]
# Annual Crahes = total crashes divided by five years
counts[, crashes := total_crashes / 5]
# Assign lighting based on majority of crashes (lit or unlit)
counts[total_crashes > 0,
lighting := as.factor(ifelse(lit >= unlit, "lit", "unlit"))]
# Join imputed lighting conditions back to segments sf
segments <- merge(segments, counts, by = "id")
# Calculate crash rates - crashes per million VMT
segments$mvmt <- segments$DLY_VMT * 365 / 1000000 # million annual vehicle miles travelled
segments$rate <- segments$crashes / segments$mvmt # annual crashes per 1m annual vmt
# DECISION TREES --------------------------------------------------------------
#' FILL MISSING SEGMENT ILLUMINATION VALUES WITH A DECISION TREE ---------------
#
# Train a decision tree which assigns lighting to
# segments without a crash history based on attributes like length and volume
# K-Nearest Neighbors would also provide a great way to fill missing lighting values
# Table of "lit", "unlit", and "unknown" lighting_conditon attributes
cat("Before:"); table(segments$lighting, useNA = "ifany")
library(rpart); library(rpart.plot) # decision tree models
# For segments without an imputed lighting_condition, predict
# which road segments are illuminated using an rpart decision tree
fit <- rpart(lighting ~ ST_RT_NO + CTY_CODE + DISTRICT_N + SEG_LNGTH_ + CUR_AADT,
data = segments)
# Classifier performance evaluation omitted for the sake of brevity; moderate fit
# use decision tree to fill in missing lighting values
predictions <- predict(fit, type = "class", newdata = segments[is.na(segments$lighting), ])
segments[is.na(segments$lighting), ]$lighting <- predictions
# Table of "lit", "unlit", and "unknown" lighting_conditon attributes
cat("After"); table(segments$lighting, useNA = "ifany")
# Decision Trees are a great tool for revealing structural relationships in data
# What explains / predicts crash counts? A decision tree:
fit <- rpart(crashes ~ DISTRICT_N + SEG_LNGTH_ + CUR_AADT + DLY_VMT, data = segments)
rpart.plot(fit, digits = -1, main = "Crash Counts")
# What explains / predicts crash rates?
fit <- rpart(rate ~ DISTRICT_N + SEG_LNGTH_ + CUR_AADT + DLY_VMT, data = segments)
rpart.plot(fit, digits = -1, main = "Crashes per Million VMT")
# PLOTTING ---------------------------------------------------
#' In this section we map, plot, and summarize our analysis dataset to better
#' understand what relationships may existing within the data. The findings from
#' this section inform the subsequent modeling section.
# MAP LIGHTING TYPES
# limit to Pittsburgh region and reproject to web mercator for leaflet
library(leaflet) # easy mapping in R
# Subset to Pittsburgh region
leaflet_segments <- segments[segments$DISTRICT_N %in% c("11", "12"), ]
# Reproject from StatePlane to Web Mercator for Leaflet
leaflet_segments <- st_transform(leaflet_segments, 4326)
# Map lighting conditions and crash rates
pal <- colorFactor(c("yellow", "brown", "orange"), leaflet_segments$lighting)
qpal <- colorQuantile(palette = "YlOrRd", domain = leaflet_segments$rate, n = 7)
leaflet(leaflet_segments) %>%
addProviderTiles(providers$CartoDB.DarkMatter) %>%
addPolylines(color = ~pal(lighting), group = "Lighting") %>%
addPolylines(color = ~qpal(rate), group = "Crash Rates") %>%
addLegend(group = "Lighting", pal = pal, values = ~lighting, position = "bottomright") %>%
addLegend(pal = qpal, values = ~rate, group = "Crash Rates",
labFormat = function(type, cuts, p) {
n = length(cuts); p = paste0(round(p * 100), '%')
cuts = paste0(round(cuts[-n], 2), " – ", round(cuts[-1], 2))
},
position = "bottomright", title = "Crash Rate") %>%
addLayersControl(
overlayGroups = c("Lighting", "Crash Rates"),
options = layersControlOptions(collapsed = FALSE)
) %>%
hideGroup("Crash Rates")
# plot crash counts vs. vmt
# relationship between crashes and vmt seems fairly linear
library(ggplot2)
ggplot(segments, aes(x = mvmt, y = crashes, color = lighting)) +
geom_jitter(alpha = 0.5, stroke = 0, size = 1) + # scatter plot
guides(color = guide_legend(override.aes = list(alpha = 1))) +
coord_cartesian(xlim = c(0, 40), ylim = c(0, 20)) + # zoom in
xlab("Million Vehicle Miles Traveled") + ylab("Crash Count") +
ggtitle("Crash Count vs. VMT")
# box plot of crash rates (limiting to rate < 5 for visualization)
ggplot(segments, aes(x = lighting, y = rate)) +
geom_boxplot() + ylab("Crash Rate") +
ggtitle("Crashes per Million VMT by Lighting Condition") +
coord_cartesian(ylim = c(0, 3)) # zoom in
ggplot(segments) +
geom_density(aes(x = mvmt), color = "blue", show.legend = TRUE) +
geom_density(aes(x = crashes), color = "purple", show.legend = TRUE) +
coord_cartesian(xlim = c(0, 30))
# MODELING ---------------------------------------------------------------
library(pscl) # Poisson and zero-inflated regression
# fit a zero inflated negative binomial model
fit <- pscl::zeroinfl(total_crashes ~ lighting + mvmt | mvmt, data = segments)
summary(fit)
# what does this look like?
synth <- data.table(lighting = c(rep("lit", 100), rep("unlit", 100)),
mvmt = seq(0.1, 70, length.out = 100))
fit_aadt <- lm(CUR_AADT ~ poly(mvmt, 3), data = segments)
synth$CUR_AADT <- predict(fit_aadt, newdata = synth)
synth$total_crashes <- predict(fit, newdata = synth)
ggplot() +
geom_point(data = segments, aes(x = mvmt, y = crashes, color = lighting),
alpha = 1/5, stroke = 0, size = 1.5) +
guides(color = guide_legend(override.aes = list(alpha = 1))) +
geom_line(data = synth, aes(x = mvmt, y = total_crashes, color = lighting)) +
ggtitle("Estimated Crashes and VMT", "By Lighting Condition")
# FINDINGS --------------------------------------------------------------------
#'
#' The zero-inflated negative binomial model outputs show that unlit road segments
#' have 0.39 fewer crashes on average than lit segments, controlling for VMT.
#'
#' This is an important reminder that correlation does not imply causation. This
#' result does not mean that roadway lighting causes crashes. The most plausible
#' explanation is that roadway lighting is installed in dangerous road sections
#' and our regression equation does not fully control for the other exogenous
#' factors such as intersections or other roadway conditions associated with greater
#' conflicts and crashes.
#'
#' This finding is rather counter-intuitive from a "lighting improves safety"
#' perspective, but makes sense if lighting is installed in locations with higher
#' crash risk (e.g. intersections).
#'