source("setup.R")
obs = sf::read_sf(file.path("data", "obs", "obs-covariates.gpkg")) |>
@@ -268,7 +269,7 @@
2.1 The input table
Simply strip the spatial information off of obs and bkg, select just the environmental covariates, and then row bind them together.
-
+
input_obs = obs |>
  sf::st_drop_geometry() |>
  dplyr::select(dplyr::all_of(c("sst", "u_wind", "v_wind"))) |>
@@ -285,7 +286,7 @@
2.2 The input vector
Each element of the input vector must have a 1 for each observation row, and a 0 for each background row. Since we arranged to have all of the observations come first, we can easily make the vector with two calls to rep().
-
+
input_vector = c(rep(1, nrow(input_obs)), rep(0, nrow(input_bkg)))
@@ -293,7 +294,7 @@
3 Build the model
Here we pass our inputs to the maxnet() function, leaving all of the optional arguments at their default values. Be sure to look over the docs for model construction - try ?maxnet.
-
+
model = maxnet::maxnet(input_vector, input_table)
That’s it. The returned object is of maxnet class; it’s essentially a list with all of the pertinent information required for subsequent use.
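A quick way to peek at the returned object (base R; the exact element names vary with the data and maxnet version):

class(model)
names(model)

And should you want to move beyond the defaults mentioned above, here is a hedged sketch: maxnet() accepts a feature formula built with maxnet::maxnet.formula(), whose classes argument selects the feature types (for example "lq" for linear plus quadratic) - see ?maxnet.formula.

model_lq = maxnet::maxnet(input_vector, input_table,
  f = maxnet::maxnet.formula(input_vector, input_table, classes = "lq"))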
@@ -302,7 +303,7 @@
4 Assess the model
So what do we know about the model? Is it any good?
One thing we can do is to plot what are called response curves. These show, for each parameter, how the model responds along the typical range of parameter values. We plot below the responses with type cloglog, which transforms the response value into the 0-1 range.
-
+
plot(model, type = "cloglog")
@@ -320,7 +321,7 @@
v3.0, ... for the split model(s)
The maxnetic package provides some convenience functions for working with maxnet models, including file storage functions.
-
+
= file.path("data", "model", "v1", "v1.0")
v1_path = dir.create(v1_path, recursive = TRUE, showWarnings = FALSE)
ok ::write_maxnet(model, file.path(v1_path, "model_v1.0.rds")) maxnetic
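The stored model can be restored later; a minimal sketch, assuming maxnetic pairs write_maxnet() with a read_maxnet() reader (the file is an ordinary .rds, so base readRDS() works as a fallback):

model_restored = maxnetic::read_maxnet(file.path(v1_path, "model_v1.0.rds"))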
@@ -332,7 +333,7 @@
6.1 Predict with a data frame
Here we provide a data frame, in our case the original input data, to the predict() function with type cloglog, which transforms the response value into the 0-1 range.
-
+
prediction = predict(model, input_table, type = 'cloglog')
hist(prediction, xlab = "prediction", main = "Basic Model")
@@ -341,10 +342,11 @@
6.1.1 How did it do?
-
-We can use some utilities in the maxnetic package to help us assess the model. First, we need to create a table with two columns: label and pred. Label is the simple a vector of 0/1 indicating that the predicted value is known to be either background or presence. We already have that in our input_vector. Pred is simple the 0-1 scale predicted value. Once we have that we can craft a receiver operator characteristic curve and compute it’s AUC.
-
-x = dplyr::tibble(label = input_vector, pred = as.vector(prediction))
-plot_ROC(x, title = "v1.0 Basic Model")
+We can use some utilities in the maxnetic package to help us assess the model. The pAUC() function will compute statistics, including a presence-only AUC value. We need to pass it two items - the universe of predictions and the predictions for just the presence points.
+
+ix = input_vector > 0
+pauc = maxnetic::pAUC(prediction, prediction[ix])
+plot(pauc, title = "v1.0 Basic Model")
@@ -354,52 +356,33 @@
6.2 Predict with rasters
-We can also predict using raster inputs using our basic model. Let’s read in rasters for each month of 2018, and then run a prediction for each month.
-
+We can also predict using raster inputs with our basic model. Let’s read in rasters for each month of 2019, and then run a prediction for each month.
+We provide a function read_predictors() that will read and bind the rasters together for you given the filtered databases and paths. So, first we define the paths and filter the databases to point to just the months in 2019.
+
= as.Date(c("2019-01-01", "2019-12-31"))
dates
= "data/oisst"
sst_path = oisster::read_database(sst_path) |>
sst_db ::arrange(date) |>
dplyr::filter(dplyr::between(date, dates[1], dates[2]))
- dplyr
-
-= sst_db |>
- sst ::compose_filename(path = sst_path) |>
- oisster::read_stars(along = list(time = sst_db$date)) |>
- stars::set_names("sst")|>
- rlangst_to_180()
-
-
-= "data/nbs"
- wind_path = nbs::read_database(wind_path) |>
- wind_db ::arrange(date)|>
+ dplyr
+= "data/nbs"
+ wind_path = nbs::read_database(wind_path) |>
+ wind_db ::arrange(date)|>
+ dplyr::filter(dplyr::between(date, dates[1], dates[2]))
+ dplyr
+= wind_db |>
+ u_wind_db ::filter(param == "u_wind")|>
+ dplyr::filter(dplyr::between(date, dates[1], dates[2]))
+ dplyr
+= wind_db |>
+ v_wind_db ::filter(param == "v_wind")|>
dplyr::filter(dplyr::between(date, dates[1], dates[2]))
dplyr
-= wind_db |>
- u_wind_db ::filter(param == "u_wind")|>
- dplyr::filter(dplyr::between(date, dates[1], dates[2]))
- dplyr= u_wind_db |>
- u_wind ::compose_filename(path = wind_path) |>
- nbs::read_stars(along = list(time = u_wind_db$date)) |>
- stars::set_names("u_wind") |>
- rlangst_to_180()
-
-= wind_db |>
- v_wind_db ::filter(param == "v_wind")|>
- dplyr::filter(dplyr::between(date, dates[1], dates[2]))
- dplyr= v_wind_db |>
- v_wind ::compose_filename(path = wind_path) |>
- nbs::read_stars(along = list(time = v_wind_db$date)) |>
- stars::set_names("v_wind") |>
- rlangst_to_180()
-
-Once we have them in hand we need to bind them together. But we need to attend to common but important issue. The sst
rasters and windspeed
rasters have different extents. We can’t bind them together until we warp one set to match the other. Let’s warp sst
to match u_wind
. And then we can bind them together.
-
-= stars::st_warp(sst, u_wind)
- sst_warped = list(sst_warped, u_wind, v_wind)
- x = do.call(c, append(x, list(along = NA_integer_)))
- predictors predictors
+= read_predictors(sst_db = sst_db,
+ predictors u_wind_db = u_wind_db,
+ v_wind_db = v_wind_db)
+ predictors
stars object with 3 dimensions and 3 attributes
attribute(s):
@@ -414,10 +397,10 @@
-Now we can run the prediction.
-
-pred = predict(model, predictors, type = 'cloglog')
-pred
+You can see that we have the rasters in one object of three attributes (sst, u_wind and v_wind) each with 12 layers (Jan 2019 - Dec 2019). Now we can run the prediction.
+
+pred = predict(model, predictors, type = 'cloglog')
+pred
stars object with 3 dimensions and 1 attribute
attribute(s):
@@ -431,25 +414,25 @@
Since we get a spatially mapped prediction back, we can plot it.
-
-coast = rnaturalearth::ne_coastline(scale = 'large', returnclass = 'sf') |>
-  sf::st_crop(pred)
+
+coast = rnaturalearth::ne_coastline(scale = 'large', returnclass = 'sf') |>
+  sf::st_crop(pred)
Warning: attribute variables are assumed to be spatially constant throughout
all geometries
-plot_coast = function() {
-  plot(sf::st_geometry(coast), col = 'green', add = TRUE)
-}
-plot(pred, hook = plot_coast)
+plot_coast = function() {
+  plot(sf::st_geometry(coast), col = 'green', add = TRUE)
+}
+plot(pred, hook = plot_coast)
-
+
Well, that certainly looks appealing, with a higher likelihood of near-shore observations occurring during the warmer months.
6.2.1 How did it do?
-To compute an ROC and AUC for each month, we have a little bit of work to do. We need to extract the observations and background for each month from the prediction maps. These we can then pass to the plot_ROC() function.
+To compute an ROC and AUC for each month, we have a little bit of work to do. We need to extract the observation locations for each month from the prediction maps. These we can then plot.
-
-test_obs = obs |>
-  dplyr::filter(dplyr::between(date, dates[1], dates[2])) |>
-  dplyr::select(dplyr::all_of("date")) |>
-  dplyr::mutate(date = oisster::current_month(date))
-
-test_bkg = bkg |>
-  dplyr::filter(dplyr::between(date, dates[1], dates[2])) |>
-  dplyr::select(dplyr::all_of("date")) |>
-  dplyr::mutate(date = oisster::current_month(date))
-
-test_input = dplyr::bind_rows(test_obs, test_bkg)
-
-x = stars::st_extract(pred, test_input, time_column = 'date') |>
-  print()
+
+test_obs = obs |>
+  dplyr::filter(dplyr::between(date, dates[1], dates[2])) |>
+  dplyr::select(dplyr::all_of("date")) |>
+  dplyr::mutate(date = oisster::current_month(date))
+
+x = stars::st_extract(pred, test_obs, time_column = 'date') |>
+  print()
-Simple feature collection with 1537 features and 3 fields
+Simple feature collection with 612 features and 3 fields
Geometry type: POINT
Dimension: XY
-Bounding box: xmin: -75.99915 ymin: 35.01635 xmax: -58.83057 ymax: 45.95233
+Bounding box: xmin: -75.7589 ymin: 35.1211 xmax: -65.48274 ymax: 43.83954
Geodetic CRS: WGS 84
First 10 features:
pred time date geometry
@@ -499,66 +475,125 @@
Finally we can build a table that merges the prediction with the labels. We are going to add the name of the month so that we can group by it.
-
-y = x |>
-  dplyr::mutate(label = c(rep(1, nrow(test_obs)), rep(0, nrow(test_bkg))),
-                month = factor(format(date, "%b"), levels = month.abb),
-                .before = 2) |>
-  sf::st_drop_geometry() |>
-  dplyr::select(dplyr::all_of(c("month", "label", "pred"))) |>
-  dplyr::group_by(month)
-
-dplyr::count(y, month, label) |>
-  print(n = 24)
+
+y = x |>
+  dplyr::mutate(month = factor(format(date, "%b"), levels = month.abb),
+                .before = 1) |>
+  dplyr::select(dplyr::all_of(c("month", "pred", "date"))) |>
+  dplyr::group_by(month)
+
+dplyr::count(y, month) |>
+  print(n = 12)
-# A tibble: 24 × 3
-# Groups: month [12]
- month label n
- <fct> <dbl> <int>
- 1 Jan 0 36
- 2 Jan 1 21
- 3 Feb 0 15
- 4 Feb 1 7
- 5 Mar 0 46
- 6 Mar 1 23
- 7 Apr 0 259
- 8 Apr 1 169
- 9 May 0 182
-10 May 1 119
-11 Jun 0 73
-12 Jun 1 53
-13 Jul 0 76
-14 Jul 1 48
-15 Aug 0 46
-16 Aug 1 39
-17 Sep 0 48
-18 Sep 1 21
-19 Oct 0 102
-20 Oct 1 79
-21 Nov 0 27
-22 Nov 1 19
-23 Dec 0 15
-24 Dec 1 14
+Simple feature collection with 12 features and 2 fields
+Geometry type: MULTIPOINT
+Dimension: XY
+Bounding box: xmin: -75.7589 ymin: 35.1211 xmax: -65.48274 ymax: 43.83954
+Geodetic CRS: WGS 84
+# A tibble: 12 × 3
+ month n geometry
+ * <fct> <int> <MULTIPOINT [°]>
+ 1 Jan 21 ((-74.63902 36.26849), (-75.01758 36.49984), (-75.01801 36.72554…
+ 2 Feb 7 ((-74.52432 37.24967), (-74.45561 37.16891), (-74.74373 36.72355…
+ 3 Mar 23 ((-74.53117 36.26996), (-74.60195 36.72201), (-74.67127 36.72266…
+ 4 Apr 169 ((-72.924 38.6733), (-73.0165 38.591), (-73.0036 38.56), (-73.10…
+ 5 May 119 ((-74.56571 35.6059), (-75.2181 35.1934), (-75.3228 35.535), (-7…
+ 6 Jun 53 ((-73.10608 38.72575), (-74.86204 36.27105), (-75.04656 36.34824…
+ 7 Jul 48 ((-74.53554 36.19828), (-74.91756 36.27104), (-75.10905 36.27065…
+ 8 Aug 39 ((-72.78628 38.68677), (-72.98868 38.61241), (-74.9889 36.2911),…
+ 9 Sep 21 ((-75.3167 36.0439), (-75.5204 36.3294), (-75.5519 36.1854), (-7…
+10 Oct 79 ((-67.06445 42.91399), (-68.43614 43.83954), (-69.14391 43.16967…
+11 Nov 19 ((-72.52681 39.21286), (-71.54966 39.99385), (-67.79606 40.36107…
+12 Dec 14 ((-75.242 35.2705), (-75.3335 35.3027), (-75.436 35.1211), (-75.…
Now how about one ROC plot for each month? Yikes! This requires an iterative approach, using group_map(), to compute the ROC for each month. We then follow with plot wrapping by the patchwork package.
-
-rocs = dplyr::group_map(y,
-  function(tbl, key){
-    maxnetic::plot_ROC(tbl, title = sprintf("%s, n = %i", key$month, nrow(tbl)),
-                       xlab = "", ylab = "")
-  })
-patchwork::wrap_plots(rocs, ncol = 4)
+
+paucs = dplyr::group_map(y,
+  function(tbl, key, pred_rasters = NULL){
+    ix = month.abb %in% key$month
+    x = dplyr::slice(pred_rasters, "time", ix)
+    pauc = maxnetic::pAUC(x, tbl)
+    plot(pauc, title = key$month,
+         xlab = "", ylab = "")
+  }, pred_rasters = pred)
+patchwork::wrap_plots(paucs, ncol = 4)
-
+
-Hmmm. That’s surprising, yes? Why during the summer months does our AUC go down. In fact, at times we are predicting the likelihood of not having an observation reported. It’s hard to know what to think, but consider that we are using a model generated across all months of multiple years and it might not predict a particular month and year very well. A step toward refinement, our next step is to make 12 models, one for each month.
-
-
+Hmmm. That’s surprising, yes? Why does our AUC go down during the summer months, when we have the most observations? That seems counterintuitive.
+
+
+7 Thinking about AUC
+AUC is a diagnostic that provides a peek into the predictive power of a model. But what is it? An analogy is fitting a straight line to a small set of observations versus a large set of observations and then comparing the correlation coefficients. Here’s a simple example using R’s built-in dataset cars, which is a data frame of 50 observations of speed and stopping distances of cars. We’ll compute a linear model for the entire data set, and then a second for a small subsample of the data. (Learn more about linear models in R here.)
+
+data("cars")
+cars = dplyr::as_tibble(cars)
+
+all_fit = lm(dist ~ speed, data = cars)
+summary(all_fit)
+
+
+Call:
+lm(formula = dist ~ speed, data = cars)
+
+Residuals:
+ Min 1Q Median 3Q Max
+-29.069 -9.525 -2.272 9.215 43.201
+
+Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) -17.5791 6.7584 -2.601 0.0123 *
+speed 3.9324 0.4155 9.464 1.49e-12 ***
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 15.38 on 48 degrees of freedom
+Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
+F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
+
+
+
+set.seed(5)
+sub_cars = dplyr::slice_sample(cars, n = 3)
+sub_fit = lm(dist ~ speed, data = sub_cars)
+summary(sub_fit)
+
+
+Call:
+lm(formula = dist ~ speed, data = sub_cars)
+
+Residuals:
+ 1 2 3
+ 3 3 -6
+
+Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) -6.5000 8.8741 -0.732 0.598
+speed 3.3750 0.6495 5.196 0.121
+
+Residual standard error: 7.348 on 1 degrees of freedom
+Multiple R-squared: 0.9643, Adjusted R-squared: 0.9286
+F-statistic: 27 on 1 and 1 DF, p-value: 0.121
+
+
+You can see that the r² value is quite high for the smaller data set, but the model may not be predictive over the full range of data. AUC is somewhat analogous to r² in that a relatively low score does not necessarily suggest a poor model.
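+As an aside, the ranking interpretation of AUC can be made concrete with a tiny sketch. This is illustrative only - made-up scores, not maxnetic's pAUC implementation - showing that AUC is the probability that a randomly chosen presence score outranks a randomly chosen background score.
+
+set.seed(1)
+pres_scores = runif(100, 0.4, 1.0) # hypothetical presence predictions
+bkg_scores = runif(100, 0.0, 0.8)  # hypothetical background predictions
+mean(outer(pres_scores, bkg_scores, ">")) # fraction of pairs ranked correctly ~ AUC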
+
+ggplot2::ggplot(data = cars, ggplot2::aes(x = speed, y = dist)) +
+  ggplot2::geom_point(color = "blue") +
+  ggplot2::geom_abline(slope = coef(all_fit)[2], intercept = coef(all_fit)[1], color = "blue") +
+  ggplot2::geom_point(data = sub_cars, ggplot2::aes(x = speed, y = dist), color = "orange") +
+  ggplot2::geom_abline(slope = coef(sub_fit)[2], intercept = coef(sub_fit)[1], color = "orange")
+
+
+
+
+
+
diff --git a/docs/modeling-01_files/figure-html/unnamed-chunk-12-1.png b/docs/modeling-01_files/figure-html/unnamed-chunk-11-1.png
similarity index 100%
rename from docs/modeling-01_files/figure-html/unnamed-chunk-12-1.png
rename to docs/modeling-01_files/figure-html/unnamed-chunk-11-1.png
diff --git a/docs/modeling-01_files/figure-html/unnamed-chunk-14-1.png b/docs/modeling-01_files/figure-html/unnamed-chunk-14-1.png
new file mode 100644
index 0000000..6059e6c
Binary files /dev/null and b/docs/modeling-01_files/figure-html/unnamed-chunk-14-1.png differ
diff --git a/docs/modeling-01_files/figure-html/unnamed-chunk-15-1.png b/docs/modeling-01_files/figure-html/unnamed-chunk-15-1.png
deleted file mode 100644
index c76ba3c..0000000
Binary files a/docs/modeling-01_files/figure-html/unnamed-chunk-15-1.png and /dev/null differ
diff --git a/docs/modeling-01_files/figure-html/unnamed-chunk-17-1.png b/docs/modeling-01_files/figure-html/unnamed-chunk-17-1.png
new file mode 100644
index 0000000..33eebe3
Binary files /dev/null and b/docs/modeling-01_files/figure-html/unnamed-chunk-17-1.png differ
diff --git a/docs/modeling-01_files/figure-html/unnamed-chunk-8-1.png b/docs/modeling-01_files/figure-html/unnamed-chunk-8-1.png
index 87068ba..1102229 100644
Binary files a/docs/modeling-01_files/figure-html/unnamed-chunk-8-1.png and b/docs/modeling-01_files/figure-html/unnamed-chunk-8-1.png differ
diff --git a/docs/modeling-02.html b/docs/modeling-02.html
index 2430785..17f7428 100644
--- a/docs/modeling-02.html
+++ b/docs/modeling-02.html
@@ -282,7 +282,8 @@
So the colder months have fewer observations than the warmer months. We already knew that, but it will be interesting to see how that manifests itself in the models.
2.1 Build the monthly models
-
+Since we are building 12 models (rather than one), it is useful to create a function that computes a model for any month, and then iterate through the months of the year.
+
# A function for making one month's model
#
# @param tbl a data frame of one month's observations
@@ -324,6 +325,13 @@
  dplyr::group_map(model_month, bkg = bkg, path = path) |>
  rlang::set_names(levels(obs$month))
+We can look at the response plots for every month, but for demonstration purposes, we’ll just show one month.
+
+plot(models[['Jun']], type = 'cloglog')
+
+
+
+
@@ -332,116 +340,112 @@
3.1 Load the raster databases (sst and u_wind and v_wind)
We also make sure they are in date order and add a “month” variable to each.
-
-sst_path = "data/oisst"
-sst_db = oisster::read_database(sst_path) |>
-  dplyr::arrange(date) |>
-  dplyr::mutate(month = format(date, "%b"))
-
-wind_path = "data/nbs"
-wind_db = nbs::read_database(wind_path) |>
-  dplyr::arrange(date)|>
-  dplyr::mutate(month = format(date, "%b"))
-
-u_wind_db = wind_db |>
-  dplyr::filter(param == "u_wind")
-
-v_wind_db = wind_db |>
-  dplyr::filter(param == "v_wind")
+
+sst_path = "data/oisst"
+sst_db = oisster::read_database(sst_path) |>
+  dplyr::arrange(date) |>
+  dplyr::mutate(month = format(date, "%b"))
+
+wind_path = "data/nbs"
+wind_db = nbs::read_database(wind_path) |>
+  dplyr::arrange(date)|>
+  dplyr::mutate(month = format(date, "%b"))
+
+u_wind_db = wind_db |>
+  dplyr::filter(param == "u_wind")
+
+v_wind_db = wind_db |>
+  dplyr::filter(param == "v_wind")
3.2 Iterate through the months making predictions
Now we can build an iterator function that will make a prediction for each month. Let’s narrow our predictions to just those for a particular year, 2019, and read the rasters in all at once.
-
-dates = as.Date(c("2019-01-01", "2019-12-31"))
-x = read_predictors(
-  sst_db = dplyr::filter(sst_db, dplyr::between(date, dates[1], dates[2])),
-  u_wind_db = dplyr::filter(u_wind_db, dplyr::between(date, dates[1], dates[2])),
-  v_wind_db = dplyr::filter(v_wind_db, dplyr::between(date, dates[1], dates[2]))
-)
+
+dates = as.Date(c("2019-01-01", "2019-12-31"))
+x = read_predictors(
+  sst_db = dplyr::filter(sst_db, dplyr::between(date, dates[1], dates[2])),
+  u_wind_db = dplyr::filter(u_wind_db, dplyr::between(date, dates[1], dates[2])),
+  v_wind_db = dplyr::filter(v_wind_db, dplyr::between(date, dates[1], dates[2]))
+)
Now we can iterate through the months.
-
-date_sequence = seq(from = dates[1], to = dates[2], by = "month")
-pred_rasters = lapply(names(models),
-  function(mon){
-    ix = which(month.abb %in% mon)
-    predict(models[[mon]], dplyr::slice(x, time, ix, drop), type = "cloglog")
-  })
-pred_rasters = do.call(c, append(pred_rasters, list(along = list(time = date_sequence))))
+
+date_sequence = seq(from = dates[1], to = dates[2], by = "month")
+pred_rasters = lapply(names(models),
+  function(mon){
+    ix = which(month.abb %in% mon)
+    predict(models[[mon]], dplyr::slice(x, time, ix, drop), type = "cloglog")
+  })
+pred_rasters = do.call(c, append(pred_rasters, list(along = list(time = date_sequence))))
Let’s plot them.
-
-coast = rnaturalearth::ne_coastline(scale = 'large', returnclass = 'sf') |>
-  sf::st_geometry() |>
-  sf::st_crop(pred_rasters)
-
-plot_coast = function() {
-  plot(coast, col = 'green', add = TRUE)
-}
-plot(pred_rasters |> st_to_180(), hook = plot_coast)
+
+coast = rnaturalearth::ne_coastline(scale = 'large', returnclass = 'sf') |>
+  sf::st_geometry() |>
+  sf::st_crop(pred_rasters)
+
+plot_coast = function() {
+  plot(coast, col = 'green', add = TRUE)
+}
+plot(pred_rasters, hook = plot_coast)
-
+
-Let’s see what we can discern from the predict abilities. We can extract the predicted values at the observed locations.
-
-pred_obs = stars::st_extract(pred_rasters,
-  dplyr::filter(obs, dplyr::between(date, dates[1], dates[2])),
-  time_column = "month_id") |>
-  sf::st_drop_geometry()
-pred_bkg = stars::st_extract(pred_rasters,
-  dplyr::filter(bkg, dplyr::between(date, dates[1], dates[2])),
-  time_column = "month_id") |>
-  sf::st_drop_geometry()
-
-preds = dplyr::bind_rows(pred_obs, pred_bkg) |>
-  dplyr::mutate(label = c(rep(1, nrow(pred_obs)), rep(0, nrow(pred_bkg))), .before = 1) |>
-  dplyr::mutate(month = factor(format(time, "%b"), levels = month.abb)) |>
-  dplyr::group_by(month)
-
-aucs = dplyr::group_map(preds,
-  function(x, y) {
-    dplyr::tibble(month = y$month, auc = maxnetic::AUC(x))
-  }) |>
-  dplyr::bind_rows() |>
-  dplyr::right_join(counts, by = "month") |>
-  print(n=12)
+Let’s see what we can discern from the predictive abilities. We can extract the predicted values at the observed locations. Having those in hand allows us to compute pAUC for each month.
+
+pred_obs = stars::st_extract(pred_rasters,
+  dplyr::filter(obs, dplyr::between(date, dates[1], dates[2])),
+  time_column = "month_id") |>
+  dplyr::mutate(month = factor(format(month_id, "%b"), levels = month.abb)) |>
+  dplyr::group_by(month)
+
+paucs = dplyr::group_map(pred_obs,
+  function(x, y) {
+    ix = month.abb %in% y$month
+    s = dplyr::slice(pred_rasters, "time", ix)
+    pauc = maxnetic::pAUC(s, x)
+    dplyr::tibble(month = y$month,
+                  auc = pauc$area,
+                  pauc = list(pauc))
+  }) |>
+  dplyr::bind_rows() |>
+  print(n = 12)
-# A tibble: 12 × 4
- month auc n_obs n_bkg
- <fct> <dbl> <int> <int>
- 1 Jan 0.987 33 51
- 2 Feb 0.876 40 57
- 3 Mar 0.957 50 79
- 4 Apr 0.897 341 528
- 5 May 0.888 541 943
- 6 Jun 0.547 2137 3471
- 7 Jul 0.376 2108 3233
- 8 Aug 0.588 1698 2597
- 9 Sep 0.742 725 1205
-10 Oct 0.797 328 485
-11 Nov 0.873 494 739
-12 Dec 0.995 66 90
+# A tibble: 12 × 3
+ month auc pauc
+ <fct> <dbl> <list>
+ 1 Jan 0.703 <pAUC [3]>
+ 2 Feb 0.689 <pAUC [3]>
+ 3 Mar 0.698 <pAUC [3]>
+ 4 Apr 0.677 <pAUC [3]>
+ 5 May 0.654 <pAUC [3]>
+ 6 Jun 0.662 <pAUC [3]>
+ 7 Jul 0.665 <pAUC [3]>
+ 8 Aug 0.696 <pAUC [3]>
+ 9 Sep 0.663 <pAUC [3]>
+10 Oct 0.633 <pAUC [3]>
+11 Nov 0.627 <pAUC [3]>
+12 Dec 0.665 <pAUC [3]>
-OK, that’s unexpected. The months with the lower counts of observations have relatively higher AUCs. Huh? Let’s look at that graphically.
-
-aucs_long = tidyr::pivot_longer(aucs, dplyr::all_of(c("n_obs", "n_bkg")),
-  names_to = "type", values_to = "count") |>
-  dplyr::mutate(type = dplyr::recode(type, n_obs = "obs", n_bkg = "bkg"))
-
-ggplot2::ggplot(data = aucs_long, aes(x = count, y = auc, color = type)) +
-  ggplot2::geom_point() +
-  ggplot2::geom_smooth(method='lm', formula= y~x)
+Note that the last element, pauc, is the result returned by the maxnetic::pAUC() function, which we can plot.
+
+pp = paucs |>
+  dplyr::group_by(month) |>
+  dplyr::group_map(
+    function(tbl, key){
+      plot(tbl$pauc[[1]], title = key$month, xlab = "", ylab = "")
+    }
+  )
+patchwork::wrap_plots(pp, ncol = 4)
-
+
-Surprised? Could this be overfitting resulting from sampling background in time weighted to the months when we have observations? Hmmmm.
+Well, it would be easy to become dispirited by this result. It would be reasonable to expect AUC values to improve if we built monthly models rather than a single model applied to any month. But it seems not to be so. Darn!
diff --git a/docs/modeling-02_files/figure-html/unnamed-chunk-10-1.png b/docs/modeling-02_files/figure-html/unnamed-chunk-10-1.png
new file mode 100644
index 0000000..fcdcd3e
Binary files /dev/null and b/docs/modeling-02_files/figure-html/unnamed-chunk-10-1.png differ
diff --git a/docs/modeling-02_files/figure-html/unnamed-chunk-4-1.png b/docs/modeling-02_files/figure-html/unnamed-chunk-4-1.png
new file mode 100644
index 0000000..c1388b1
Binary files /dev/null and b/docs/modeling-02_files/figure-html/unnamed-chunk-4-1.png differ
diff --git a/docs/modeling-02_files/figure-html/unnamed-chunk-7-1.png b/docs/modeling-02_files/figure-html/unnamed-chunk-7-1.png
deleted file mode 100644
index df52a54..0000000
Binary files a/docs/modeling-02_files/figure-html/unnamed-chunk-7-1.png and /dev/null differ
diff --git a/docs/modeling-02_files/figure-html/unnamed-chunk-8-1.png b/docs/modeling-02_files/figure-html/unnamed-chunk-8-1.png
new file mode 100644
index 0000000..6b8c08d
Binary files /dev/null and b/docs/modeling-02_files/figure-html/unnamed-chunk-8-1.png differ
diff --git a/docs/modeling-02_files/figure-html/unnamed-chunk-9-1.png b/docs/modeling-02_files/figure-html/unnamed-chunk-9-1.png
deleted file mode 100644
index 03d16aa..0000000
Binary files a/docs/modeling-02_files/figure-html/unnamed-chunk-9-1.png and /dev/null differ
diff --git a/docs/search.json b/docs/search.json
index 2f62830..cbd7a91 100644
--- a/docs/search.json
+++ b/docs/search.json
@@ -53,7 +53,14 @@
"href": "modeling-01.html#make-a-prediction",
"title": "Basic modeling",
"section": "6 Make a prediction",
- "text": "6 Make a prediction\nNow we can make predictions with our basic model. We’ll do it two ways. First by simply feeding the input data used to create the model into the prediction. This might seems a bit circular, but it is perfectly reasonable to see how the model does on already labeled data. Second we’ll make a prediction for each month in 2020 using raster data.\n\n6.1 Predict with a data frame\nHere we provide a data frame, in our case the original input data, to the predict() function with type cloglog which transform the response value into the 0-1 range.\n\nprediction = predict(model, input_table, type = 'cloglog')\nhist(prediction, xlab = \"prediction\", main = \"Basic Model\")\n\n\n\n\n\n6.1.1 How did it do?\nWe can use some utilities in the maxnetic package to help us assess the model. First, we need to create a table with two columns: label and pred. Label is the simple a vector of 0/1 indicating that the predicted value is known to be either background or presence. We already have that in our input_vector. Pred is simple the 0-1 scale predicted value. Once we have that we can craft a receiver operator characteristic curve and compute it’s AUC.\n\nx = dplyr::tibble(label = input_vector, pred = as.vector(prediction))\nplot_ROC(x, title = \"v1.0 Basic Model\")\n\n\n\n\nOverall, this is telling us that the model isn’t especially strong as a prediction tool, but it is much better than a 50-50 guess (that’s when AUC is close to 0.5, and the curve follows the light grey line). Learn more about ROC and AUC here.\n\n\n\n6.2 Predict with rasters\nWe can also predict using raster inputs using our basic model. Let’s read in rasters for each month of 2018, and then run a prediction for each month.\n\ndates = as.Date(c(\"2019-01-01\", \"2019-12-31\"))\n\nsst_path = \"data/oisst\"\nsst_db = oisster::read_database(sst_path) |>\n dplyr::arrange(date) |>\n dplyr::filter(dplyr::between(date, dates[1], dates[2]))\n \n\nsst = sst_db |>\n oisster::compose_filename(path = sst_path) |>\n stars::read_stars(along = list(time = sst_db$date)) |>\n rlang::set_names(\"sst\")|>\n st_to_180()\n\n\nwind_path = \"data/nbs\"\nwind_db = nbs::read_database(wind_path) |>\n dplyr::arrange(date)|>\n dplyr::filter(dplyr::between(date, dates[1], dates[2]))\n\nu_wind_db = wind_db |>\n dplyr::filter(param == \"u_wind\")|>\n dplyr::filter(dplyr::between(date, dates[1], dates[2]))\nu_wind = u_wind_db |>\n nbs::compose_filename(path = wind_path) |>\n stars::read_stars(along = list(time = u_wind_db$date)) |>\n rlang::set_names(\"u_wind\") |>\n st_to_180()\n\nv_wind_db = wind_db |>\n dplyr::filter(param == \"v_wind\")|>\n dplyr::filter(dplyr::between(date, dates[1], dates[2]))\nv_wind = v_wind_db |>\n nbs::compose_filename(path = wind_path) |>\n stars::read_stars(along = list(time = v_wind_db$date)) |>\n rlang::set_names(\"v_wind\") |>\n st_to_180()\n\nOnce we have them in hand we need to bind them together. But we need to attend to common but important issue. The sst rasters and windspeed rasters have different extents. We can’t bind them together until we warp one set to match the other. Let’s warp sst to match u_wind. And then we can bind them together.\n\nsst_warped = stars::st_warp(sst, u_wind)\nx = list(sst_warped, u_wind, v_wind)\npredictors = do.call(c, append(x, list(along = NA_integer_))) \npredictors\n\nstars object with 3 dimensions and 3 attributes\nattribute(s):\n Min. 1st Qu. Median Mean 3rd Qu. Max. 
NA's\nsst -1.558928 12.528449 19.5220385 17.6005908 23.501083 29.216452 11352\nu_wind -2.692028 1.144244 2.7007004 2.7228278 4.115177 13.148120 7612\nv_wind -5.431324 -1.411349 -0.3202622 -0.1398384 1.106175 4.636874 7612\ndimension(s):\n from to offset delta refsys point values x/y\nx 1 74 -76.38 0.25 WGS 84 FALSE NULL [x]\ny 1 46 46.12 -0.25 WGS 84 FALSE NULL [y]\ntime 1 12 NA NA Date NA 2019-01-01,...,2019-12-01 \n\n\nNow we can run the prediction.\n\npred = predict(model, predictors, type = 'cloglog')\npred\n\nstars object with 3 dimensions and 1 attribute\nattribute(s):\n Min. 1st Qu. Median Mean 3rd Qu. Max. NA's\npred 0.0001196393 0.1200618 0.2675931 0.3033565 0.4398977 0.8816952 11487\ndimension(s):\n from to offset delta refsys point values x/y\nx 1 74 -76.38 0.25 WGS 84 FALSE NULL [x]\ny 1 46 46.12 -0.25 WGS 84 FALSE NULL [y]\ntime 1 12 NA NA Date NA 2019-01-01,...,2019-12-01 \n\n\nSince we get a spatially mapped prediction back, we can plot it.\n\ncoast = rnaturalearth::ne_coastline(scale = 'large', returnclass = 'sf') |>\n sf::st_crop(pred)\n\nWarning: attribute variables are assumed to be spatially constant throughout\nall geometries\n\nplot_coast = function() {\n plot(sf::st_geometry(coast), col = 'green', add = TRUE)\n}\nplot(pred, hook = plot_coast)\n\n\n\n\nWell, that certainly looks appealing with higher likelihood of near shore observations occurring during the warmer months.\n\n6.2.1 How did it do?\nTo compute an ROC and AUC for each month, we have a little bit of work to do. We need to extract the observations and background for each month from the prediction maps. These we can then pass to the plot_ROC() function.\n\n\n\n\n\n\nNote\n\n\n\nWe have to modify the date for each point to be the first date of each month. That’s because our predictors are monthlies.\n\n\n\ntest_obs = obs |>\n dplyr::filter(dplyr::between(date, dates[1], dates[2])) |>\n dplyr::select(dplyr::all_of(\"date\")) |>\n dplyr::mutate(date = oisster::current_month(date))\n\ntest_bkg = bkg |>\n dplyr::filter(dplyr::between(date, dates[1], dates[2])) |>\n dplyr::select(dplyr::all_of(\"date\")) |>\n dplyr::mutate(date = oisster::current_month(date))\n\ntest_input = dplyr::bind_rows(test_obs, test_bkg)\n\nx = stars::st_extract(pred, test_input, time_column = 'date') |>\n print()\n\nSimple feature collection with 1537 features and 3 fields\nGeometry type: POINT\nDimension: XY\nBounding box: xmin: -75.99915 ymin: 35.01635 xmax: -58.83057 ymax: 45.95233\nGeodetic CRS: WGS 84\nFirst 10 features:\n pred time date geometry\n1 0.2759255 2019-05-01 2019-05-01 POINT (-67.32935 40.42509)\n2 0.7245142 2019-03-01 2019-03-01 POINT (-74.41057 36.49908)\n3 0.6664676 2019-12-01 2019-12-01 POINT (-75.3994 35.9457)\n4 0.4536477 2019-06-01 2019-06-01 POINT (-75.10864 36.94806)\n5 0.6864945 2019-04-01 2019-04-01 POINT (-74.49892 36.57275)\n6 0.3105710 2019-09-01 2019-09-01 POINT (-75.5519 36.1854)\n7 0.3874695 2019-09-01 2019-09-01 POINT (-73.6245 40.3317)\n8 0.3785449 2019-04-01 2019-04-01 POINT (-69.04389 39.82132)\n9 0.7447747 2019-04-01 2019-04-01 POINT (-74.59436 36.87291)\n10 0.7447747 2019-04-01 2019-04-01 POINT (-74.45753 36.72279)\n\n\nFinally we can build a table that merges the prediction with the labels. 
We are going to add the name of the month to group by that.\n\ny = x |>\n dplyr::mutate(label = c(rep(1, nrow(test_obs)), rep(0, nrow(test_bkg))),\n month = factor(format(date, \"%b\"), levels = month.abb), \n .before = 2) |>\n sf::st_drop_geometry() |>\n dplyr::select(dplyr::all_of(c(\"month\", \"label\", \"pred\"))) |>\n dplyr::group_by(month) \n\ndplyr::count(y, month, label) |>\n print(n = 24)\n\n# A tibble: 24 × 3\n# Groups: month [12]\n month label n\n <fct> <dbl> <int>\n 1 Jan 0 36\n 2 Jan 1 21\n 3 Feb 0 15\n 4 Feb 1 7\n 5 Mar 0 46\n 6 Mar 1 23\n 7 Apr 0 259\n 8 Apr 1 169\n 9 May 0 182\n10 May 1 119\n11 Jun 0 73\n12 Jun 1 53\n13 Jul 0 76\n14 Jul 1 48\n15 Aug 0 46\n16 Aug 1 39\n17 Sep 0 48\n18 Sep 1 21\n19 Oct 0 102\n20 Oct 1 79\n21 Nov 0 27\n22 Nov 1 19\n23 Dec 0 15\n24 Dec 1 14\n\n\nNow how about one ROC plot for each month? Yikes! This requires a iterative approach, using group_map(), to compute the ROC for each month. We then follow with plot wrapping by the patchwork package.\n\nrocs = dplyr::group_map(y, \n function(tbl, key){\n maxnetic::plot_ROC(tbl, title = sprintf(\"%s, n = %i\", key$month, nrow(tbl)), \n xlab = \"\", ylab = \"\")\n })\n\npatchwork::wrap_plots(rocs, ncol = 4)\n\n\n\n\nHmmm. That’s surprising, yes? Why during the summer months does our AUC go down. In fact, at times we are predicting the likelihood of not having an observation reported. It’s hard to know what to think, but consider that we are using a model generated across all months of multiple years and it might not predict a particular month and year very well. A step toward refinement, our next step is to make 12 models, one for each month."
+ "text": "6 Make a prediction\nNow we can make predictions with our basic model. We’ll do it two ways. First by simply feeding the input data used to create the model into the prediction. This might seems a bit circular, but it is perfectly reasonable to see how the model does on already labeled data. Second we’ll make a prediction for each month in 2020 using raster data.\n\n6.1 Predict with a data frame\nHere we provide a data frame, in our case the original input data, to the predict() function with type cloglog which transform the response value into the 0-1 range.\n\nprediction = predict(model, input_table, type = 'cloglog')\nhist(prediction, xlab = \"prediction\", main = \"Basic Model\")\n\n\n\n\n\n6.1.1 How did it do?\nWe can use some utilities in the maxnetic package to help us assess the model. The pAUC() function will compute statistics, include a presence-only AUC value. We need to pass it two items - the universe of predictions and the predictions for just the presence points.\n\nix = input_vector > 0\npauc = maxnetic::pAUC(prediction, prediction[ix])\nplot(pauc, title = \"v1.0 Basic Model\")\n\n\n\n\nOverall, this is telling us that the model isn’t especially strong as a prediction tool, but it is much better than a 50-50 guess (that’s when AUC is close to 0.5, and the curve follows the light grey line). Learn more about ROC and AUC here.\n\n\n\n6.2 Predict with rasters\nWe can also predict using raster inputs using our basic model. Let’s read in rasters for each month of 2019, and then run a prediction for each month.\nWe provide a function read_predictors() that will read and bind the rasters together for you given the filtered databases and paths. So, first we define the paths and filter the databases to point to just the months in 2019.\n\ndates = as.Date(c(\"2019-01-01\", \"2019-12-31\"))\n\nsst_path = \"data/oisst\"\nsst_db = oisster::read_database(sst_path) |>\n dplyr::arrange(date) |>\n dplyr::filter(dplyr::between(date, dates[1], dates[2]))\n\nwind_path = \"data/nbs\"\nwind_db = nbs::read_database(wind_path) |>\n dplyr::arrange(date)|>\n dplyr::filter(dplyr::between(date, dates[1], dates[2]))\n\nu_wind_db = wind_db |>\n dplyr::filter(param == \"u_wind\")|>\n dplyr::filter(dplyr::between(date, dates[1], dates[2]))\n\nv_wind_db = wind_db |>\n dplyr::filter(param == \"v_wind\")|>\n dplyr::filter(dplyr::between(date, dates[1], dates[2]))\n\npredictors = read_predictors(sst_db = sst_db,\n u_wind_db = u_wind_db,\n v_wind_db = v_wind_db)\npredictors\n\nstars object with 3 dimensions and 3 attributes\nattribute(s):\n Min. 1st Qu. Median Mean 3rd Qu. Max. NA's\nsst -1.558928 12.528449 19.5220385 17.6005908 23.501083 29.216452 11352\nu_wind -2.692028 1.144244 2.7007004 2.7228278 4.115177 13.148120 7612\nv_wind -5.431324 -1.411349 -0.3202622 -0.1398384 1.106175 4.636874 7612\ndimension(s):\n from to offset delta refsys point values x/y\nx 1 74 -76.38 0.25 WGS 84 FALSE NULL [x]\ny 1 46 46.12 -0.25 WGS 84 FALSE NULL [y]\ntime 1 12 NA NA Date NA 2019-01-01,...,2019-12-01 \n\n\nYou can see that we have the rasters in one object of three attributes (sst, u_wind and v_wind) each with 12 layers (Jan 2019 - Dec 2019). Now we can run the prediction.\n\npred = predict(model, predictors, type = 'cloglog')\npred\n\nstars object with 3 dimensions and 1 attribute\nattribute(s):\n Min. 1st Qu. Median Mean 3rd Qu. Max. 
NA's\npred 0.0001196393 0.1200618 0.2675931 0.3033565 0.4398977 0.8816952 11487\ndimension(s):\n from to offset delta refsys point values x/y\nx 1 74 -76.38 0.25 WGS 84 FALSE NULL [x]\ny 1 46 46.12 -0.25 WGS 84 FALSE NULL [y]\ntime 1 12 NA NA Date NA 2019-01-01,...,2019-12-01 \n\n\nSince we get a spatially mapped prediction back, we can plot it.\n\ncoast = rnaturalearth::ne_coastline(scale = 'large', returnclass = 'sf') |>\n sf::st_crop(pred)\n\nWarning: attribute variables are assumed to be spatially constant throughout\nall geometries\n\nplot_coast = function() {\n plot(sf::st_geometry(coast), col = 'green', add = TRUE)\n}\nplot(pred, hook = plot_coast)\n\n\n\n\nWell, that certainly looks appealing with higher likelihood of near shore observations occurring during the warmer months.\n\n6.2.1 How did it do?\nTo compute an ROC and AUC for each month, we have a little bit of work to do. We need to extract the observations locations for each month from the prediction maps. These we can then plot.\n\n\n\n\n\n\nNote\n\n\n\nWe have to modify the date for each point to be the first date of each month. That’s because our predictors are monthlies.\n\n\n\ntest_obs = obs |>\n dplyr::filter(dplyr::between(date, dates[1], dates[2])) |>\n dplyr::select(dplyr::all_of(\"date\")) |>\n dplyr::mutate(date = oisster::current_month(date))\n\nx = stars::st_extract(pred, test_obs, time_column = 'date') |>\n print()\n\nSimple feature collection with 612 features and 3 fields\nGeometry type: POINT\nDimension: XY\nBounding box: xmin: -75.7589 ymin: 35.1211 xmax: -65.48274 ymax: 43.83954\nGeodetic CRS: WGS 84\nFirst 10 features:\n pred time date geometry\n1 0.2759255 2019-05-01 2019-05-01 POINT (-67.32935 40.42509)\n2 0.7245142 2019-03-01 2019-03-01 POINT (-74.41057 36.49908)\n3 0.6664676 2019-12-01 2019-12-01 POINT (-75.3994 35.9457)\n4 0.4536477 2019-06-01 2019-06-01 POINT (-75.10864 36.94806)\n5 0.6864945 2019-04-01 2019-04-01 POINT (-74.49892 36.57275)\n6 0.3105710 2019-09-01 2019-09-01 POINT (-75.5519 36.1854)\n7 0.3874695 2019-09-01 2019-09-01 POINT (-73.6245 40.3317)\n8 0.3785449 2019-04-01 2019-04-01 POINT (-69.04389 39.82132)\n9 0.7447747 2019-04-01 2019-04-01 POINT (-74.59436 36.87291)\n10 0.7447747 2019-04-01 2019-04-01 POINT (-74.45753 36.72279)\n\n\nFinally we can build a table that merges the prediction with the labels. 
We are going to add the name of the month to group by that.\n\ny = x |>\n dplyr::mutate(month = factor(format(date, \"%b\"), levels = month.abb), \n .before = 1) |>\n dplyr::select(dplyr::all_of(c(\"month\", \"pred\", \"date\"))) |>\n dplyr::group_by(month) \n\ndplyr::count(y, month) |>\n print(n = 12)\n\nSimple feature collection with 12 features and 2 fields\nGeometry type: MULTIPOINT\nDimension: XY\nBounding box: xmin: -75.7589 ymin: 35.1211 xmax: -65.48274 ymax: 43.83954\nGeodetic CRS: WGS 84\n# A tibble: 12 × 3\n month n geometry\n * <fct> <int> <MULTIPOINT [°]>\n 1 Jan 21 ((-74.63902 36.26849), (-75.01758 36.49984), (-75.01801 36.72554…\n 2 Feb 7 ((-74.52432 37.24967), (-74.45561 37.16891), (-74.74373 36.72355…\n 3 Mar 23 ((-74.53117 36.26996), (-74.60195 36.72201), (-74.67127 36.72266…\n 4 Apr 169 ((-72.924 38.6733), (-73.0165 38.591), (-73.0036 38.56), (-73.10…\n 5 May 119 ((-74.56571 35.6059), (-75.2181 35.1934), (-75.3228 35.535), (-7…\n 6 Jun 53 ((-73.10608 38.72575), (-74.86204 36.27105), (-75.04656 36.34824…\n 7 Jul 48 ((-74.53554 36.19828), (-74.91756 36.27104), (-75.10905 36.27065…\n 8 Aug 39 ((-72.78628 38.68677), (-72.98868 38.61241), (-74.9889 36.2911),…\n 9 Sep 21 ((-75.3167 36.0439), (-75.5204 36.3294), (-75.5519 36.1854), (-7…\n10 Oct 79 ((-67.06445 42.91399), (-68.43614 43.83954), (-69.14391 43.16967…\n11 Nov 19 ((-72.52681 39.21286), (-71.54966 39.99385), (-67.79606 40.36107…\n12 Dec 14 ((-75.242 35.2705), (-75.3335 35.3027), (-75.436 35.1211), (-75.…\n\n\nNow how about one ROC plot for each month? Yikes! This requires a iterative approach, using group_map(), to compute the ROC for each month. We then follow with plot wrapping by the patchwork package.\n\npaucs = dplyr::group_map(y, \n function(tbl, key, pred_rasters = NULL){\n ix = key$month %in% month.abb\n x = dplyr::slice(pred_rasters, \"time\", ix)\n pauc = maxnetic::pAUC(x, tbl)\n plot(pauc,title = key$month, \n xlab = \"\", ylab = \"\")\n }, pred_rasters = pred)\n\npatchwork::wrap_plots(paucs, ncol = 4)\n\n\n\n\nHmmm. That’s surprising, yes? Why during the summer months does our AUC go down when we have the most number of observations? That seems counter intuitive."
+ },
+ {
+ "objectID": "modeling-01.html#thinking-about-auc",
+ "href": "modeling-01.html#thinking-about-auc",
+ "title": "Basic modeling",
+ "section": "7 Thinking about AUC",
+ "text": "7 Thinking about AUC\nAUC is a diagnostic that provides a peek into the predictive power of a model. But what is it? An analogy is fitting a straight line to a small set of observations verses a large set of observations and then comparing the correlation coefficients. Here’s a simple example using R’s built-in dataset cars which is a data frame of 50 observations of speed and stopping distances of cars. We’ll compute a linear model for the entire data set, and then a second for a small subsample of the data. (Learn more about linear models in R here.)\n\ndata(\"cars\")\ncars = dplyr::as_tibble(cars)\n\nall_fit = lm(dist ~ speed, data = cars)\nsummary(all_fit)\n\n\nCall:\nlm(formula = dist ~ speed, data = cars)\n\nResiduals:\n Min 1Q Median 3Q Max \n-29.069 -9.525 -2.272 9.215 43.201 \n\nCoefficients:\n Estimate Std. Error t value Pr(>|t|) \n(Intercept) -17.5791 6.7584 -2.601 0.0123 * \nspeed 3.9324 0.4155 9.464 1.49e-12 ***\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\nResidual standard error: 15.38 on 48 degrees of freedom\nMultiple R-squared: 0.6511, Adjusted R-squared: 0.6438 \nF-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12\n\n\n\nset.seed(5)\nsub_cars = dplyr::slice_sample(cars, n = 3)\nsub_fit = lm(dist ~ speed, data = sub_cars)\nsummary(sub_fit)\n\n\nCall:\nlm(formula = dist ~ speed, data = sub_cars)\n\nResiduals:\n 1 2 3 \n 3 3 -6 \n\nCoefficients:\n Estimate Std. Error t value Pr(>|t|)\n(Intercept) -6.5000 8.8741 -0.732 0.598\nspeed 3.3750 0.6495 5.196 0.121\n\nResidual standard error: 7.348 on 1 degrees of freedom\nMultiple R-squared: 0.9643, Adjusted R-squared: 0.9286 \nF-statistic: 27 on 1 and 1 DF, p-value: 0.121\n\n\nYou can see that the rU+00B2 value is quite high for the smaller data set, but the model may not be predictive over the full range of data. AUC is somewhat analogous to to rU+00B2 in that a relatively low score does not necessarily suggest a poor model.\n\nggplot2::ggplot(data = cars, ggplot2::aes(x = speed, y = dist)) +\n ggplot2::geom_point(color = \"blue\") +\n ggplot2::geom_abline(slope = coef(all_fit)[2], intercept = coef(all_fit)[1], color = \"blue\") + \n ggplot2::geom_point(data = sub_cars, ggplot2::aes(x = speed, y = dist), color = \"orange\") +\n ggplot2::geom_abline(slope = coef(sub_fit)[2], intercept = coef(sub_fit)[1], color = \"orange\")"
},
{
"objectID": "predictors.html",
@@ -193,13 +200,13 @@
"href": "modeling-02.html#do-we-model-every-month",
"title": "Modeling each month",
"section": "2 Do we model every month?",
- "text": "2 Do we model every month?\nLet’s do a quick check by counting each by month. Note that we drop the spatial info so that we can make simply tallies.\n\ncounts = sf::st_drop_geometry(obs) |> \n dplyr::count(month, name = \"n_obs\") |>\n dplyr::left_join(sf::st_drop_geometry(bkg) |> dplyr::count(month, name = \"n_bkg\"), \n by = 'month') |>\n print(n = 12)\n\n# A tibble: 12 × 3\n month n_obs n_bkg\n <fct> <int> <int>\n 1 Jan 33 51\n 2 Feb 40 57\n 3 Mar 50 79\n 4 Apr 341 528\n 5 May 541 943\n 6 Jun 2137 3471\n 7 Jul 2108 3233\n 8 Aug 1698 2597\n 9 Sep 725 1205\n10 Oct 328 485\n11 Nov 494 739\n12 Dec 66 90\n\n\nSo the colder months have fewer observations than the warmer months. We already knew that, but it will be interesting to see how that manifests itself in the models.\n\n2.1 Build the monthly models\n\n# A function for making one month's model\n#\n# @param tbl a data frame of one month's observations\n# @param key a data frame that holds the current iteration's month name\n# @param bkg a complete data frame of background data (which we filter for the given month)\n# @param path the path where the model is saved\n# @return a model, which is also saved in \"data/model/v2/v2.<monthname>\"\nmodel_month = function(tbl, key, bkg = NULL, path = \".\"){\n \n bkg = bkg |>\n dplyr::filter(month == key$month) |>\n sf::st_drop_geometry() |>\n dplyr::select(dplyr::all_of(c(\"sst\", \"u_wind\", \"v_wind\"))) |>\n na.omit()\n \n obs = tbl |>\n sf::st_drop_geometry() |>\n dplyr::select(dplyr::all_of(c(\"sst\", \"u_wind\", \"v_wind\"))) |>\n na.omit()\n \n # these are the predictor variables row bound\n x = dplyr::bind_rows(obs, bkg)\n \n # and the flag indicating presence/background\n flag = c(rep(1, nrow(obs)), rep(0, nrow(bkg)))\n \n model_path = file.path(path, paste0(\"v2.\", key$month, \".rds\"))\n\n model = maxnet::maxnet(flag, x) |>\n maxnetic::write_maxnet(model_path)\n \n model\n}\n\npath = file.path(\"data\", \"model\", \"v2\")\nok = dir.create(path, recursive = TRUE, showWarnings = FALSE)\nmodels = obs |>\n dplyr::group_by(month) |>\n dplyr::group_map(model_month, bkg = bkg, path = path) |>\n rlang::set_names(levels(obs$month))"
+ "text": "2 Do we model every month?\nLet’s do a quick check by counting each by month. Note that we drop the spatial info so that we can make simply tallies.\n\ncounts = sf::st_drop_geometry(obs) |> \n dplyr::count(month, name = \"n_obs\") |>\n dplyr::left_join(sf::st_drop_geometry(bkg) |> dplyr::count(month, name = \"n_bkg\"), \n by = 'month') |>\n print(n = 12)\n\n# A tibble: 12 × 3\n month n_obs n_bkg\n <fct> <int> <int>\n 1 Jan 33 51\n 2 Feb 40 57\n 3 Mar 50 79\n 4 Apr 341 528\n 5 May 541 943\n 6 Jun 2137 3471\n 7 Jul 2108 3233\n 8 Aug 1698 2597\n 9 Sep 725 1205\n10 Oct 328 485\n11 Nov 494 739\n12 Dec 66 90\n\n\nSo the colder months have fewer observations than the warmer months. We already knew that, but it will be interesting to see how that manifests itself in the models.\n\n2.1 Build the monthly models\nSince we are building 12 models (rather than one) it is useful to create a function that computes a model for any month, and then iterate through the months of the year.\n\n# A function for making one month's model\n#\n# @param tbl a data frame of one month's observations\n# @param key a data frame that holds the current iteration's month name\n# @param bkg a complete data frame of background data (which we filter for the given month)\n# @param path the path where the model is saved\n# @return a model, which is also saved in \"data/model/v2/v2.<monthname>\"\nmodel_month = function(tbl, key, bkg = NULL, path = \".\"){\n \n bkg = bkg |>\n dplyr::filter(month == key$month) |>\n sf::st_drop_geometry() |>\n dplyr::select(dplyr::all_of(c(\"sst\", \"u_wind\", \"v_wind\"))) |>\n na.omit()\n \n obs = tbl |>\n sf::st_drop_geometry() |>\n dplyr::select(dplyr::all_of(c(\"sst\", \"u_wind\", \"v_wind\"))) |>\n na.omit()\n \n # these are the predictor variables row bound\n x = dplyr::bind_rows(obs, bkg)\n \n # and the flag indicating presence/background\n flag = c(rep(1, nrow(obs)), rep(0, nrow(bkg)))\n \n model_path = file.path(path, paste0(\"v2.\", key$month, \".rds\"))\n\n model = maxnet::maxnet(flag, x) |>\n maxnetic::write_maxnet(model_path)\n \n model\n}\n\npath = file.path(\"data\", \"model\", \"v2\")\nok = dir.create(path, recursive = TRUE, showWarnings = FALSE)\nmodels = obs |>\n dplyr::group_by(month) |>\n dplyr::group_map(model_month, bkg = bkg, path = path) |>\n rlang::set_names(levels(obs$month))\n\nWe can look at the response plots for every month, but for demonstration purposes, we’ll just show one month.\n\nplot(models[['Jun']], type = 'cloglog')"
},
{
"objectID": "modeling-02.html#predict-with-rasters",
"href": "modeling-02.html#predict-with-rasters",
"title": "Modeling each month",
"section": "3 Predict with rasters",
- "text": "3 Predict with rasters\nFirst we load the raster databases as these are lightweight to pass into a function that iterates through the months.\n\n3.1 Load the raster databases (sst and u_wind and v_wind)\nWe also make sure they are in date order and add a “month” variable to each.\n\nsst_path = \"data/oisst\"\nsst_db = oisster::read_database(sst_path) |>\n dplyr::arrange(date) |>\n dplyr::mutate(month = format(date, \"%b\"))\n \n\nwind_path = \"data/nbs\"\nwind_db = nbs::read_database(wind_path) |>\n dplyr::arrange(date)|>\n dplyr::mutate(month = format(date, \"%b\"))\n\nu_wind_db = wind_db |>\n dplyr::filter(param == \"u_wind\")\n\nv_wind_db = wind_db |>\n dplyr::filter(param == \"v_wind\")\n\n\n\n3.2 Iterate through the months making predictions\nNow we can build an iterator function that will make a prediction for each month. Let’s narrow our predictions to just those for a particular year, 2019, and read the rasters in all at once.\n\ndates = as.Date(c(\"2019-01-01\", \"2019-12-31\"))\nx = read_predictors(\n sst_db = dplyr::filter(sst_db, dplyr::between(date, dates[1], dates[2])),\n u_wind_db = dplyr::filter(u_wind_db, dplyr::between(date, dates[1], dates[2])),\n v_wind_db = dplyr::filter(v_wind_db, dplyr::between(date, dates[1], dates[2]))\n)\n\nNow we can iterate through the months.\n\ndate_sequence = seq(from = dates[1], to = dates[2], by = \"month\")\npred_rasters = lapply(names(models),\n function(mon){\n ix = which(month.abb %in% mon)\n predict(models[[mon]], dplyr::slice(x, time, ix, drop), type = \"cloglog\")\n }) \npred_rasters = do.call(c, append(pred_rasters, list(along = list(time = date_sequence))))\n\nLet’s plot them.\n\ncoast = rnaturalearth::ne_coastline(scale = 'large', returnclass = 'sf') |>\n sf::st_geometry() |>\n sf::st_crop(pred_rasters)\n\nplot_coast = function() {\n plot(coast, col = 'green', add = TRUE)\n}\nplot(pred_rasters |> st_to_180(), hook = plot_coast)\n\n\n\n\nLet’s see what we can discern from the predict abilities. We can extract the predicted values at the observed locations.\n\npred_obs = stars::st_extract(pred_rasters, \n dplyr::filter(obs, dplyr::between(date, dates[1], dates[2])),\n time_column = \"month_id\") |>\n sf::st_drop_geometry() \npred_bkg = stars::st_extract(pred_rasters, \n dplyr::filter(bkg, dplyr::between(date, dates[1], dates[2])),\n time_column = \"month_id\") |>\n sf::st_drop_geometry() \n\npreds = dplyr::bind_rows(pred_obs, pred_bkg) |>\n dplyr::mutate(label = c(rep(1, nrow(pred_obs)), rep(0, nrow(pred_bkg))), .before = 1) |>\n dplyr::mutate(month = factor(format(time, \"%b\"), levels = month.abb)) |>\n dplyr::group_by(month)\n\n\naucs = dplyr::group_map(preds,\n function(x, y) {\n dplyr::tibble(month = y$month, auc = maxnetic::AUC(x))\n }) |>\n dplyr::bind_rows() |>\n dplyr::right_join(counts, by = \"month\") |>\n print(n=12)\n\n# A tibble: 12 × 4\n month auc n_obs n_bkg\n <fct> <dbl> <int> <int>\n 1 Jan 0.987 33 51\n 2 Feb 0.876 40 57\n 3 Mar 0.957 50 79\n 4 Apr 0.897 341 528\n 5 May 0.888 541 943\n 6 Jun 0.547 2137 3471\n 7 Jul 0.376 2108 3233\n 8 Aug 0.588 1698 2597\n 9 Sep 0.742 725 1205\n10 Oct 0.797 328 485\n11 Nov 0.873 494 739\n12 Dec 0.995 66 90\n\n\nOK, that’s unexpected. The months with the lower counts of observations have relatively higher AUCs. Huh? 
Let’s look at that graphically.\n\naucs_long = tidyr::pivot_longer(aucs, dplyr::all_of(c(\"n_obs\", \"n_bkg\")),\n names_to = \"type\", values_to = \"count\") |>\n dplyr::mutate(type = dplyr::recode(type, n_obs = \"obs\", n_bkg = \"bkg\"))\n\nggplot2::ggplot(data = aucs_long, aes(x = count, y = auc, color = type)) +\n ggplot2::geom_point() + \n ggplot2::geom_smooth(method='lm', formula= y~x)\n\n\n\n\nSurprised? Could this be overfitting resulting from sampling background in time weighted to the months when we have observations? Hmmmm."
+ "text": "3 Predict with rasters\nFirst we load the raster databases as these are lightweight to pass into a function that iterates through the months.\n\n3.1 Load the raster databases (sst and u_wind and v_wind)\nWe also make sure they are in date order and add a “month” variable to each.\n\nsst_path = \"data/oisst\"\nsst_db = oisster::read_database(sst_path) |>\n dplyr::arrange(date) |>\n dplyr::mutate(month = format(date, \"%b\"))\n \n\nwind_path = \"data/nbs\"\nwind_db = nbs::read_database(wind_path) |>\n dplyr::arrange(date)|>\n dplyr::mutate(month = format(date, \"%b\"))\n\nu_wind_db = wind_db |>\n dplyr::filter(param == \"u_wind\")\n\nv_wind_db = wind_db |>\n dplyr::filter(param == \"v_wind\")\n\n\n\n3.2 Iterate through the months making predictions\nNow we can build an iterator function that will make a prediction for each month. Let’s narrow our predictions to just those for a particular year, 2019, and read the rasters in all at once.\n\ndates = as.Date(c(\"2019-01-01\", \"2019-12-31\"))\nx = read_predictors(\n sst_db = dplyr::filter(sst_db, dplyr::between(date, dates[1], dates[2])),\n u_wind_db = dplyr::filter(u_wind_db, dplyr::between(date, dates[1], dates[2])),\n v_wind_db = dplyr::filter(v_wind_db, dplyr::between(date, dates[1], dates[2]))\n)\n\nNow we can iterate through the months.\n\ndate_sequence = seq(from = dates[1], to = dates[2], by = \"month\")\npred_rasters = lapply(names(models),\n function(mon){\n ix = which(month.abb %in% mon)\n predict(models[[mon]], dplyr::slice(x, time, ix, drop), type = \"cloglog\")\n }) \npred_rasters = do.call(c, append(pred_rasters, list(along = list(time = date_sequence))))\n\nLet’s plot them.\n\ncoast = rnaturalearth::ne_coastline(scale = 'large', returnclass = 'sf') |>\n sf::st_geometry() |>\n sf::st_crop(pred_rasters)\n\nplot_coast = function() {\n plot(coast, col = 'green', add = TRUE)\n}\nplot(pred_rasters, hook = plot_coast)\n\n\n\n\nLet’s see what we can discern from the predict abilities. We can extract the predicted values at the observed locations. Having those in hand allows us to compute pAUC for each month.\n\npred_obs = stars::st_extract(pred_rasters, \n dplyr::filter(obs, dplyr::between(date, dates[1], dates[2])),\n time_column = \"month_id\") |>\n dplyr::mutate(month = factor(format(month_id, \"%b\"), levels = month.abb)) |>\n dplyr::group_by(month)\n\npaucs = dplyr::group_map(pred_obs,\n function(x, y) {\n ix = month.abb %in% y$month\n s = dplyr::slice(pred_rasters, \"time\", ix)\n pauc = maxnetic::pAUC(s,x)\n dplyr::tibble(month = y$month, \n auc = pauc$area,\n pauc = list(pauc))\n })|>\n dplyr::bind_rows() |>\n print(n = 12)\n\n# A tibble: 12 × 3\n month auc pauc \n <fct> <dbl> <list> \n 1 Jan 0.703 <pAUC [3]>\n 2 Feb 0.689 <pAUC [3]>\n 3 Mar 0.698 <pAUC [3]>\n 4 Apr 0.677 <pAUC [3]>\n 5 May 0.654 <pAUC [3]>\n 6 Jun 0.662 <pAUC [3]>\n 7 Jul 0.665 <pAUC [3]>\n 8 Aug 0.696 <pAUC [3]>\n 9 Sep 0.663 <pAUC [3]>\n10 Oct 0.633 <pAUC [3]>\n11 Nov 0.627 <pAUC [3]>\n12 Dec 0.665 <pAUC [3]>\n\n\nNote that last element, pauc, is the result returned by the maxnetic::pAUC() function which we can plot.\n\npp = paucs |>\n dplyr::group_by(month) |>\n dplyr::group_map(\n function(tbl, key){\n plot(tbl$pauc[[1]], title = key$month, xlab = \"\", ylab = \"\")\n }\n )\npatchwork::wrap_plots(pp, ncol = 4)\n\n\n\n\nWell, it would be easy to become dispirited by this result. It would be reasonable to expect AUC values to improve if we built monthly models rather than a single model applied to any month. But it seems to not be so. Darn!"
}
]
\ No newline at end of file
diff --git a/functions/stars.R b/functions/stars.R
index e224403..b8f13fa 100644
--- a/functions/stars.R
+++ b/functions/stars.R
@@ -46,5 +46,5 @@ read_predictors = function(
} else {
x = do.call(c, append(xx, list(along = NA_integer_)))
}
- x
+ st_to_180(x)
}
\ No newline at end of file
diff --git a/modeling-01.qmd b/modeling-01.qmd
index 2975284..aa96d43 100644
--- a/modeling-01.qmd
+++ b/modeling-01.qmd
@@ -1,6 +1,6 @@
---
title: "Basic modeling"
-cache: true
+cache: false
---
So at this point we have point data for observation and background that have been joined with common environmental covariates (aka predictors). Here we show the basic steps taken to prepare, build and assess a model. Later, we'll try more sophisticated modeling, such as modeling by month or splitting the data into training-testing groups.
@@ -116,18 +116,21 @@ hist(prediction, xlab = "prediction", main = "Basic Model")
#### How did it do?
-We can use some utilities in the [maxnetic](https://github.com/BigelowLab/maxnetic) package to help us assess the model. First, we need to create a table with two columns: `label` and `pred`. Label is the simple a vector of 0/1 indicating that the predicted value is known to be either background or presence. We already have that in our `input_vector`. Pred is simple the 0-1 scale predicted value. Once we have that we can craft a [receiver operator characteristic curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) and compute it's [AUC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve).
+We can use some utilities in the [maxnetic](https://github.com/BigelowLab/maxnetic) package to help us assess the model. The `pAUC()` function will compute statistics, including a presence-only AUC value. We need to pass it two items: the universe of predictions and the predictions for just the presence points.
```{r}
-x = dplyr::tibble(label = input_vector, pred = as.vector(prediction))
-plot_ROC(x, title = "v1.0 Basic Model")
+ix = input_vector > 0
+pauc = maxnetic::pAUC(prediction, prediction[ix])
+plot(pauc, title = "v1.0 Basic Model")
```
Overall, this is telling us that the model isn't especially strong as a prediction tool, but it is much better than a 50-50 guess (that's when AUC is close to 0.5, and the curve follows the light grey line). Learn more about ROC and AUC [here](https://rviews.rstudio.com/2019/01/17/roc-curves/).
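+
+As a quick sanity check on that interpretation (a small aside, not part of the original workflow; it assumes `pAUC()` accepts plain numeric vectors, just as in the chunk above), random predictions should score near 0.5:
+
+```{r}
+set.seed(42)
+# with random scores, presence and background predictions are interchangeable,
+# so the presence-only AUC should hover near 0.5
+random_pred = runif(length(input_vector))
+maxnetic::pAUC(random_pred, random_pred[input_vector > 0])$area
+```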
### Predict with rasters
-We can also predict using raster inputs using our basic model. Let's read in rasters for each month of 2018, and then run a prediction for each month.
+We can also predict using raster inputs using our basic model. Let's read in rasters for each month of 2019, and then run a prediction for each month.
+
+We provide a function `read_predictors()` that will read and bind the rasters together for you, given the filtered databases and paths. So first we define the paths and filter the databases to point to just the months in 2019.
```{r}
dates = as.Date(c("2019-01-01", "2019-12-31"))
@@ -136,14 +139,6 @@ sst_path = "data/oisst"
sst_db = oisster::read_database(sst_path) |>
dplyr::arrange(date) |>
dplyr::filter(dplyr::between(date, dates[1], dates[2]))
-
-
-sst = sst_db |>
- oisster::compose_filename(path = sst_path) |>
- stars::read_stars(along = list(time = sst_db$date)) |>
- rlang::set_names("sst")|>
- st_to_180()
-
wind_path = "data/nbs"
wind_db = nbs::read_database(wind_path) |>
@@ -153,33 +148,18 @@ wind_db = nbs::read_database(wind_path) |>
u_wind_db = wind_db |>
dplyr::filter(param == "u_wind")|>
dplyr::filter(dplyr::between(date, dates[1], dates[2]))
-u_wind = u_wind_db |>
- nbs::compose_filename(path = wind_path) |>
- stars::read_stars(along = list(time = u_wind_db$date)) |>
- rlang::set_names("u_wind") |>
- st_to_180()
v_wind_db = wind_db |>
dplyr::filter(param == "v_wind")|>
dplyr::filter(dplyr::between(date, dates[1], dates[2]))
-v_wind = v_wind_db |>
- nbs::compose_filename(path = wind_path) |>
- stars::read_stars(along = list(time = v_wind_db$date)) |>
- rlang::set_names("v_wind") |>
- st_to_180()
-```
-
-
-Once we have them in hand we need to bind them together. But we need to attend to common but important issue. The `sst` rasters and `windspeed` rasters have different extents. We can't bind them together until we warp one set to match the other. Let's warp `sst` to match `u_wind`. And then we can bind them together.
-```{r}
-sst_warped = stars::st_warp(sst, u_wind)
-x = list(sst_warped, u_wind, v_wind)
-predictors = do.call(c, append(x, list(along = NA_integer_)))
+predictors = read_predictors(sst_db = sst_db,
+ u_wind_db = u_wind_db,
+ v_wind_db = v_wind_db)
predictors
```
-Now we can run the prediction.
+You can see that we have the rasters in one object with three attributes (`sst`, `u_wind` and `v_wind`), each with 12 layers (Jan 2019 - Dec 2019). Now we can run the prediction.
```{r}
pred = predict(model, predictors, type = 'cloglog')
@@ -202,7 +182,7 @@ Well, that certainly looks appealing with higher likelihood of near shore observ
#### How did it do?
-To compute an ROC and AUC for each month, we have a little bit of work to do. We need to extract the observations and background for each month from the prediction maps. These we can then pass to the `plot_ROC()` function.
+To compute an ROC and AUC for each month, we have a little bit of work to do. We need to extract the observation locations for each month from the prediction maps. These we can then plot.
:::{.callout-note}
We have to modify the date for each point to be the first date of each month. That's because our predictors are monthlies.
@@ -214,14 +194,7 @@ test_obs = obs |>
dplyr::select(dplyr::all_of("date")) |>
dplyr::mutate(date = oisster::current_month(date))
-test_bkg = bkg |>
- dplyr::filter(dplyr::between(date, dates[1], dates[2])) |>
- dplyr::select(dplyr::all_of("date")) |>
- dplyr::mutate(date = oisster::current_month(date))
-
-test_input = dplyr::bind_rows(test_obs, test_bkg)
-
-x = stars::st_extract(pred, test_input, time_column = 'date') |>
+x = stars::st_extract(pred, test_obs, time_column = 'date') |>
print()
```
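+
+As an aside, `oisster::current_month()` above presumably floors each date to the first day of its month - that's the adjustment the callout describes. A base-R equivalent, purely for illustration:
+
+```{r}
+# e.g. 2019-06-17 becomes 2019-06-01
+as.Date(format(as.Date("2019-06-17"), "%Y-%m-01"))
+```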
@@ -229,30 +202,63 @@ Finally we can build a table that merges the prediction with the labels. We are
```{r}
y = x |>
- dplyr::mutate(label = c(rep(1, nrow(test_obs)), rep(0, nrow(test_bkg))),
- month = factor(format(date, "%b"), levels = month.abb),
- .before = 2) |>
- sf::st_drop_geometry() |>
- dplyr::select(dplyr::all_of(c("month", "label", "pred"))) |>
+ dplyr::mutate(month = factor(format(date, "%b"), levels = month.abb),
+ .before = 1) |>
+ dplyr::select(dplyr::all_of(c("month", "pred", "date"))) |>
dplyr::group_by(month)
-dplyr::count(y, month, label) |>
- print(n = 24)
+dplyr::count(y, month) |>
+ print(n = 12)
```
Now how about one ROC plot for each month? Yikes! This requires an iterative approach, using `group_map()`, to compute the ROC for each month. We then follow with plot wrapping by the [patchwork](https://patchwork.data-imaginist.com/articles/guides/assembly.html#functional-assembly) package.
```{r}
#| width: "100%"
-rocs = dplyr::group_map(y,
- function(tbl, key){
- maxnetic::plot_ROC(tbl, title = sprintf("%s, n = %i", key$month, nrow(tbl)),
- xlab = "", ylab = "")
- })
+paucs = dplyr::group_map(y,
+ function(tbl, key, pred_rasters = NULL){
+ ix = month.abb %in% key$month
+ x = dplyr::slice(pred_rasters, "time", ix)
+ pauc = maxnetic::pAUC(x, tbl)
+ plot(pauc, title = key$month,
+ xlab = "", ylab = "")
+ }, pred_rasters = pred)
+
+patchwork::wrap_plots(paucs, ncol = 4)
+```
-patchwork::wrap_plots(rocs, ncol = 4)
+Hmmm. That's surprising, yes? Why does our AUC go down during the summer months, when we have the most observations? That seems counterintuitive.
+
+## Thinking about AUC
+
+AUC is a diagnostic that provides a peek into the predictive power of a model. But what is it? An analogy is fitting a straight line to a small set of observations versus a large set of observations and then comparing the correlation coefficients. Here's a simple example using R's built-in dataset `cars`, which is a data frame of 50 observations of car speeds and stopping distances. We'll compute a linear model for the entire data set, and then a second for a small subsample of the data. (Learn more about linear models in R [here](https://rseek.org/?q=linear+models).)
+
+```{r}
+data("cars")
+cars = dplyr::as_tibble(cars)
+
+all_fit = lm(dist ~ speed, data = cars)
+summary(all_fit)
+```
+
+```{r}
+set.seed(5)
+sub_cars = dplyr::slice_sample(cars, n = 3)
+sub_fit = lm(dist ~ speed, data = sub_cars)
+summary(sub_fit)
```
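+
+For a compact side-by-side comparison, we can pull the two r² values straight from the model summaries (`summary.lm()` stores this as `r.squared`):
+
+```{r}
+c(all = summary(all_fit)$r.squared, sub = summary(sub_fit)$r.squared)
+```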
-Hmmm. That's surprising, yes? Why during the summer months does our AUC go down. In fact, at times we are predicting the likelihood of **not** having an observation reported. It's hard to know what to think, but consider that we are using a model generated across all months of multiple years and it might not predict a particular month and year very well. A step toward refinement, our next step is to make 12 models, one for each month.
+You can see that the `r²` value is quite high for the smaller data set, but the model may not be predictive over the full range of data. AUC is somewhat analogous to `r²` in that a relatively low score does not necessarily suggest a poor model.
+
+```{r}
+ggplot2::ggplot(data = cars, ggplot2::aes(x = speed, y = dist)) +
+ ggplot2::geom_point(color = "blue") +
+ ggplot2::geom_abline(slope = coef(all_fit)[2], intercept = coef(all_fit)[1], color = "blue") +
+ ggplot2::geom_point(data = sub_cars, ggplot2::aes(x = speed, y = dist), color = "orange") +
+ ggplot2::geom_abline(slope = coef(sub_fit)[2], intercept = coef(sub_fit)[1], color = "orange")
+```
+
+
+
diff --git a/modeling-01_files/figure-html/unnamed-chunk-11-1.png b/modeling-01_files/figure-html/unnamed-chunk-11-1.png
new file mode 100644
index 0000000..b605749
Binary files /dev/null and b/modeling-01_files/figure-html/unnamed-chunk-11-1.png differ
diff --git a/modeling-01_files/figure-html/unnamed-chunk-14-1.png b/modeling-01_files/figure-html/unnamed-chunk-14-1.png
new file mode 100644
index 0000000..6059e6c
Binary files /dev/null and b/modeling-01_files/figure-html/unnamed-chunk-14-1.png differ
diff --git a/modeling-01_files/figure-html/unnamed-chunk-17-1.png b/modeling-01_files/figure-html/unnamed-chunk-17-1.png
new file mode 100644
index 0000000..33eebe3
Binary files /dev/null and b/modeling-01_files/figure-html/unnamed-chunk-17-1.png differ
diff --git a/modeling-01_files/figure-html/unnamed-chunk-8-1.png b/modeling-01_files/figure-html/unnamed-chunk-8-1.png
index 87068ba..1102229 100644
Binary files a/modeling-01_files/figure-html/unnamed-chunk-8-1.png and b/modeling-01_files/figure-html/unnamed-chunk-8-1.png differ
diff --git a/modeling-02.qmd b/modeling-02.qmd
index 5777100..2abc3ae 100644
--- a/modeling-02.qmd
+++ b/modeling-02.qmd
@@ -37,8 +37,9 @@ So the colder months have fewer observations than the warmer months. We already
### Build the monthly models
-```{r}
+Since we are building 12 models (rather than one), it is useful to create a function that computes a model for any given month, and then iterate through the months of the year.
+```{r}
# A function for making one month's model
#
# @param tbl a data frame of one month's observations
@@ -81,8 +82,13 @@ models = obs |>
rlang::set_names(levels(obs$month))
```
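+
+The function body itself is elided by the diff context. Purely as a hypothetical sketch (the name, the `bkg` argument and the covariate selection are assumptions pieced together from the comments above and the covariates used elsewhere), it might look something like:
+
+```{r}
+#| eval: false
+# hypothetical sketch only - not the actual function defined above
+model_month = function(tbl, key, bkg = NULL) {
+  # pair one month's observations with the same month's background points
+  b = dplyr::filter(bkg, month == key$month)
+  vars = c("sst", "u_wind", "v_wind")
+  maxnet::maxnet(c(rep(1, nrow(tbl)), rep(0, nrow(b))),
+                 dplyr::bind_rows(dplyr::select(tbl, dplyr::all_of(vars)),
+                                  dplyr::select(b, dplyr::all_of(vars))))
+}
+```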
-## Predict with rasters
+We can look at the response plots for every month, but for demonstration purposes, we'll just show one month.
+```{r}
+plot(models[['Jun']], type = 'cloglog')
+```
+
+## Predict with rasters
First we load the raster databases as these are lightweight to pass into a function that iterates through the months.
### Load the raster databases (`sst` and `u_wind` and `v_wind`)
@@ -142,10 +148,10 @@ coast = rnaturalearth::ne_coastline(scale = 'large', returnclass = 'sf') |>
plot_coast = function() {
plot(coast, col = 'green', add = TRUE)
}
-plot(pred_rasters |> st_to_180(), hook = plot_coast)
+plot(pred_rasters, hook = plot_coast)
```
-Let's see what we can discern from the predict abilities. We can extract the predicted values at the observed locations.
+Let's see what we can discern from the predictive abilities. We can extract the predicted values at the observed locations. Having those in hand allows us to compute pAUC for each month.
@@ -153,36 +159,33 @@ Let's see what we can discern from the predict abilities. We can extract the pre
pred_obs = stars::st_extract(pred_rasters,
dplyr::filter(obs, dplyr::between(date, dates[1], dates[2])),
time_column = "month_id") |>
- sf::st_drop_geometry()
-pred_bkg = stars::st_extract(pred_rasters,
- dplyr::filter(bkg, dplyr::between(date, dates[1], dates[2])),
- time_column = "month_id") |>
- sf::st_drop_geometry()
-
-preds = dplyr::bind_rows(pred_obs, pred_bkg) |>
- dplyr::mutate(label = c(rep(1, nrow(pred_obs)), rep(0, nrow(pred_bkg))), .before = 1) |>
- dplyr::mutate(month = factor(format(time, "%b"), levels = month.abb)) |>
+ dplyr::mutate(month = factor(format(month_id, "%b"), levels = month.abb)) |>
dplyr::group_by(month)
-
-aucs = dplyr::group_map(preds,
+paucs = dplyr::group_map(pred_obs,
function(x, y) {
- dplyr::tibble(month = y$month, auc = maxnetic::AUC(x))
- }) |>
+ ix = month.abb %in% y$month
+ s = dplyr::slice(pred_rasters, "time", ix)
+ pauc = maxnetic::pAUC(s, x)
+ dplyr::tibble(month = y$month,
+ auc = pauc$area,
+ pauc = list(pauc))
+ }) |>
dplyr::bind_rows() |>
- dplyr::right_join(counts, by = "month") |>
- print(n=12)
+ print(n = 12)
```
-OK, that's unexpected. The months with the lower counts of observations have relatively higher AUCs. Huh? Let's look at that graphically.
+Note that the last element, `pauc`, is the result returned by the `maxnetic::pAUC()` function, which we can plot.
```{r}
-aucs_long = tidyr::pivot_longer(aucs, dplyr::all_of(c("n_obs", "n_bkg")),
- names_to = "type", values_to = "count") |>
- dplyr::mutate(type = dplyr::recode(type, n_obs = "obs", n_bkg = "bkg"))
-
-ggplot2::ggplot(data = aucs_long, aes(x = count, y = auc, color = type)) +
- ggplot2::geom_point() +
- ggplot2::geom_smooth(method='lm', formula= y~x)
+pp = paucs |>
+ dplyr::group_by(month) |>
+ dplyr::group_map(
+ function(tbl, key){
+ plot(tbl$pauc[[1]], title = key$month, xlab = "", ylab = "")
+ }
+ )
+patchwork::wrap_plots(pp, ncol = 4)
```
-Surprised? Could this be overfitting resulting from sampling background in time weighted to the months when we have observations? Hmmmm.
+
+Well, it would be easy to become dispirited by this result. It would be reasonable to expect AUC values to improve if we built monthly models rather than a single model applied to any month. But it seems not to be so. Darn!
\ No newline at end of file
diff --git a/modeling-02_files/figure-html/unnamed-chunk-10-1.png b/modeling-02_files/figure-html/unnamed-chunk-10-1.png
new file mode 100644
index 0000000..fcdcd3e
Binary files /dev/null and b/modeling-02_files/figure-html/unnamed-chunk-10-1.png differ
diff --git a/modeling-02_files/figure-html/unnamed-chunk-4-1.png b/modeling-02_files/figure-html/unnamed-chunk-4-1.png
new file mode 100644
index 0000000..c1388b1
Binary files /dev/null and b/modeling-02_files/figure-html/unnamed-chunk-4-1.png differ
diff --git a/modeling-02_files/figure-html/unnamed-chunk-7-1.png b/modeling-02_files/figure-html/unnamed-chunk-7-1.png
index df52a54..6b8c08d 100644
Binary files a/modeling-02_files/figure-html/unnamed-chunk-7-1.png and b/modeling-02_files/figure-html/unnamed-chunk-7-1.png differ
diff --git a/modeling-02_files/figure-html/unnamed-chunk-8-1.png b/modeling-02_files/figure-html/unnamed-chunk-8-1.png
new file mode 100644
index 0000000..6b8c08d
Binary files /dev/null and b/modeling-02_files/figure-html/unnamed-chunk-8-1.png differ
diff --git a/modeling-02_files/figure-html/unnamed-chunk-9-1.png b/modeling-02_files/figure-html/unnamed-chunk-9-1.png
index 03d16aa..fcdcd3e 100644
Binary files a/modeling-02_files/figure-html/unnamed-chunk-9-1.png and b/modeling-02_files/figure-html/unnamed-chunk-9-1.png differ