diff --git a/.Rhistory b/.Rhistory
index c6ddf4b..2226a49 100644
--- a/.Rhistory
+++ b/.Rhistory
@@ -1,38 +1,3 @@
-limits = c(0.04, 0.1),
-labels = scales::percent) +
-scale_y_continuous(expand = expansion(mult = c(0, 0.002)),
-breaks = c(0, 0.01, 0.02, 0.03, 0.04, 0.05),
-limits = c(0, 0.05),
-labels = scales::percent) +
-labs(x = "Seasonally-adjusted unemployment rate",
-y = "Seasonally-adjusted vacancy rate") +
-scatter_grid()
-# Chunk 22: slope-plot
-# https://www.bls.gov/lau/
-library(ggrepel)
-unemployment <- tibble(
-time = c("October 2009", "October 2009", "October 2009", "August 2017", "August 2017", "August 2017"),
-rate = c(7.4, 7.1, 10.0, 3.9, 3.8, 6.4),
-state = c("Maryland", "Virginia", "Washington, D.C.", "Maryland", "Virginia", "Washington, D.C.")
-)
-label <- tibble(label = c("October 2009", "August 2017"))
-october <- filter(unemployment, time == "October 2009")
-august <- filter(unemployment, time == "August 2017")
-unemployment %>%
-mutate(time = factor(time, levels = c("October 2009", "August 2017")),
-state = factor(state, levels = c("Washington, D.C.", "Maryland", "Virginia"))) %>%
-ggplot() +
-geom_line(aes(time, rate, group = state, color = state), show.legend = FALSE) +
-geom_point(aes(x = time, y = rate, color = state)) +
-labs(subtitle = "Unemployment Rate") +
-theme(axis.ticks.x = element_blank(),
-axis.title.x = element_blank(),
-axis.ticks.y = element_blank(),
-axis.title.y = element_blank(),
-axis.text.y = element_blank(),
-panel.grid.major.y = element_blank(),
-panel.grid.minor.y = element_blank(),
-panel.grid.major.x = element_blank(),
axis.line = element_blank()) +
geom_text_repel(data = october, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = -0.06) +
geom_text_repel(data = august, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = 0.06)
@@ -510,3 +475,31 @@ palette_urbn_main[1:4]
palette_urbn_spacegray[1:5]
# Chunk 59: System Info and Package Versioning
sessionInfo()
+renv::status()
+library(tidyverse)
+# install ggsankey package (only needed once)
+remotes::install_github("davidsjoberg/ggsankey")
+library(ggsankey)
+# create a dummy dataset of housing status
+df <- tibble(entry_status = c(rep("Housed", 7), rep("Unhoused", 15), rep("Staying w/ Family", 8)),
+             exit_status = c(rep("Housed", 15), rep("Unhoused", 2), rep("Staying w/ Family", 13))) %>%
+  # transform the data frame into the proper format for the sankey plot
+  make_long(entry_status, exit_status) %>%
+  # recode the labels to be cleaner in the plot
+  mutate(x = recode(x, entry_status = "Prior Housing Status", exit_status = "Exit Housing Status"),
+         next_x = recode(next_x, entry_status = "Prior Housing Status", exit_status = "Exit Housing Status"))
+# create sankey plot
+ggplot(df, aes(x = x,
+               next_x = next_x,
+               node = node,
+               next_node = next_node,
+               fill = factor(node),
+               label = node)) +
+  geom_sankey(flow.alpha = 0.5, node.color = 1, show.legend = FALSE) +
+  # add labels to plot and style
+  geom_sankey_label(size = 3.5, color = 1, fill = "white") +
+  theme_sankey(base_size = 16) +
+  labs(x = NULL)
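+# Illustrative sketch, not from the original history: saving the sankey plot
+# to disk with ggplot2's ggsave(), which writes the last plot drawn.
+# The file name and dimensions below are hypothetical.
+ggsave("sankey_housing_status.png", width = 8, height = 5)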
citation:","title-block-author-single":"Author","title-block-author-plural":"Authors","title-block-affiliation-single":"Affiliation","title-block-affiliation-plural":"Affiliations","title-block-published":"Published","title-block-modified":"Modified","callout-tip-title":"Tip","callout-note-title":"Note","callout-warning-title":"Warning","callout-important-title":"Important","callout-caution-title":"Caution","code-summary":"Code","code-tools-menu-caption":"Code","code-tools-show-all-code":"Show All Code","code-tools-hide-all-code":"Hide All Code","code-tools-view-source":"View Source","code-tools-source-code":"Source Code","code-line":"Line","code-lines":"Lines","copy-button-tooltip":"Copy to Clipboard","copy-button-tooltip-success":"Copied!","repo-action-links-edit":"Edit this page","repo-action-links-source":"View source","repo-action-links-issue":"Report an issue","back-to-top":"Back to top","search-no-results-text":"No results","search-matching-documents-text":"matching documents","search-copy-link-title":"Copy link to search","search-hide-matches-text":"Hide additional matches","search-more-match-text":"more match in this document","search-more-matches-text":"more matches in this document","search-clear-button-title":"Clear","search-detached-cancel-button-title":"Cancel","search-submit-button-title":"Submit","search-label":"Search","toggle-section":"Toggle section","toggle-sidebar":"Toggle sidebar navigation","toggle-dark-mode":"Toggle dark mode","toggle-reader-mode":"Toggle reader mode","toggle-navigation":"Toggle navigation","crossref-fig-title":"Figure","crossref-tbl-title":"Table","crossref-lst-title":"Listing","crossref-thm-title":"Theorem","crossref-lem-title":"Lemma","crossref-cor-title":"Corollary","crossref-prp-title":"Proposition","crossref-cnj-title":"Conjecture","crossref-def-title":"Definition","crossref-exm-title":"Example","crossref-exr-title":"Exercise","crossref-ch-prefix":"Chapter","crossref-apx-prefix":"Appendix","crossref-sec-prefix":"Section","crossref-eq-prefix":"Equation","crossref-lof-title":"List of Figures","crossref-lot-title":"List of Tables","crossref-lol-title":"List of Listings","environment-proof-title":"Proof","environment-remark-title":"Remark","environment-solution-title":"Solution","listing-page-order-by":"Order By","listing-page-order-by-default":"Default","listing-page-order-by-date-asc":"Oldest","listing-page-order-by-date-desc":"Newest","listing-page-order-by-number-desc":"High to Low","listing-page-order-by-number-asc":"Low to High","listing-page-field-date":"Date","listing-page-field-title":"Title","listing-page-field-description":"Description","listing-page-field-author":"Author","listing-page-field-filename":"File Name","listing-page-field-filemodified":"Modified","listing-page-field-subtitle":"Subtitle","listing-page-field-readingtime":"Reading Time","listing-page-field-categories":"Categories","listing-page-minutes-compact":"{0} min","listing-page-category-all":"All","listing-page-no-matches":"No matching items"},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.3.433"},"extensions":{"book":{"multiFile":true}}}},"projectFormats":["html"]}
\ No newline at end of file
diff --git a/.quarto/idx/graphics-guide.qmd.json b/.quarto/idx/graphics-guide.qmd.json
index 91adc8d..336dd69 100644
--- a/.quarto/idx/graphics-guide.qmd.json
+++ b/.quarto/idx/graphics-guide.qmd.json
@@ -1 +1 @@
-{"title":"Urban Institute R Graphics Guide","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"editor_options":{"chunk_output_type":"console"}},"headingText":"Urban Institute R Graphics Guide","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n\n```{r setup, include=FALSE}\nlibrary(knitr)\nlibrary(datasets)\nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nopts_chunk$set(fig.path = \"graphics-guide/www/images/\")\nopts_chunk$set(echo = TRUE)\nopts_chunk$set(warning = FALSE)\nopts_chunk$set(message = FALSE)\nopts_chunk$set(fig.width = 6.5)\nopts_chunk$set(fig.height = 4)\nopts_chunk$set(fig.retina = 3)\noptions(scipen = 999)\n```\n\nR is a powerful, open-source programming language and environment. R excels at data management and munging, traditional statistical analysis, machine learning, and reproducible research, but it is probably best known for its graphics. This guide contains examples and instructions for popular and lesser-known plotting techniques in R. It also includes instructions for using `urbnthemes`, the Urban Institute's R package for creating near-publication-ready plots with `ggplot2`. If you have any questions, please don't hesitate to contact Aaron Williams (awilliams\\@urban.org) or Kyle Ueyama (kueyama\\@urban.org).\n\n### Background\n\n`library(urbnthemes)` makes `ggplot2` output align more closely with [the Urban Institute's Data Visualization style guide](http://urbaninstitute.github.io/graphics-styleguide/). This package does **not produce publication ready graphics**. Visual styles must still be edited using your project/paper's normal editing workflow.\n\nExporting charts as a pdf will allow them to be more easily edited. See the Saving Plots section for more information.\n\nThe theme has been tested against `ggplot2 version 3.0.0`. It will not function properly with older versions of `ggplot2`\n\n### Using library(urbnthemes)\n\nRun the following code to install or update `urbnthemes`:\n\n install.packages(\"remotes\")\n remotes::install_github(\"UrbanInstitute/urbnthemes\")\n\nRun the following code at the top of each script:\n\n library(tidyverse)\n library(urbnthemes)\n\n set_urbn_defaults(style = \"print\")\n\n### Installing Lato {#installing_lato}\n\nYour Urban computer may not have the Lato font installed. If it is not installed, please install the free [Lato font from Google](https://www.google.com/fonts/specimen/Lato). Below are step by step instructions:\n\n1) Download the [Lato font](https://www.google.com/fonts/specimen/Lato) (as a zip file).\n2) Unzip the file on your computer.\n3) For each `.ttf` file in the unzipped `Lato/` folder, double click the file and click `Install` (on Windows) or `Install Font` (on Mac).\n4) Import and register Lato into R by running `urbnthemes::lato_import()` in the console once. Be patient as this may take a few minutes!\n5) To confirm installation, run `urbnthemes::lato_test()`. If this is successful you're done and Lato will automatically be used when creating plots with `library(urbnthemes)`. You only need to install Lato once per computer.\n\nWaffle charts with glyphs require fontawesome. `fontawesome_test()` and `fontawesome_install()` are the fontawesome versions of the above functions. 
Be sure to install fontawesome from [here](https://github.com/hrbrmstr/waffle/tree/master/inst/fonts) first.\n\n### Grammar of Graphics and Conventions\n\nHadley Wickham's ggplot2 is based on Leland Wilkinson's [*The Grammar of Graphics*](https://www.amazon.com/Grammar-Graphics-Statistics-Computing/dp/0387245448) and Wickham's [*A Layered Grammar of Graphics*](http://vita.had.co.nz/papers/layered-grammar.html). The layered grammar of graphics is a structured way of thinking about the components of a plot, which then lend themselves to the simple structure of ggplot2.\n\n- **Data** are what are visualized in a plot and **mappings** are directions for how data are mapped in a plot in a way that can be perceived by humans.\\\n- **Geoms** are representations of the actual data like points, lines, and bars.\n- **Stats** are statistical transformations that represent summaries of the data like histograms.\n- **Scales** map values in the data space to values in the aesthetic space. Scales draw legends and axes.\n- **Coordinate Systems** describe how geoms are mapped to the plane of the graphic.\\\n- **Facets** break the data into meaningful subsets like small multiples.\n- **Themes** control the finer points of a plot such as fonts, font sizes, and background colors.\n\nMore information: [ggplot2: Elegant Graphics for Data Analysis](https://www.amazon.com/ggplot2-Elegant-Graphics-Data-Analysis/dp/0387981403)\n\n### Tips and Tricks\n\n- `ggplot2` expects data to be in data frames or tibbles. It is preferable for the data frames to be \"tidy\" with each variable as a column, each observation as a row, and each observational unit as a separate table. `dplyr` and `tidyr` contain concise and effective tools for \"tidying\" data.\n\n- R allows function arguments to be called explicitly by name and implicitly by position. The coding examples in this guide only contain named arguments for clarity.\n\n- Graphics will sometimes render differently on different operating systems. This is because anti-aliasing is activated in R on Mac and Linux but not activated in R on Windows. This won't be an issue once graphics are saved.\n\n- Continuous x-axes have ticks. Discrete x-axes do not have ticks. Use `remove_ticks()` to remove ticks.\n\n## Bar Plots\n\n------------------------------------------------------------------------\n\n### One Color\n\n```{r barplots}\nmtcars %>%\n  count(cyl) %>%\n  ggplot(mapping = aes(x = factor(cyl), y = n)) +\n  geom_col() +\n  geom_text(mapping = aes(label = n), vjust = -1) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n  labs(x = \"Cylinders\",\n       y = NULL) +\n  remove_ticks() +\n  remove_axis() \n```\n\n### One Color (Rotated)\n\nThis example introduces `coord_flip()` and `remove_axis(axis = \"x\", flip = TRUE)`. `remove_axis()` is from `library(urbnthemes)` and creates a custom theme for rotated bar plots.\n\n```{r barplot-rotated}\nmtcars %>%\n  count(cyl) %>%\n  ggplot(mapping = aes(x = factor(cyl), y = n)) +\n  geom_col() +\n  geom_text(mapping = aes(label = n), hjust = -1) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n  labs(x = \"Cylinders\",\n       y = NULL) + \n  coord_flip() +\n  remove_axis(axis = \"x\", flip = TRUE)\n```\n\n### Three Colors\n\nThis is identical to the previous plot except colors and a legend are added with `fill = cyl`. Turning `x` into a factor with `factor(cyl)` skips 5 and 7 on the x-axis. 
Adding `fill = cyl` without `factor()` would have created a continuous color scheme and legend.\n\n```{r 3-color-barplot}\nmtcars %>%\n mutate(cyl = factor(cyl)) %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = cyl, y = n, fill = cyl)) +\n geom_col() +\n geom_text(mapping = aes(label = n), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) +\n remove_ticks() +\n remove_axis()\n```\n\n### Stacked Bar Plot\n\nAn additional aesthetic can easily be added to bar plots by adding `fill = categorical variable` to the mapping. Here, transmission type subsets each bar showing the count of cars with different numbers of cylinders.\n\n```{r stacked-bar-plot}\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>% \n group_by(am) %>%\n count(cyl) %>%\n group_by(cyl) %>%\n arrange(desc(am)) %>%\n mutate(label_height = cumsum(n)) %>%\n ggplot() +\n geom_col(mapping = aes(x = cyl, y = n, fill = am)) +\n geom_text(aes(x = cyl, y = label_height - 0.5, label = n, color = am)) +\n scale_color_manual(values = c(\"white\", \"black\")) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n remove_axis() +\n guides(color = \"none\")\n```\n\n### Stacked Bar Plot With Position = Fill\n\nThe previous examples used `geom_col()`, which takes a y value for bar height. This example uses `geom_bar()` which sums the values and generates a value for bar heights. In this example, `position = \"fill\"` in `geom_bar()` changes the y-axis from count to the proportion of each bar.\n\n```{r stacked-bar-plot-fill}\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>% \n ggplot() +\n geom_bar(mapping = aes(x = cyl, fill = am), position = \"fill\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1)), labels = scales::percent) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n guides(color = \"none\")\n```\n\n### Dodged Bar Plot\n\nSubsetted bar charts in ggplot2 are stacked by default. `position = \"dodge\"` in `geom_col()` expands the bar chart so the bars appear next to each other.\n\n```{r dodged-bar-plot}\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>%\n group_by(am) %>%\n count(cyl) %>%\n ggplot(mapping = aes(cyl, y = n, fill = factor(am))) +\n geom_col(position = \"dodge\") +\n geom_text(aes(label = n), position = position_dodge(width = 0.7), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n remove_axis()\n```\n\n### Lollipop plot/Cleveland dot plot {.tabset}\n\nLollipop plots and Cleveland dot plots are minimalist alternatives to bar plots. The key to both plots is to order the data based on the continuous variable using `arrange()` and then turn the discrete variable into a factor with the ordered levels of the continuous variable using `mutate()`. 
This step \"stores\" the order of the data.\n\n#### Lollipop plot\n\n```{r lollipop-plot, fig.height = 5}\nmtcars %>%\n\trownames_to_column(\"model\") %>%\n\tarrange(mpg) %>%\n\tmutate(model = factor(model, levels = .$model)) %>%\n\tggplot(aes(mpg, model)) +\n\t\tgeom_segment(aes(x = 0, xend = mpg, y = model, yend = model)) +\t\n\t\tgeom_point() +\n\t\tscale_x_continuous(expand = expansion(mult = c(0, 0)), limits = c(0, 40)) +\n\t\tlabs(x = NULL, \n\t\t\t\t y = \"Miles Per Gallon\")\n```\n\n#### Cleveland dot plot\n\n```{r cleveland-dot-plot, fig.height = 5}\nmtcars %>%\n\trownames_to_column(\"model\") %>%\n\tarrange(mpg) %>%\n\tmutate(model = factor(model, levels = .$model)) %>%\n\tggplot(aes(mpg, model)) +\n\t\tgeom_point() +\n\t\tscale_x_continuous(expand = expansion(mult = c(0, 0)), limits = c(0, 40)) +\n\t\tlabs(x = NULL, \n\t\t\t\t y = \"Miles Per Gallon\")\n```\n\n### Dumbell plot\n\n## Scatter Plots\n\n------------------------------------------------------------------------\n\n### One Color Scatter Plot\n\nScatter plots are useful for showing relationships between two or more variables. Use `scatter_grid()` from `library(urbnthemes)` to easily add vertical grid lines for scatter plots.\n\n```{r one-color-scatter-plot}\nmtcars %>%\n ggplot(mapping = aes(x = wt, y = mpg)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:8 * 5) +\n labs(x = \"Weight (thousands of pounds)\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n### High-Density Scatter Plot with Transparency\n\nLarge numbers of observations can sometimes make scatter plots tough to interpret because points overlap. Adding `alpha =` with a number between 0 and 1 adds transparency to points and clarity to plots. Now it's easy to see that jewelry stores are probably rounding up but not rounding down carats!\n\n```{r alpha-scatter-plot}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 20000),\n breaks = 0:4 * 5000,\n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n```\n\n### Hex Scatter Plot\n\nSometimes transparency isn't enough to bring clarity to a scatter plot with many observations. As n increases into the hundreds of thousands and even millions, `geom_hex` can be one of the best ways to display relationships between two variables.\n\n```{r scatter-plot-hex}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_hex(mapping = aes(fill = after_stat(count))) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 20000),\n breaks = 0:4 * 5000,\n labels = scales::dollar) +\n\tscale_fill_gradientn(labels = scales::comma) + \n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid() +\n theme(legend.position = \"right\",\n legend.direction = \"vertical\")\n```\n\n### Scatter Plots With Random Noise {.tabset}\n\nSometimes scatter plots have many overlapping points but a reasonable number of observations. `geom_jitter` adds a small amount of random noise so points are less likely to overlap. `width` and `height` control the amount of noise that is added. 
In the following before-and-after, notice how many more points are visible after adding jitter.\n\n#### Before\n\n```{r before-scatter-plot}\nmpg %>%\n ggplot(mapping = aes(x = displ, y = cty)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 8),\n breaks = 0:8) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:4 * 10) +\n labs(x = \"Displacement\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n#### After\n\n```{r jitter-plot}\nset.seed(2017)\nmpg %>%\n ggplot(mapping = aes(x = displ, y = cty)) +\n geom_jitter() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 8),\n breaks = 0:8) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:4 * 10) +\n labs(x = \"Displacement\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n### Scatter Plots with Varying Point Size\n\nWeights and populations can be mapped in scatter plots to the size of the points. Here, the number of households in each state is mapped to the size of each point using `aes(size = hhpop)`. Note: `ggplot2::geom_point()` is used instead of `geom_point()`.\n\n```{r geom_point-size, fig.height = 5}\nurbnmapr::statedata %>%\n ggplot(mapping = aes(x = medhhincome, y = horate)) +\n ggplot2::geom_point(mapping = aes(size = hhpop), alpha = 0.3) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(30000, 80000),\n breaks = 3:8 * 10000,\n labels = scales::dollar) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 0.8),\n breaks = 0:4 * 0.2) +\n scale_radius(range = c(3, 15),\n breaks = c(2500000, 7500000, 12500000), \n labels = scales::comma) +\n labs(x = \"Household income\",\n y = \"Homeownership rate\") +\n scatter_grid() +\n\ttheme(plot.margin = margin(r = 20))\n```\n\n### Scatter Plots with Fill\n\nA third aesthetic can be added to scatter plots. Here, color signifies the number of cylinders in each car. 
Before `ggplot()` is called, the `cyl` variable is converted into a text label using `library(dplyr)` and the piping operator `%>%`.\n\n```{r filled-scatter-plot}\nmtcars %>%\n  mutate(cyl = paste(cyl, \"cylinders\")) %>%\n  ggplot(aes(x = wt, y = mpg, color = cyl)) +\n  geom_point() +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(0, 6),\n                     breaks = 0:6) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n                     limits = c(0, 40),\n                     breaks = 0:8 * 5) +\n  labs(x = \"Weight (thousands of pounds)\",\n       y = \"Miles per gallon\") +\n  scatter_grid()\n```\n\n## Line Plots\n\n------------------------------------------------------------------------\n\n```{r line-plots}\neconomics %>%\n  ggplot(mapping = aes(x = date, y = unemploy)) +\n  geom_line() +\n  scale_x_date(expand = expansion(mult = c(0.002, 0)), \n               breaks = \"10 years\",\n               limits = c(as.Date(\"1961-01-01\"), as.Date(\"2020-01-01\")),\n               date_labels = \"%Y\") +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n                     breaks = 0:4 * 4000,\n                     limits = c(0, 16000),\n                     labels = scales::comma) +\n  labs(x = \"Year\", \n       y = \"Number Unemployed (1,000s)\")\n```\n\n### Line Plots With Multiple Lines\n\n```{r multiple-line-charts1}\nlibrary(gapminder)\n\ngapminder %>%\n  filter(country %in% c(\"Australia\", \"Canada\", \"New Zealand\")) %>%\n  mutate(country = factor(country, levels = c(\"Canada\", \"Australia\", \"New Zealand\"))) %>%\n  ggplot(aes(year, gdpPercap, color = country)) +\n  geom_line() +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     breaks = c(1952 + 0:12 * 5), \n                     limits = c(1952, 2007)) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n                     breaks = 0:8 * 5000,\n                     labels = scales::dollar, \n                     limits = c(0, 40000)) +\n  labs(x = \"Year\",\n       y = \"Per capita GDP (US dollars)\")\n```\n\nPlotting more than one variable can be useful for seeing the relationship of variables over time, but it takes a small amount of data munging.\n\nThis is because `ggplot2` wants data in a \"long\" format instead of a \"wide\" format for line plots with multiple lines. `gather()` and `spread()` from the `tidyr` package make switching back-and-forth between \"long\" and \"wide\" painless. Essentially, variable titles go into \"key\" and variable values go into \"value\". Then `ggplot2` turns the different levels of the key variable (here, the different stock indices) into colors.\n\n```{r multiple-line-charts2}\nas_tibble(EuStockMarkets) %>%\n\tmutate(date = time(EuStockMarkets)) %>%\n\tgather(key = \"key\", value = \"value\", -date) %>%\n\tggplot(mapping = aes(x = date, y = value, color = key)) +\n\tgeom_line() +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(1991, 1999), \n                     breaks = c(1991, 1993, 1995, 1997, 1999)) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n                     breaks = 0:4 * 2500,\n                     labels = scales::dollar, \n                     limits = c(0, 10000)) + \n\tlabs(x = \"Date\",\n\t\t\t y = \"Value\")\n```\n\n### Step plot\n\n`geom_line()` connects coordinates with the shortest possible straight line. Sometimes step plots are necessary because y values don't change between coordinates. 
For example, the upper-bound of the Federal Funds Rate is set at regular intervals and remains constant until it is changed.\n\n```{r step-plot}\n# downloaded from FRED on 2018-12-06\n\n# https://fred.stlouisfed.org/series/DFEDTARU\n\nfed_fund_rate <- read_csv(\n  \"date, fed_funds_rate\n  2014-01-01,0.0025\n  2015-12-16,0.0050\n  2016-12-14,0.0075\n  2017-03-16,0.0100\n  2017-06-15,0.0125\n  2017-12-14,0.0150\n  2018-03-22,0.0175\n  2018-06-14,0.0200\n  2018-09-27,0.0225\n  2018-12-06,0.0225\")\n\nfed_fund_rate %>%\n  ggplot(mapping = aes(x = date, y = fed_funds_rate)) + \n  geom_step() +\n  scale_x_date(expand = expansion(mult = c(0.002, 0)), \n               breaks = \"1 year\",\n               limits = c(as.Date(\"2014-01-01\"), as.Date(\"2019-01-01\")),\n               date_labels = \"%Y\") +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n                     breaks = c(0, 0.01, 0.02, 0.03),\n                     limits = c(0, 0.03),\n                     labels = scales::percent) + \n\tlabs(x = \"Date\",\n\t\t\t y = \"Upper-bound of the Federal Funds Rate\")\n```\n\n### Path plot\n\nThe Beveridge curve is a macroeconomic plot that displays a relationship between the unemployment rate and the vacancy rate. Movements along the curve indicate changes in the business cycle and horizontal shifts of the curve suggest structural changes in the labor market.\n\nLines in Beveridge curves do not monotonically move from left to right. Therefore, it is necessary to use `geom_path()`.\n\n```{r, path-plot}\n# seasonally-adjusted, quarterly vacancy rate - JOLTS\n# seasonally-adjusted, quarterly unemployment rate - CPS\n\n# pulled from FRED on April 11, 2018. \n\nlibrary(ggrepel)\n\nbeveridge <- read_csv(\n\t\"quarter, vacancy_rate, unemployment_rate\n\t2006-01-01,0.0310,0.0473\n\t2006-04-01,0.0316,0.0463\n\t2006-07-01,0.0313,0.0463\n\t2006-10-01,0.0310,0.0443\n\t2007-01-01,0.0323,0.0450\n\t2007-04-01,0.0326,0.0450\n\t2007-07-01,0.0316,0.0466\n\t2007-10-01,0.0293,0.0480\n\t2008-01-01,0.0286,0.0500\n\t2008-04-01,0.0280,0.0533\n\t2008-07-01,0.0253,0.0600\n\t2008-10-01,0.0220,0.0686\n\t2009-01-01,0.0196,0.0826\n\t2009-04-01,0.0180,0.0930\n\t2009-07-01,0.0176,0.0963\n\t2009-10-01,0.0180,0.0993\n\t2010-01-01,0.0196,0.0983\n\t2010-04-01,0.0220,0.0963\n\t2010-07-01,0.0216,0.0946\n\t2010-10-01,0.0220,0.0950\n\t2011-01-01,0.0226,0.0903\n\t2011-04-01,0.0236,0.0906\n\t2011-07-01,0.0250,0.0900\n\t2011-10-01,0.0243,0.0863\n\t2012-01-01,0.0270,0.0826\n\t2012-04-01,0.0270,0.0820\n\t2012-07-01,0.0266,0.0803\n\t2012-10-01,0.0260,0.0780\n\t2013-01-01,0.0276,0.0773\n\t2013-04-01,0.0280,0.0753\n\t2013-07-01,0.0280,0.0723\n\t2013-10-01,0.0276,0.0693\n\t2014-01-01,0.0290,0.0666\n\t2014-04-01,0.0323,0.0623\n\t2014-07-01,0.0326,0.0610\n\t2014-10-01,0.0330,0.0570\n\t2015-01-01,0.0350,0.0556\n\t2015-04-01,0.0366,0.0540\n\t2015-07-01,0.0373,0.0510\n\t2015-10-01,0.0360,0.0500\n\t2016-01-01,0.0386,0.0493\n\t2016-04-01,0.0383,0.0486\n\t2016-07-01,0.0383,0.0493\n\t2016-10-01,0.0363,0.0473\n\t2017-01-01,0.0366,0.0466\n\t2017-04-01,0.0390,0.0433\n\t2017-07-01,0.0406,0.0430\n\t2017-10-01,0.0386,0.0410\")\n\nlabels <- beveridge %>%\n  filter(lubridate::month(quarter) == 1)\n\nbeveridge %>%\n\tggplot() +\n\tgeom_path(mapping = aes(x = unemployment_rate, y = vacancy_rate), alpha = 0.5) +\n  geom_point(data = labels, mapping = aes(x = unemployment_rate, y = vacancy_rate)) +\n  geom_text_repel(data = labels, mapping = aes(x = unemployment_rate, y = vacancy_rate, label = lubridate::year(quarter))) + \n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(0.04, 0.1),\n                     labels = scales::percent) +\n  
scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n                     breaks = c(0, 0.01, 0.02, 0.03, 0.04, 0.05),\n                     limits = c(0, 0.05),\n                     labels = scales::percent) + \n\tlabs(x = \"Seasonally-adjusted unemployment rate\",\n\t\t\t y = \"Seasonally-adjusted vacancy rate\") + \n  scatter_grid()\n```\n\n### Slope plots\n\n```{r slope-plot, fig.height = 5}\n# https://www.bls.gov/lau/\nlibrary(ggrepel)\n\nunemployment <- tibble(\n\ttime = c(\"October 2009\", \"October 2009\", \"October 2009\", \"August 2017\", \"August 2017\", \"August 2017\"),\n\trate = c(7.4, 7.1, 10.0, 3.9, 3.8, 6.4),\n\tstate = c(\"Maryland\", \"Virginia\", \"Washington, D.C.\", \"Maryland\", \"Virginia\", \"Washington, D.C.\")\n)\n\nlabel <- tibble(label = c(\"October 2009\", \"August 2017\"))\noctober <- filter(unemployment, time == \"October 2009\")\naugust <- filter(unemployment, time == \"August 2017\")\n\nunemployment %>%\n\tmutate(time = factor(time, levels = c(\"October 2009\", \"August 2017\")),\n\t       state = factor(state, levels = c(\"Washington, D.C.\", \"Maryland\", \"Virginia\"))) %>%\n\tggplot() + \n\tgeom_line(aes(time, rate, group = state, color = state), show.legend = FALSE) +\n\tgeom_point(aes(x = time, y = rate, color = state)) +\n\tlabs(subtitle = \"Unemployment Rate\") +\n\ttheme(axis.ticks.x = element_blank(),\n\t\t\t\taxis.title.x = element_blank(),\n\t\t\t\taxis.ticks.y = element_blank(),\n        axis.title.y = element_blank(), \n        axis.text.y = element_blank(),\n\t\t\t\tpanel.grid.major.y = element_blank(),\n        panel.grid.minor.y = element_blank(),\n        panel.grid.major.x = element_blank(),\n\t\t\t\taxis.line = element_blank()) +\n\tgeom_text_repel(data = october, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = -0.06) + \n\tgeom_text_repel(data = august, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = 0.06)\n```\n\n## Univariate\n\n------------------------------------------------------------------------\n\nThere are a number of ways to explore the distributions of univariate data in R. Some methods, like strip charts, show all data points. Other methods, like the box and whisker plot, show selected data points that communicate key values like the median and 25th percentile. Finally, some methods don't show any of the underlying data but calculate density estimates. Each method has advantages and disadvantages, so it is worthwhile to understand the different forms. For more information, read [40 years of boxplots](http://vita.had.co.nz/papers/boxplots.pdf) by Hadley Wickham and Lisa Stryjewski.\n\n### Strip Chart\n\nStrip charts, the simplest univariate plots, show the distribution of values along one axis. Strip charts work best with variables that have plenty of variation. If not, the points tend to cluster on top of each other. 
Even if the variable has plenty of variation, it is often important to add transparency to the points with `alpha =` so overlapping values are visible.\n\n```{r stripchart, fig.height=2}\nmsleep %>%\n  ggplot(aes(x = sleep_total, y = factor(1))) +\n  geom_point(alpha = 0.2, size = 5) +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(0, 25), \n                     breaks = 0:5 * 5) +\n  scale_y_discrete(labels = NULL) +\n  labs(title = \"Total Sleep Time of Different Mammals\",\n       x = \"Total sleep time (hours)\",\n       y = NULL) +\n  theme(axis.ticks.y = element_blank())\n```\n\n### Strip Chart with Highlighting\n\nBecause strip charts show all values, they are useful for showing where selected points lie in the distribution of a variable. The clearest way to do this is by adding `geom_point()` twice with `filter()` in the data argument. This way, the highlighted values show up on top of unhighlighted values.\n\n```{r stripchart-with-highlighting, fig.height=2}\nggplot() +\n  geom_point(data = filter(msleep, name != \"Red fox\"), \n             aes(x = sleep_total, \n                 y = factor(1)),\n             alpha = 0.2, \n             size = 5,\n\t\t\t\t\t   color = \"grey50\") +\n  geom_point(data = filter(msleep, name == \"Red fox\"),\n             aes(x = sleep_total, \n                 y = factor(1), \n                 color = name),\n             alpha = 0.8,\n             size = 5) +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(0, 25), \n                     breaks = 0:5 * 5) + \n  scale_y_discrete(labels = NULL) +\n  labs(title = \"Total Sleep Time of Different Mammals\",\n       x = \"Total sleep time (hours)\",\n       y = NULL) +\n  guides(color = guide_legend(title = NULL)) +\n  theme(axis.ticks.y = element_blank())\n```\n\n### Subsetted Strip Chart\n\nAdd a y variable to see the distributions of the continuous variable in subsets of a categorical variable.\n\n```{r subsetted-stripchart, fig.height=3}\nlibrary(forcats)\n\nmsleep %>%\n  filter(!is.na(vore)) %>%\n  mutate(vore = fct_recode(vore, \n                           \"Insectivore\" = \"insecti\",\n                           \"Omnivore\" = \"omni\", \n                           \"Herbivore\" = \"herbi\", \n                           \"Carnivore\" = \"carni\"\n                           )) %>%\n  ggplot(aes(x = sleep_total, y = vore)) +\n  geom_point(alpha = 0.2, size = 5) +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(0, 25), \n                     breaks = 0:5 * 5) + \n  labs(title = \"Total Sleep Time of Different Mammals by Diet\",\n       x = \"Total sleep time (hours)\",\n       y = NULL) +\n  theme(axis.ticks.y = element_blank())\n```\n\n### Beeswarm Plots\n\nBeeswarm plots are a variation of strip charts that show the distribution of data without the points overlapping.\n\n```{r beeswarm}\nlibrary(ggbeeswarm)\n\ntxhousing %>%\n\tfilter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>% \n  ggplot(aes(x = median, y = city)) +\n  geom_beeswarm(alpha = 0.2, size = 5) + \n\tscale_x_continuous(labels = scales::dollar) +\n  labs(title = \"Household Sale Price by City\",\n       x = \"Sale Price\",\n       y = NULL) +\n  theme(axis.ticks.y = element_blank())\n```\n\n### Histograms\n\nHistograms divide the distribution of a variable into n equal-sized bins and then count and display the number of observations in each bin. Histograms are sensitive to bin width. 
As `?geom_histogram` notes, \"You should always override \\[the default binwidth\\] value, exploring multiple widths to find the best to illustrate the stories in your data.\"\n\n```{r histogram}\nggplot(data = diamonds, mapping = aes(x = depth)) + \n geom_histogram(bins = 100) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 100)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), labels = scales::comma) +\n labs(x = \"Depth\",\n y = \"Count\")\n```\n\n### Boxplots\n\nBoxplots were invented in the 1970s by John Tukey[^1]. Instead of showing the underlying data or binned counts of the underlying data, they focus on important values like the 25th percentile, median, and 75th percentile.\n\n[^1]: Wickham, H., & Stryjewski, L. (2011). 40 years of boxplots.\n\n```{r box-plot}\nInsectSprays %>%\n ggplot(mapping = aes(x = spray, y = count)) +\n geom_boxplot() +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Type of insect spray\",\n y = \"Number of dead insects\") +\n remove_ticks()\n```\n\n### Smoothed Kernel Density Plots\n\nContinuous variables with smooth distributions are sometimes better represented with smoothed kernel density estimates than histograms or boxplots. `geom_density()` computes and plots a kernel density estimate. Notice the lumps around integers and halves in the following distribution because of rounding.\n\n```{r kernel-density-plot}\ndiamonds %>%\n ggplot(mapping = aes(carat)) +\n geom_density(color = NA) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, NA)) +\n\tscale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Carat\",\n y = \"Density\")\n```\n\n```{r kernel-density-plot-filled}\ndiamonds %>%\n mutate(cost = ifelse(price > 5500, \"More than $5,500 +\", \"$0 to $5,500\")) %>%\n ggplot(mapping = aes(carat, fill = cost)) +\n geom_density(alpha = 0.25, color = NA) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, NA)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Carat\",\n y = \"Density\")\n```\n\n### Ridgeline Plots\n\nRidgeline plots are partially overlapping smoothed kernel density plots faceted by a categorical variable that pack a lot of information into one elegant plot.\n\n```{r ridgeline-plots}\nlibrary(ggridges)\n\nggplot(diamonds, mapping = aes(x = price, y = cut)) +\n\tgeom_density_ridges(fill = \"#1696d2\") +\n labs(x = \"Price\",\n y = \"Cut\")\n```\n\n### Violin Plots\n\nViolin plots are symmetrical displays of smooth kernel density plots.\n\n```{r violin-plot}\nInsectSprays %>%\n ggplot(mapping = aes(x = spray, y = count, fill = spray)) +\n geom_violin(color = NA) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Type of insect spray\",\n y = \"Number of dead insects\") +\n remove_ticks()\n```\n\n### Bean Plot\n\nIndividual outliers and important summary values are not visible in violin plots or smoothed kernel density plots. 
Bean plots, [created by Peter Kampstra in 2008](https://www.jstatsoft.org/article/view/v028c01), are violin plots with data shown as small lines in a one-dimensional strip plot and larger lines for the mean.\n\n```{r beanplot}\nmsleep %>%\n  filter(!is.na(vore)) %>%\n  mutate(vore = fct_recode(vore, \n                           \"Insectivore\" = \"insecti\",\n                           \"Omnivore\" = \"omni\", \n                           \"Herbivore\" = \"herbi\", \n                           \"Carnivore\" = \"carni\"\n                           )) %>%\n  ggplot(aes(x = vore, y = sleep_total, fill = vore)) +\n  stat_summary(fun = \"mean\",\n               colour = \"black\", \n               size = 30,\n               shape = 95,\n               geom = \"point\") +\n  geom_violin(color = NA) +\n  geom_jitter(width = 0,\n              height = 0.05,\n              alpha = 0.4,\n              shape = \"-\",\n              size = 10,\n\t\t\t\t\t\t  color = \"grey50\") +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.2))) + \n  labs(x = NULL,\n       y = \"Total sleep time (hours)\") +\n  theme(legend.position = \"none\") +\n  remove_ticks()\n```\n\n## Area Plot\n\n------------------------------------------------------------------------\n\n### Stacked Area\n\n```{r area-plot-stack}\ntxhousing %>%\n  filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>%\n  group_by(city, year) %>%\n  summarize(sales = sum(sales)) %>%\n  ggplot(aes(x = year, y = sales, fill = city)) +\n  geom_area(position = \"stack\") +\n  scale_x_continuous(expand = expansion(mult = c(0, 0)),\n                     limits = c(2000, 2015),\n                     breaks = 2000 + 0:15) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n                     labels = scales::comma) +\n  labs(x = \"Year\",\n       y = \"Home sales\")\n```\n\n### Filled Area\n\n```{r area-plot-fill}\ntxhousing %>%\n  filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>%\n  group_by(city, year) %>%\n  summarize(sales = sum(sales)) %>%\n  ggplot(aes(x = year, y = sales, fill = city)) +\n  geom_area(position = \"fill\") +\n  scale_x_continuous(expand = expansion(mult = c(0, 0)),\n                     limits = c(2000, 2015),\n                     breaks = 2000 + 0:15) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.02)),\n                     breaks = c(0, 0.25, 0.5, 0.75, 1),\n                     labels = scales::percent) +\n  labs(x = \"Year\",\n       y = \"Home sales\")\n```\n\n## Sankey Plot\n\n------------------------------------------------------------------------\n\nSankey plots visualize flows from one set of variables to another. This can be useful for showing outcomes from the start of a program to the end. You'll need to install the `ggsankey` package to create Sankey plots in R. In this example I make a dummy data set of housing status prior to program start and at exit to show the flow of people between outcomes. A key step is to transform your data set using the `make_long` function from the package. 
This creates a data frame that specifies each of the initial nodes and how they flow into the next stage.\n\n```{r sankey-plot}\n# install the ggsankey package once with:\n# remotes::install_github(\"davidsjoberg/ggsankey\")\nlibrary(ggsankey)\n\n# create a dummy dataset of housing status\ndf <- tibble(entry_status = c(rep(\"Housed\", 7), rep(\"Unhoused\", 15), rep(\"Staying w/ Family\", 8)), \n             exit_status = c(rep(\"Housed\", 15), rep(\"Unhoused\", 2), rep(\"Staying w/ Family\", 13))) %>% \n\t# transform the data frame into the proper format for the sankey plot\n  make_long(entry_status, exit_status) %>% \n\t# recode the labels to be cleaner in the plot \n  mutate(x = recode(x, entry_status = \"Prior Housing Status\", exit_status = \"Exit Housing Status\"),\n         next_x = recode(next_x, entry_status = \"Prior Housing Status\", exit_status = \"Exit Housing Status\"))\n\n# create sankey plot\nggplot(df, aes(x = x, \n               next_x = next_x, \n               node = node, \n               next_node = next_node,\n               fill = factor(node), \n               label = node)) +\n  geom_sankey(flow.alpha = 0.5, node.color = 1, show.legend = FALSE) +\n  # add labels to plot and style\n  geom_sankey_label(size = 3.5, color = 1, fill = \"white\") +\n  theme_sankey(base_size = 16) +\n  labs(x = NULL)\n```\n\n## Heat Map\n\n------------------------------------------------------------------------\n\n```{r heat-map}\nlibrary(fivethirtyeight)\n\nbad_drivers %>%\n  filter(state %in% c(\"Maine\", \"New Hampshire\", \"Vermont\", \"Massachusetts\", \"Connecticut\", \"New York\")) %>%\n  mutate(`Number of\\nDrivers` = scale(num_drivers),\n         `Percent\\nSpeeding` = scale(perc_speeding),\n         `Percent\\nAlcohol` = scale(perc_alcohol),\n         `Percent Not\\nDistracted` = scale(perc_not_distracted),\n         `Percent No\\nPrevious` = scale(perc_no_previous),\n         state = factor(state, levels = rev(state))\n         ) %>%\n  select(-insurance_premiums, -losses, -(num_drivers:losses)) %>%\n  gather(`Number of\\nDrivers`:`Percent No\\nPrevious`, key = \"variable\", value = \"SD's from Mean\") %>%\n  ggplot(aes(variable, state)) +\n  geom_tile(aes(fill = `SD's from Mean`)) +\n  labs(x = NULL,\n       y = NULL) + \n  scale_fill_gradientn() +\n  theme(legend.position = \"right\",\n        legend.direction = \"vertical\",\n        axis.line.x = element_blank(),\n        panel.grid.major.y = element_blank()) +\n  remove_ticks()\n#https://learnr.wordpress.com/2010/01/26/ggplot2-quick-heatmap-plotting/\n```\n\n## Faceting and Small Multiples\n\n------------------------------------------------------------------------\n\n### facet_wrap()\n\nR's faceting system is a powerful way to make \"small multiples\".\n\nSome edits to the theme may be necessary depending upon how many rows and columns are in the plot.\n\n```{r small-multiples, fig.height=2}\ndiamonds %>%\n  ggplot(mapping = aes(x = carat, y = price)) +\n  geom_point(alpha = 0.05) +\n  facet_wrap(~cut, ncol = 5) +\n  scale_x_continuous(expand = expansion(mult = c(0, 0)),\n                     limits = c(0, 6)) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0)),\n                     limits = c(0, 20000), \n                     labels = scales::dollar) +\n  labs(x = \"Carat\",\n       y = \"Price\") +\n  scatter_grid()\n```\n\n### facet_grid()\n\n```{r faceting, fig.height=7}\ndiamonds %>%\n  filter(color %in% c(\"D\", \"E\", \"F\", \"G\")) %>%\n  ggplot(mapping = aes(x = carat, y = price)) +\n  geom_point(alpha = 0.05) +\n  facet_grid(color ~ cut) +\n  scale_x_continuous(expand = expansion(mult = c(0, 0)),\n                     limits = c(0, 4)) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0)),\n                     limits = c(0, 20000), \n                     labels = scales::dollar) +\n  labs(x = \"Carat\",\n       y = \"Price\") +\n  
theme(panel.spacing = unit(20L, \"pt\")) +\n  scatter_grid()\n```\n\n## Smoothers\n\n------------------------------------------------------------------------\n\n`geom_smooth()` fits and plots models to data with two or more dimensions.\n\nUnderstanding and manipulating defaults is more important for `geom_smooth()` than for other geoms because it contains a number of assumptions. `geom_smooth()` automatically uses loess for datasets with fewer than 1,000 observations and a generalized additive model with `formula = y ~ s(x, bs = \"cs\")` for datasets with 1,000 or more observations. Both default to displaying a 95% confidence interval.\n\nModels are chosen with `method =`, which can be set to `\"lm\"`, `\"glm\"`, `\"gam\"`, `\"loess\"`, `\"rlm\"`, and more. Formulas can be specified with `formula =` and `y ~ x` syntax. Plotting the standard error is toggled with `se = TRUE` and `se = FALSE`, and the confidence level is specified with `level =`. As always, more information can be seen in RStudio with `?geom_smooth()`.\n\n`geom_point()` adds a scatterplot to `geom_smooth()`. The order of the function calls is important. The function called second will be laid on top of the function called first.\n\n```{r geom_smooth}\ndiamonds %>%\n  ggplot(mapping = aes(x = carat, y = price)) +\n\tgeom_point(alpha = 0.05) +\n\tgeom_smooth(color = \"#ec008b\") +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t                   limits = c(0, 5),\n\t                   breaks = 0:5) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     limits = c(0, 20000), \n                     labels = scales::dollar) + \n  labs(x = \"Carat\",\n       y = \"Price\") +\n  scatter_grid()\n```\n\n`geom_smooth` can be subset by categorical and factor variables. This requires subgroups to have a decent number of observations and a fair amount of variability across the x-axis. Confidence intervals often widen at the ends so special care is needed for the chart to be meaningful and readable.\n\nThis example uses loess to model highway MPG as a function of engine displacement.\n\n```{r subset-geom_smooth}\nggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = factor(cyl))) +\n\tgeom_point(alpha = 0.2) +\n\tgeom_smooth() +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t                   limits = c(0, 7),\n\t                   breaks = 0:7) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     limits = c(0, 60)) + \n\tlabs(x = \"Engine displacement\",\n\t\t\t y = \"Highway MPG\") +\n  scatter_grid()\n```\n\nThis example uses linear models of highway MPG as a function of engine displacement.\n\n```{r subset-geom-smooth-lm}\nggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = factor(cyl))) +\n\tgeom_point(alpha = 0.2) +\n\tgeom_smooth(method = \"lm\") +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t                   limits = c(0, 7),\n\t                   breaks = 0:7) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     limits = c(0, 60)) + \n\tlabs(x = \"Engine displacement\",\n\t\t\t y = \"Highway MPG\") +\n  scatter_grid()\n```\n\n## Highlighting\n\n------------------------------------------------------------------------\n\n[`library(gghighlight)`](https://yutannihilation.github.io/gghighlight/) enables the intuitive highlighting of ggplot2 plots. `gghighlight` modifies existing ggplot2 objects, so no other code should change. All of the highlighting is handled by the function `gghighlight()`, which can handle all types of geoms.\n\n*Warning:* R will throw an error if too many colors are highlighted because of the design of `urbnthemes`. 
Simply decrease the number of highlighted geoms to solve this issue.\n\nThere are two main ways to highlight.\n\n### Threshold\n\nThe first way to highlight is with a threshold. Add a logical test to `gghighlight()` to describe which lines should be highlighted. Here, lines with maximum change in per-capita Gross Domestic Product greater than \\$35,000 are highlighted by `gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE)`.\n\n```{r gghighlight-threshold}\nlibrary(gghighlight)\nlibrary(gapminder)\n\ndata <- gapminder %>%\n  filter(continent %in% c(\"Europe\")) %>%\n  group_by(country) %>%\n  mutate(pcgpd_change = ifelse(year == 1952, 0, gdpPercap - lag(gdpPercap))) %>%\n  mutate(pcgpd_change = cumsum(pcgpd_change))\n \ndata %>%\n  ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n  geom_line() +\n  gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE) + \n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n                     breaks = c(seq(1950, 2010, 10)),\n                     limits = c(1950, 2010)) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     breaks = 0:8 * 5000,\n                     labels = scales::dollar,\n                     limits = c(0, 40000)) +\n  labs(x = \"Year\",\n       y = \"Change in per-capita GDP (US dollars)\")\n```\n\n### Rank\n\nThe second way to highlight is by rank. Here, the countries with the five highest values for change in per-capita Gross Domestic Product are highlighted with `gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE)`.\n\n```{r gghighlight-rank}\ndata %>%\n  ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n  geom_line() +\n  gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE) + \n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n                     breaks = c(seq(1950, 2010, 10)),\n                     limits = c(1950, 2010)) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     breaks = 0:8 * 5000,\n                     labels = scales::dollar,\n                     limits = c(0, 40000)) +\n  labs(x = \"Year\",\n       y = \"Change in per-capita GDP (US dollars)\")\n```\n\n### Faceting\n\n`gghighlight()` works well with ggplot2's faceting system.\n\n```{r gghighlight-faceting}\ndata %>%\n  ggplot(aes(year, pcgpd_change, group = country)) +\n  geom_line() +\n  gghighlight(max(pcgpd_change), max_highlight = 4, use_direct_label = FALSE) + \n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n                     breaks = c(seq(1950, 2010, 10)),\n                     limits = c(1950, 2010)) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     breaks = 0:8 * 5000,\n                     labels = scales::dollar,\n                     limits = c(0, 40000)) +\n  labs(x = \"Year\",\n       y = \"Change in per-capita GDP (US dollars)\") +\n  facet_wrap(~ country) +\n  theme(panel.spacing = unit(20L, \"pt\"))\n```\n\n## Text and Annotation\n\n------------------------------------------------------------------------\n\nSeveral functions can be used to annotate, label, and highlight different parts of plots. `geom_text()` and `geom_text_repel()` both display variables from data frames. `annotate()`, which has several different uses, displays variables and values included in the function call.\n\n### geom_text()\n\n`geom_text()` turns text variables in data sets into geometric objects. This is useful for labeling data in plots. 
Both `geom_text()` and `geom_text_repel()` need `x` values and `y` values to determine placement on the coordinate plane, and a text vector of labels.\n\nThis can be used to label `geom_bar()`.\n\n```{r bar-geom_text}\ndiamonds %>%\n  group_by(cut) %>%\n  summarize(price = mean(price)) %>%\n  ggplot(aes(cut, price)) +\n  geom_bar(stat = \"identity\") +\n  geom_text(aes(label = scales::dollar(price)), vjust = -1) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.2)),\n\t\t\t\t\t\t\t\t\t   labels = scales::dollar) +\n  labs(title = \"Average Diamond Price by Diamond Cut\",\n       x = \"Cut\",\n       y = \"Price\") +\n  remove_ticks()\n```\n\nIt can also be used to label points in a scatter plot.\n\nIt's rarely useful to label every point in a scatter plot. Use `filter()` to create a second data set that is subsetted and pass it into the labelling function.\n\n```{r scatterplot-geom_text}\nlabels <- mtcars %>%\n\trownames_to_column(\"model\") %>%\n\tfilter(model %in% c(\"Toyota Corolla\", \"Merc 240D\", \"Datsun 710\"))\n\nmtcars %>%\n\tggplot() +\n\tgeom_point(mapping = aes(x = wt, y = mpg)) +\n\tgeom_text(data = labels, mapping = aes(x = wt, y = mpg, label = model), nudge_x = 0.38) +\n  scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n\t\t\t\t\t\t\t\t\t   limits = c(0, 6)) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n\t\t\t\t\t\t\t\t\t   limits = c(0, 40)) + \n  labs(x = \"Weight (thousands of pounds)\",\n       y = \"Miles per gallon (MPG)\") +\n  scatter_grid()\n```\n\nText too often overlaps with other text or geoms when using `geom_text()`. `library(ggrepel)` is a `library(ggplot2)` add-on that automatically positions text so it doesn't overlap with geoms or other text. To add this functionality, install and load `library(ggrepel)` and then use `geom_text_repel()` with the same syntax as `geom_text()`.\n\n### geom_text_repel()\n\n```{r scatterplot-geom_text_repel}\nlibrary(ggrepel)\n\nlabels <- mtcars %>%\n\trownames_to_column(\"model\") %>%\n\ttop_n(5, mpg)\n\nmtcars %>%\n\tggplot(mapping = aes(x = wt, y = mpg)) +\n\tgeom_point() +\n\tgeom_text_repel(data = labels, \n\t                mapping = aes(label = model), \n\t                nudge_x = 0.38) +\n  scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n\t\t\t\t\t\t\t\t\t   limits = c(0, 6)) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n\t\t\t\t\t\t\t\t\t   limits = c(0, 40)) + \n  labs(x = \"Weight (thousands of pounds)\",\n       y = \"Miles per gallon (MPG)\") +\n  scatter_grid()\n```\n\n### annotate()\n\n`annotate()` doesn't use data frames. Instead, it takes values for `x =` and `y =`. 
It can add text, rectangles, segments, and pointrange.\n\n```{r annotate-point}\nmsleep %>%\n filter(bodywt <= 1000) %>%\n ggplot(aes(bodywt, sleep_total)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(-10, 1000),\n \t\t\t\t\t\t\t\t\t labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 25)) + \n annotate(\"text\", x = 500, y = 12, label = \"These data suggest that heavy \\n animals sleep less than light animals\") +\n labs(x = \"Body weight (pounds)\",\n y = \"Sleep time (hours)\") +\n scatter_grid() \n```\n\n```{r annotate-rect}\nlibrary(AmesHousing)\n\names <- make_ames()\n\names %>%\n mutate(square_footage = Total_Bsmt_SF - Bsmt_Unf_SF + First_Flr_SF + Second_Flr_SF) %>%\n mutate(Sale_Price = Sale_Price / 1000) %>% \n ggplot(aes(square_footage, Sale_Price)) +\n geom_point(alpha = 0.2) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(-10, 12000),\n \t\t\t\t\t\t\t\t\t labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 800),\n \t\t\t\t\t\t\t\t\t labels = scales::dollar) + \n annotate(\"rect\", xmin = 6800, xmax = 11500, ymin = 145, ymax = 210, alpha = 0.1) +\n annotate(\"text\", x = 8750, y = 230, label = \"Unfinished homes\") +\n labs(x = \"Square footage\", \n y = \"Sale price (thousands)\") +\n scatter_grid() \n```\n\n## Layered Geoms\n\n------------------------------------------------------------------------\n\nGeoms can be layered in `ggplot2`. This is useful for design and analysis.\n\nIt is often useful to add points to line plots with a small number of values across the x-axis. This example from [R for Data Science](http://r4ds.had.co.nz/tidy-data.html) shows how changing the line to grey can be appealing.\n\n### Design {.tabset}\n\n#### Before\n\n```{r layering-geoms-design}\ntable1 %>%\n\tggplot(aes(x = year, y = cases)) +\n\t\tgeom_line(aes(color = country)) +\n\t\tgeom_point(aes(color = country)) +\n\t\tscale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n\t\t labels = scales::comma) +\n\t\tscale_x_continuous(breaks = c(1999, 2000)) +\n\t\tlabs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n```\n\n#### After\n\n```{r layering-geoms-design-gray}\ntable1 %>%\n\tggplot(aes(year, cases)) +\n\t\tgeom_line(aes(group = country), color = \"grey50\") +\n\t\tgeom_point(aes(color = country)) +\n\t\tscale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n\t\t labels = scales::comma) +\n\t\tscale_x_continuous(breaks = c(1999, 2000)) +\n\t\tlabs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n```\n\n### Centroids\n\n```{r centroids}\nmpg_summary <- mpg %>%\n\tgroup_by(cyl) %>%\n\tsummarize(displ = mean(displ), cty = mean(cty))\n\nmpg %>%\n\tggplot() +\n\tgeom_point(aes(x = displ, y = cty, color = factor(cyl)), alpha = 0.5) +\n\tgeom_point(data = mpg_summary, aes(x = displ, y = cty), size = 5, color = \"#ec008b\") +\n\tgeom_text(data = mpg_summary, aes(x = displ, y = cty, label = cyl)) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 8)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0)), \n limits = c(0, 40)) +\n\tlabs(x = \"Displacement\",\n\t y = \"City MPG\") +\n scatter_grid()\n```\n\n## Saving Plots\n\n------------------------------------------------------------------------\n\n`ggsave()` exports ggplot2 plots. The function can be used in two ways. 
If `plot =` isn't specified in the function call, then `ggsave()` automatically saves the plot that was last displayed in the Viewer window. Second, if `plot =` is specified, then `ggsave()` saves the specified plot. `ggsave()` guesses the type of graphics device to use in export (.png, .pdf, .svg, etc.) from the file extension in the filename.\n\n mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\n ggsave(filename = \"cars.png\")\n\n plot2 <- mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\n ggsave(filename = \"cars.png\", plot = plot2)\n\nExported plots rarely look identical to the plots that show up in the Viewer window in RStudio because the overall size and aspect ratio of the Viewer is often different than the defaults for `ggsave()`. Specific sizes, aspect ratios, and resolutions can be controlled with arguments in `ggsave()`. RStudio has a useful [cheatsheet](https://www.rstudio.com/resources/cheatsheets/) called \"How Big is Your Graph?\" that should help with choosing the best size, aspect ratio, and resolution.\n\nFonts are not embedded in PDFs by default. To embed fonts in PDFs, include `device = cairo_pdf` in `ggsave()`.\n\n plot <- mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\n ggsave(filename = \"cars.pdf\", plot = plot2, width = 6.5, height = 4, device = cairo_pdf)\n\n## urbnthemes\n\n### Overview\n\n`urbnthemes` is a set of tools for creating Urban Institute-themed plots and maps in R. The package extends `ggplot2` with print and map themes as well as tools that make plotting easier at the Urban Institute. `urbnthemes` replaces the [urban_R\\_theme](https://github.com/UrbanInstitute/urban_R_theme).\n\nAlways load `library(urbnthemes)` after `library(ggplot2)` or `library(tidyverse)`.\n\n### Usage\n\nUse `set_urbn_defaults(style = \"print\")` to set the default styles. `scatter_grid()`, `remove_ticks()`, `add_axis()`, and `remove_axis()` can all be used to improve graphics.\n\n```{r example, message=FALSE}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nggplot(data = mtcars, mapping = aes(factor(cyl))) +\n geom_bar() + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Number of Cylinders\",\n y = \"Count\") +\n remove_ticks()\n```\n\n### Combining elements\n\n`library(urbnthemes)` contains functions for combining plot elements into graphics. `urbn_plot()` brings all of the elements together.\n\n- `urbn_logo_text()`\n- `remove_ticks()`\n- `remove_axis()`\n- `scatter_grid()`\n- `add_axis()`\n- `urbn_geofacet`\n\n```{r example2}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nplot <- ggplot(data = mtcars, mapping = aes(factor(cyl))) +\n geom_bar() + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Number of Cylinders\",\n y = \"Count\") +\n remove_ticks()\n\nurbn_plot(plot, urbn_logo_text(), ncol = 1, heights = c(30, 1))\n```\n\nSometimes it's important to horizontally add the y-axis title above the plot. `urbn_y_title()` can be sued for this task. 
The following example goes one step further and adds the title between the legend and the plot.\n\n```{r}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults()\n\nplot <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = factor(cyl))) +\n geom_point() + \n\tscale_x_continuous(expand = c(0, 0),\n\t\t\t\t\t\t\t\t\t\t limits = c(0, 8)) +\n scale_y_continuous(expand = c(0, 0),\n \t\t\t\t\t\t\t\t\t limits = c(0, 40)) +\n remove_ticks() +\n\tlabs(\"\") +\n\tscatter_grid()\n\nurbn_plot(get_legend(plot),\n\t\t\t\t\turbn_y_title(\"Miles per gallon\"),\n\t\t\t\t\tremove_legend(plot), \n\t\t\t\t\turbn_logo_text(), \n\t\t\t\t\tncol = 1, \n\t\t\t\t\theights = c(3, 1, 30, 1))\n```\n\n### Palettes\n\n`urbnthemes` contains many quick-access color palettes from the [Urban Institute Data Visualization Style Guide](http://urbaninstitute.github.io/graphics-styleguide/). These palettes can be used to quickly overwrite default color palettes from `urbnthemes`.\n\n- `palette_urbn_main` is the eight color discrete palette of the Urban Institute with cyan, yellow, black, gray, magenta, green, space gray, and red.\n- `palette_urbn_diverging` is an eight color diverging palette.\n- `palette_urbn_quintile` is a five color blue palette that is good for quintiles.\n- `palette_urbn_politics` is a two color palette with blue for Democrats and red for Republicans.\n\nThere are seven palettes that are continuous palettes of the seven unique colors in the discrete Urban Institute color palette:\n\n- `palette_urbn_cyan`\n- `palette_urbn_gray`\n- `palette_urbn_yellow`\n- `palette_urbn_magenta`\n- `palette_urbn_green`\n- `palette_urbn_spacegray`\n- `palette_urbn_red`\n\nUse `view_palette()` to see the palette:\n\n```{r view-palette}\nview_palette(palette_urbn_magenta)\n```\n\nThe vectors can be subset using base R syntax. This allows for the quick selection of specific colors from a palette.\n\n```{r palette-subset1}\npalette_urbn_main[1:4]\n```\n\n```{r palette-subset2}\npalette_urbn_spacegray[1:5]\n```\n\n### Utility functions\n\n`library(urbnthemes)` contains four functions that are helpful with managing font instalations:\n\n- `lato_test()`\n- `lato_install()`\n- `fontawesome_test()`\n- `fontawesome_install()`\n\n## Bibliography and Session Information\n\n------------------------------------------------------------------------\n\n*Note:* Examples present in [this document](https://awunderground.github.io/ggplot2-themes/) by Aaron Williams were created during personal time.\n\nBob Rudis and Dave Gandy (2017). waffle: Create Waffle Chart Visualizations in R. R package version 0.7.0. https://CRAN.R-project.org/package=waffle\n\nChester Ismay and Jennifer Chunn (2017). fivethirtyeight: Data and Code Behind the Stories and Interactives at 'FiveThirtyEight'. R package version 0.3.0. https://CRAN.R-project.org/package=fivethirtyeight\n\nHadley Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2009.\n\nHadley Wickham (2017). tidyverse: Easily Install and Load the 'Tidyverse'. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\n\nHadley Wickham (2017). forcats: Tools for Working with Categorical Variables (Factors). R package version 0.2.0. https://CRAN.R-project.org/package=forcats\n\nJennifer Bryan (2017). gapminder: Data from Gapminder. R package version 0.3.0. https://CRAN.R-project.org/package=gapminder\n\nKamil Slowikowski (2017). ggrepel: Repulsive Text and Label Geoms for 'ggplot2'. R package version 0.7.0. 
https://CRAN.R-project.org/package=ggrepel\n\nMax Kuhn (2017). AmesHousing: The Ames Iowa Housing Data. R package version 0.0.3. https://CRAN.R-project.org/package=AmesHousing\n\nPeter Kampstra (2008). Beanplot: A Boxplot Alternative for Visual Comparison of Distributions, Journal of Statistical Software, 2008. https://www.jstatsoft.org/article/view/v028c01\n\nR Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.\n\nWinston Chang, (2014). extrafont: Tools for using fonts. R package version 0.17. https://CRAN.R-project.org/package=extrafont\n\nYihui Xie (2018). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.19.\n\n```{r System Info and Package Versioning}\nsessionInfo()\n```\n"},"formats":{"html":{"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[]},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"graphics-guide.html"},"language":{},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.2.269","editor_options":{"chunk_output_type":"console"}},"extensions":{"book":{"multiFile":true}}}}}
\ No newline at end of file
+{"title":"Urban Institute R Graphics Guide","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"editor_options":{"chunk_output_type":"console"}},"headingText":"Urban Institute R Graphics Guide","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n\n```{r setup, include=FALSE}\nlibrary(knitr)\nlibrary(datasets)\nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nopts_chunk$set(fig.path = \"graphics-guide/www/images/\")\nopts_chunk$set(echo = TRUE)\nopts_chunk$set(warning = FALSE)\nopts_chunk$set(message = FALSE)\nopts_chunk$set(fig.width = 6.5)\nopts_chunk$set(fig.height = 4)\nopts_chunk$set(fig.retina = 3)\noptions(scipen = 999)\n```\n\nR is a powerful, open-source programming language and environment. R excels at data management and munging, traditional statistical analysis, machine learning, and reproducible research, but it is probably best known for its graphics. This guide contains examples and instructions for popular and lesser-known plotting techniques in R. It also includes instructions for using `urbnthemes`, the Urban Institute's R package for creating near-publication-ready plots with `ggplot2`. If you have any questions, please don't hesitate to contact Aaron Williams (awilliams\\@urban.org) or Kyle Ueyama (kueyama\\@urban.org).\n\n### Background\n\n`library(urbnthemes)` makes `ggplot2` output align more closely with [the Urban Institute's Data Visualization style guide](http://urbaninstitute.github.io/graphics-styleguide/). This package does **not produce publication ready graphics**. Visual styles must still be edited using your project/paper's normal editing workflow.\n\nExporting charts as a pdf will allow them to be more easily edited. See the Saving Plots section for more information.\n\nThe theme has been tested against `ggplot2 version 3.0.0`. It will not function properly with older versions of `ggplot2`\n\n### Using library(urbnthemes)\n\nRun the following code to install or update `urbnthemes`:\n\n``` \ninstall.packages(\"remotes\")\nremotes::install_github(\"UrbanInstitute/urbnthemes\")\n```\n\nRun the following code at the top of each script:\n\n``` \nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n```\n\n### Installing Lato {#installing_lato}\n\nYour Urban computer may not have the Lato font installed. If it is not installed, please install the free [Lato font from Google](https://www.google.com/fonts/specimen/Lato). Below are step by step instructions:\n\n1) Download the [Lato font](https://www.google.com/fonts/specimen/Lato) (as a zip file).\n2) Unzip the file on your computer.\n3) For each `.ttf` file in the unzipped `Lato/` folder, double click the file and click `Install` (on Windows) or `Install Font` (on Mac).\n4) Import and register Lato into R by running `urbnthemes::lato_import()` in the console once. Be patient as this may take a few minutes!\n5) To confirm installation, run `urbnthemes::lato_test()`. If this is successful you're done and Lato will automatically be used when creating plots with `library(urbnthemes)`. You only need to install Lato once per computer.\n\nWaffle charts with glyphs require fontawesome. `fontawesome_test()` and `fontawesome_install()` are the fontawesome versions of the above functions. 
Be sure to install fontawesome from [here](https://github.com/hrbrmstr/waffle/tree/master/inst/fonts) first.\n\n### Grammar of Graphics and Conventions\n\nHadley Wickham's ggplot2 is based on Leland Wilkinson's [*The Grammar of Graphics*](https://www.amazon.com/Grammar-Graphics-Statistics-Computing/dp/0387245448) and Wickham's [*A Layered Grammar of Graphics*](http://vita.had.co.nz/papers/layered-grammar.html). The layered grammar of graphics is a structured way of thinking about the components of a plot, which then lend themselves to the simple structure of ggplot2.\n\n- **Data** are what are visualized in a plot and **mappings** are directions for how data are mapped in a plot in a way that can be perceived by humans.\\\n- **Geoms** are representations of the actual data like points, lines, and bars.\n- **Stats** are statistical transformations that represent summaries of the data like histograms.\n- **Scales** map values in the data space to values in the aesthetic space. Scales draw legends and axes.\n- **Coordinate Systems** describe how geoms are mapped to the plane of the graphic.\\\n- **Facets** break the data into meaningful subsets like small multiples.\n- **Themes** control the finer points of a plot such as fonts, font sizes, and background colors.\n\nMore information: [ggplot2: Elegant Graphics for Data Analysis](https://www.amazon.com/ggplot2-Elegant-Graphics-Data-Analysis/dp/0387981403)\n\n### Tips and Tricks\n\n- `ggplot2` expects data to be in data frames or tibbles. It is preferable for the data frames to be \"tidy\" with each variable as a column, each observation as a row, and each observational unit as a separate table. `dplyr` and `tidyr` contain concise and effective tools for \"tidying\" data.\n\n- R allows function arguments to be called explicitly by name and implicitly by position. The coding examples in this guide only contain named arguments for clarity.\n\n- Graphics will sometimes render differently on different operating systems. This is because anti-aliasing is activated in R on Mac and Linux but not activated in R on Windows. This won't be an issue once graphics are saved.\n\n- Continuous x-axes have ticks. Discrete x-axes do not have ticks. Use `remove_ticks()` to remove ticks.\n\n## Bar Plots\n\n------------------------------------------------------------------------\n\n### One Color\n\n```{r barplots}\nmtcars %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = factor(cyl), y = n)) +\n geom_col() +\n geom_text(mapping = aes(label = n), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) +\n remove_ticks() +\n remove_axis() \n```\n\n### One Color (Rotated)\n\nThis example introduces `coord_flip()` and `remove_axis(axis = \"x\", flip = TRUE)`. `remove_axis()` is from `library(urbnthemes)` and creates a custom theme for rotated bar plots.\n\n```{r barplot-rotated}\nmtcars %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = factor(cyl), y = n)) +\n geom_col() +\n geom_text(mapping = aes(label = n), hjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n coord_flip() +\n remove_axis(axis = \"x\", flip = TRUE)\n```\n\n### Three Colors\n\nThis is identical to the previous plot except colors and a legend are added with `fill = cyl`. Turning `x` into a factor with `factor(cyl)` skips 5 and 7 on the x-axis. 
Adding `fill = cyl` without `factor()` would have created a continuous color scheme and legend.\n\n```{r 3-color-barplot}\nmtcars %>%\n mutate(cyl = factor(cyl)) %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = cyl, y = n, fill = cyl)) +\n geom_col() +\n geom_text(mapping = aes(label = n), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) +\n remove_ticks() +\n remove_axis()\n```\n\n### Stacked Bar Plot\n\nAn additional aesthetic can easily be added to bar plots by adding `fill = categorical variable` to the mapping. Here, transmission type subsets each bar showing the count of cars with different numbers of cylinders.\n\n```{r stacked-bar-plot}\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>% \n group_by(am) %>%\n count(cyl) %>%\n group_by(cyl) %>%\n arrange(desc(am)) %>%\n mutate(label_height = cumsum(n)) %>%\n ggplot() +\n geom_col(mapping = aes(x = cyl, y = n, fill = am)) +\n geom_text(aes(x = cyl, y = label_height - 0.5, label = n, color = am)) +\n scale_color_manual(values = c(\"white\", \"black\")) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n remove_axis() +\n guides(color = \"none\")\n```\n\n### Stacked Bar Plot With Position = Fill\n\nThe previous examples used `geom_col()`, which takes a y value for bar height. This example uses `geom_bar()`, which counts the observations in each group to generate bar heights. In this example, `position = \"fill\"` in `geom_bar()` changes the y-axis from count to the proportion of each bar.\n\n```{r stacked-bar-plot-fill}\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>% \n ggplot() +\n geom_bar(mapping = aes(x = cyl, fill = am), position = \"fill\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1)), labels = scales::percent) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n guides(color = \"none\")\n```\n\n### Dodged Bar Plot\n\nSubsetted bar charts in ggplot2 are stacked by default. `position = \"dodge\"` in `geom_col()` expands the bar chart so the bars appear next to each other.\n\n```{r dodged-bar-plot}\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>%\n group_by(am) %>%\n count(cyl) %>%\n ggplot(mapping = aes(cyl, y = n, fill = factor(am))) +\n geom_col(position = \"dodge\") +\n geom_text(aes(label = n), position = position_dodge(width = 0.7), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n remove_axis()\n```\n\n### Lollipop plot/Cleveland dot plot {.tabset}\n\nLollipop plots and Cleveland dot plots are minimalist alternatives to bar plots. The key to both plots is to order the data based on the continuous variable using `arrange()` and then turn the discrete variable into a factor with the ordered levels of the continuous variable using `mutate()`. 
This step \"stores\" the order of the data.\n\n#### Lollipop plot\n\n```{r lollipop-plot, fig.height = 5}\nmtcars %>%\n\trownames_to_column(\"model\") %>%\n\tarrange(mpg) %>%\n\tmutate(model = factor(model, levels = .$model)) %>%\n\tggplot(aes(mpg, model)) +\n\t\tgeom_segment(aes(x = 0, xend = mpg, y = model, yend = model)) +\t\n\t\tgeom_point() +\n\t\tscale_x_continuous(expand = expansion(mult = c(0, 0)), limits = c(0, 40)) +\n\t\tlabs(x = NULL, \n\t\t\t\t y = \"Miles Per Gallon\")\n```\n\n#### Cleveland dot plot\n\n```{r cleveland-dot-plot, fig.height = 5}\nmtcars %>%\n\trownames_to_column(\"model\") %>%\n\tarrange(mpg) %>%\n\tmutate(model = factor(model, levels = .$model)) %>%\n\tggplot(aes(mpg, model)) +\n\t\tgeom_point() +\n\t\tscale_x_continuous(expand = expansion(mult = c(0, 0)), limits = c(0, 40)) +\n\t\tlabs(x = NULL, \n\t\t\t\t y = \"Miles Per Gallon\")\n```\n\n### Dumbell plot\n\n## Scatter Plots\n\n------------------------------------------------------------------------\n\n### One Color Scatter Plot\n\nScatter plots are useful for showing relationships between two or more variables. Use `scatter_grid()` from `library(urbnthemes)` to easily add vertical grid lines for scatter plots.\n\n```{r one-color-scatter-plot}\nmtcars %>%\n ggplot(mapping = aes(x = wt, y = mpg)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:8 * 5) +\n labs(x = \"Weight (thousands of pounds)\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n### High-Density Scatter Plot with Transparency\n\nLarge numbers of observations can sometimes make scatter plots tough to interpret because points overlap. Adding `alpha =` with a number between 0 and 1 adds transparency to points and clarity to plots. Now it's easy to see that jewelry stores are probably rounding up but not rounding down carats!\n\n```{r alpha-scatter-plot}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 20000),\n breaks = 0:4 * 5000,\n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n```\n\n### Hex Scatter Plot\n\nSometimes transparency isn't enough to bring clarity to a scatter plot with many observations. As n increases into the hundreds of thousands and even millions, `geom_hex` can be one of the best ways to display relationships between two variables.\n\n```{r scatter-plot-hex}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_hex(mapping = aes(fill = after_stat(count))) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 20000),\n breaks = 0:4 * 5000,\n labels = scales::dollar) +\n\tscale_fill_gradientn(labels = scales::comma) + \n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid() +\n theme(legend.position = \"right\",\n legend.direction = \"vertical\")\n```\n\n### Scatter Plots With Random Noise {.tabset}\n\nSometimes scatter plots have many overlapping points but a reasonable number of observations. `geom_jitter` adds a small amount of random noise so points are less likely to overlap. `width` and `height` control the amount of noise that is added. 
In the following before-and-after, notice how many more points are visible after adding jitter.\n\n#### Before\n\n```{r before-scatter-plot}\nmpg %>%\n ggplot(mapping = aes(x = displ, y = cty)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 8),\n breaks = 0:8) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:4 * 10) +\n labs(x = \"Displacement\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n#### After\n\n```{r jitter-plot}\nset.seed(2017)\nmpg %>%\n ggplot(mapping = aes(x = displ, y = cty)) +\n geom_jitter() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 8),\n breaks = 0:8) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:4 * 10) +\n labs(x = \"Displacement\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n### Scatter Plots with Varying Point Size\n\nWeights and populations can be mapped in scatter plots to the size of the points. Here, the number of households in each state is mapped to the size of each point using `aes(size = hhpop)`. Note: `ggplot2::geom_point()` is used instead of `geom_point()`.\n\n```{r geom_point-size, fig.height = 5}\nurbnmapr::statedata %>%\n ggplot(mapping = aes(x = medhhincome, y = horate)) +\n ggplot2::geom_point(mapping = aes(size = hhpop), alpha = 0.3) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(30000, 80000),\n breaks = 3:8 * 10000,\n labels = scales::dollar) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 0.8),\n breaks = 0:4 * 0.2) +\n scale_radius(range = c(3, 15),\n breaks = c(2500000, 7500000, 12500000), \n labels = scales::comma) +\n labs(x = \"Household income\",\n y = \"Homeownership rate\") +\n scatter_grid() +\n\ttheme(plot.margin = margin(r = 20))\n```\n\n### Scatter Plots with Fill\n\nA third aesthetic can be added to scatter plots. Here, color signifies the number of cylinders in each car. 
Before `ggplot()` is called, the `cyl` labels are created using `library(dplyr)` and the piping operator `%>%`.\n\n```{r filled-scatter-plot}\nmtcars %>%\n mutate(cyl = paste(cyl, \"cylinders\")) %>%\n ggplot(aes(x = wt, y = mpg, color = cyl)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:8 * 5) +\n labs(x = \"Weight (thousands of pounds)\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n## Line Plots\n\n------------------------------------------------------------------------\n\n```{r line-plots}\neconomics %>%\n ggplot(mapping = aes(x = date, y = unemploy)) +\n geom_line() +\n scale_x_date(expand = expansion(mult = c(0.002, 0)), \n breaks = \"10 years\",\n limits = c(as.Date(\"1961-01-01\"), as.Date(\"2020-01-01\")),\n date_labels = \"%Y\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 4000,\n limits = c(0, 16000),\n labels = scales::comma) +\n labs(x = \"Year\", \n y = \"Number Unemployed (1,000s)\")\n```\n\n### Line Plots With Multiple Lines\n\n```{r multiple-line-charts1}\nlibrary(gapminder)\n\ngapminder %>%\n filter(country %in% c(\"Australia\", \"Canada\", \"New Zealand\")) %>%\n mutate(country = factor(country, levels = c(\"Canada\", \"Australia\", \"New Zealand\"))) %>%\n ggplot(aes(year, gdpPercap, color = country)) +\n geom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n breaks = c(1952 + 0:12 * 5), \n limits = c(1952, 2007)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:8 * 5000,\n labels = scales::dollar, \n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Per capita GDP (US dollars)\")\n```\n\nPlotting more than one variable can be useful for seeing the relationship of variables over time, but it takes a small amount of data munging.\n\nThis is because `ggplot2` wants data in a \"long\" format instead of a \"wide\" format for line plots with multiple lines. `gather()` and `spread()` from the `tidyr` package make switching back-and-forth between \"long\" and \"wide\" painless. Essentially, variable titles go into \"key\" and variable values go into \"value\". Then `ggplot2` turns the different levels of the key variable (population, unemployment) into colors.\n\n```{r multiple-line-charts2}\nas_tibble(EuStockMarkets) %>%\n\tmutate(date = time(EuStockMarkets)) %>%\n\tgather(key = \"key\", value = \"value\", -date) %>%\n\tggplot(mapping = aes(x = date, y = value, color = key)) +\n\tgeom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(1991, 1999), \n breaks = c(1991, 1993, 1995, 1997, 1999)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 2500,\n labels = scales::dollar, \n limits = c(0, 10000)) + \n\tlabs(x = \"Date\",\n\t\t\t y = \"Value\")\n```\n\n### Step plot\n\n`geom_line()` connects coordinates with the shortest possible straight line. Sometimes step plots are necessary because y values don't change between coordinates. 
For example, the upper-bound of the Federal Funds Rate is set at regular intervals and remains constant until it is changed.\n\n```{r step-plot}\n# downloaded from FRED on 2018-12-06\n\n# https://fred.stlouisfed.org/series/DFEDTARU\n\nfed_fund_rate <- read_csv(\n \"date, fed_funds_rate\n 2014-01-01,0.0025\n 2015-12-16,0.0050\n 2016-12-14,0.0075\n 2017-03-16,0.0100\n 2017-06-15,0.0125\n 2017-12-14,0.0150\n 2018-03-22,0.0175\n 2018-06-14,0.0200\n 2018-09-27,0.0225\n 2018-12-06,0.0225\")\n\nfed_fund_rate %>%\n ggplot(mapping = aes(x = date, y = fed_funds_rate)) + \n geom_step() +\n scale_x_date(expand = expansion(mult = c(0.002, 0)), \n breaks = \"1 year\",\n limits = c(as.Date(\"2014-01-01\"), as.Date(\"2019-01-01\")),\n date_labels = \"%Y\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = c(0, 0.01, 0.02, 0.03),\n limits = c(0, 0.03),\n labels = scales::percent) + \n\tlabs(x = \"Date\",\n\t\t\t y = \"Upper-bound of the Federal Funds Rate\")\n```\n\n### Path plot\n\nThe Beveridge curve is a macroeconomic plot that displays a relationship between the unemployment rate and the vacancy rate. Movements along the curve indicate changes in the business cycle and horizontal shifts of the curve suggest structural changes in the labor market.\n\nLines in Beveridge curves do not monotonically move from left to right. Therefore, it is necessary to use `geom_path()`.\n\n```{r, path-plot}\n# seasonally-adjusted, quarterly vacancy rate - JOLTS\n# seasonally-adjusted, quarterly unemployment rate - CPS\n\n# pulled from FRED on April 11, 2018. \n\nlibrary(ggrepel)\n\nbeveridge <- read_csv(\n\t\"quarter, vacancy_rate, unemployment_rate\n\t2006-01-01,0.0310,0.0473\n\t2006-04-01,0.0316,0.0463\n\t2006-07-01,0.0313,0.0463\n\t2006-10-01,0.0310,0.0443\n\t2007-01-01,0.0323,0.0450\n\t2007-04-01,0.0326,0.0450\n\t2007-07-01,0.0316,0.0466\n\t2007-10-01,0.0293,0.0480\n\t2008-01-01,0.0286,0.0500\n\t2008-04-01,0.0280,0.0533\n\t2008-07-01,0.0253,0.0600\n\t2008-10-01,0.0220,0.0686\n\t2009-01-01,0.0196,0.0826\n\t2009-04-01,0.0180,0.0930\n\t2009-07-01,0.0176,0.0963\n\t2009-10-01,0.0180,0.0993\n\t2010-01-01,0.0196,0.0983\n\t2010-04-01,0.0220,0.0963\n\t2010-07-01,0.0216,0.0946\n\t2010-10-01,0.0220,0.0950\n\t2011-01-01,0.0226,0.0903\n\t2011-04-01,0.0236,0.0906\n\t2011-07-01,0.0250,0.0900\n\t2011-10-01,0.0243,0.0863\n\t2012-01-01,0.0270,0.0826\n\t2012-04-01,0.0270,0.0820\n\t2012-07-01,0.0266,0.0803\n\t2012-10-01,0.0260,0.0780\n\t2013-01-01,0.0276,0.0773\n\t2013-04-01,0.0280,0.0753\n\t2013-07-01,0.0280,0.0723\n\t2013-10-01,0.0276,0.0693\n\t2014-01-01,0.0290,0.0666\n\t2014-04-01,0.0323,0.0623\n\t2014-07-01,0.0326,0.0610\n\t2014-10-01,0.0330,0.0570\n\t2015-01-01,0.0350,0.0556\n\t2015-04-01,0.0366,0.0540\n\t2015-07-01,0.0373,0.0510\n\t2015-10-01,0.0360,0.0500\n\t2016-01-01,0.0386,0.0493\n\t2016-04-01,0.0383,0.0486\n\t2016-07-01,0.0383,0.0493\n\t2016-10-01,0.0363,0.0473\n\t2017-01-01,0.0366,0.0466\n\t2017-04-01,0.0390,0.0433\n\t2017-07-01,0.0406,0.0430\n\t2017-10-01,0.0386,0.0410\")\n\nlabels <- beveridge %>%\n filter(lubridate::month(quarter) == 1)\n\nbeveridge %>%\n\tggplot() +\n\tgeom_path(mapping = aes(x = unemployment_rate, y = vacancy_rate), alpha = 0.5) +\n geom_point(data = labels, mapping = aes(x = unemployment_rate, y = vacancy_rate)) +\n geom_text_repel(data = labels, mapping = aes(x = unemployment_rate, y = vacancy_rate, label = lubridate::year(quarter))) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0.04, 0.1),\n labels = scales::percent) +\n 
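# the vacancy rate is also stored as a decimal, so give the y-axis percent labels too\n 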
scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = c(0, 0.01, 0.02, 0.03, 0.04, 0.05),\n limits = c(0, 0.05),\n labels = scales::percent) + \n\tlabs(x = \"Seasonally-adjusted unemployment rate\",\n\t\t\t y = \"Seasonally-adjusted vacancy rate\") + \n scatter_grid()\n```\n\n### Slope plots\n\n```{r slope-plot, fig.height = 5}\n# https://www.bls.gov/lau/\nlibrary(ggrepel)\n\nunemployment <- tibble(\n\ttime = c(\"October 2009\", \"October 2009\", \"October 2009\", \"August 2017\", \"August 2017\", \"August 2017\"),\n\trate = c(7.4, 7.1, 10.0, 3.9, 3.8, 6.4),\n\tstate = c(\"Maryland\", \"Virginia\", \"Washington, D.C.\", \"Maryland\", \"Virginia\", \"Washington, D.C.\")\n)\n\nlabel <- tibble(label = c(\"October 2009\", \"August 2017\"))\noctober <- filter(unemployment, time == \"October 2009\")\naugust <- filter(unemployment, time == \"August 2017\")\n\nunemployment %>%\n\tmutate(time = factor(time, levels = c(\"October 2009\", \"August 2017\")),\n\t state = factor(state, levels = c(\"Washington, D.C.\", \"Maryland\", \"Virginia\"))) %>%\n\tggplot() + \n\tgeom_line(aes(time, rate, group = state, color = state), show.legend = FALSE) +\n\tgeom_point(aes(x = time, y = rate, color = state)) +\n\tlabs(subtitle = \"Unemployment Rate\") +\n\ttheme(axis.ticks.x = element_blank(),\n\t\t\t\taxis.title.x = element_blank(),\n\t\t\t\taxis.ticks.y = element_blank(),\n axis.title.y = element_blank(), \n axis.text.y = element_blank(),\n\t\t\t\tpanel.grid.major.y = element_blank(),\n panel.grid.minor.y = element_blank(),\n panel.grid.major.x = element_blank(),\n\t\t\t\taxis.line = element_blank()) +\n\tgeom_text_repel(data = october, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = -0.06) + \n\tgeom_text_repel(data = august, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = 0.06)\n```\n\n## Univariate\n\n------------------------------------------------------------------------\n\nThere are a number of ways to explore the distributions of univariate data in R. Some methods, like strip charts, show all data points. Other methods, like the box and whisker plot, show selected data points that communicate key values like the median and 25th percentile. Finally, some methods don't show any of the underlying data but calculate density estimates. Each method has advantages and disadvantages, so it is worthwhile to understand the different forms. For more information, read [40 years of boxplots](http://vita.had.co.nz/papers/boxplots.pdf) by Hadley Wickham and Lisa Stryjewski.\n\n### Strip Chart\n\nStrip charts, the simplest univariate plot, show the distribution of values along one axis. Strip charts work best with variables that have plenty of variation. If not, the points tend to cluster on top of each other. 
Even if the variable has plenty of variation, it is often important to add transparency to the points with `alpha =` so overlapping values are visible.\n\n```{r stripchart, fig.height=2}\nmsleep %>%\n ggplot(aes(x = sleep_total, y = factor(1))) +\n geom_point(alpha = 0.2, size = 5) +\n labs(y = NULL) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) +\n scale_y_discrete(labels = NULL) +\n labs(title = \"Total Sleep Time of Different Mammals\",\n x = \"Total sleep time (hours)\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n```\n\n### Strip Chart with Highlighting\n\nBecause strip charts show all values, they are useful for showing where selected points lie in the distribution of a variable. The clearest way to do this is by adding `geom_point()` twice with `filter()` in the data argument. This way, the highlighted values show up on top of unhighlighted values.\n\n```{r stripchart-with-highlighting, fig.height=2}\nggplot() +\n geom_point(data = filter(msleep, name != \"Red fox\"), \n aes(x = sleep_total, \n y = factor(1)),\n alpha = 0.2, \n size = 5,\n \t\t\t\t\t color = \"grey50\") +\n geom_point(data = filter(msleep, name == \"Red fox\"),\n aes(x = sleep_total, \n y = factor(1), \n color = name),\n alpha = 0.8,\n size = 5) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) + \n scale_y_discrete(labels = NULL) +\n labs(title = \"Total Sleep Time of Different Mammals\",\n x = \"Total sleep time (hours)\",\n y = NULL) +\n guides(color = guide_legend(title = NULL)) +\n theme(axis.ticks.y = element_blank())\n```\n\n### Subsetted Strip Chart\n\nAdd a y variable to see the distributions of the continuous variable in subsets of a categorical variable.\n\n```{r subsetted-stripchart, fig.height=3}\nlibrary(forcats)\n\nmsleep %>%\n filter(!is.na(vore)) %>%\n mutate(vore = fct_recode(vore, \n \"Insectivore\" = \"insecti\",\n \"Omnivore\" = \"omni\", \n \"Herbivore\" = \"herbi\", \n \"Carnivore\" = \"carni\"\n )) %>%\n ggplot(aes(x = sleep_total, y = vore)) +\n geom_point(alpha = 0.2, size = 5) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) + \n labs(title = \"Total Sleep Time of Different Mammals by Diet\",\n x = \"Total sleep time (hours)\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n```\n\n### Beeswarm Plots\n\nBeeswarm plots are a variation of strip charts that show the distribution of the data, but without the points overlapping.\n\n```{r beeswarm}\nlibrary(ggbeeswarm)\n\ntxhousing %>%\n\tfilter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>% \n ggplot(aes(x = median, y = city)) +\n geom_beeswarm(alpha = 0.2, size = 5) + \n\tscale_x_continuous(labels = scales::dollar) +\n labs(title = \"Household Sale Price by City\",\n x = \"Sale Price\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n```\n\n### Histograms\n\nHistograms divide the distribution of a variable into n equal-sized bins and then count and display the number of observations in each bin. Histograms are sensitive to bin width. 
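For instance, a quick sketch of the same `diamonds` variable with far fewer bins shows how much the story depends on that choice:\n\n```{r histogram-binwidth}\n# the same distribution with only 10 bins loses the detail around typical depths\nggplot(data = diamonds, mapping = aes(x = depth)) + \n geom_histogram(bins = 10) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 100)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), labels = scales::comma) +\n labs(x = \"Depth\",\n y = \"Count\")\n```\n\n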
As `?geom_histogram` notes, \"You should always override \\[the default binwidth\\] value, exploring multiple widths to find the best to illustrate the stories in your data.\"\n\n```{r histogram}\nggplot(data = diamonds, mapping = aes(x = depth)) + \n geom_histogram(bins = 100) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 100)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), labels = scales::comma) +\n labs(x = \"Depth\",\n y = \"Count\")\n```\n\n### Boxplots\n\nBoxplots were invented in the 1970s by John Tukey[^1]. Instead of showing the underlying data or binned counts of the underlying data, they focus on important values like the 25th percentile, median, and 75th percentile.\n\n[^1]: Wickham, H., & Stryjewski, L. (2011). 40 years of boxplots.\n\n```{r box-plot}\nInsectSprays %>%\n ggplot(mapping = aes(x = spray, y = count)) +\n geom_boxplot() +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Type of insect spray\",\n y = \"Number of dead insects\") +\n remove_ticks()\n```\n\n### Smoothed Kernel Density Plots\n\nContinuous variables with smooth distributions are sometimes better represented with smoothed kernel density estimates than histograms or boxplots. `geom_density()` computes and plots a kernel density estimate. Notice the lumps around integers and halves in the following distribution because of rounding.\n\n```{r kernel-density-plot}\ndiamonds %>%\n ggplot(mapping = aes(carat)) +\n geom_density(color = NA) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, NA)) +\n\tscale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Carat\",\n y = \"Density\")\n```\n\n```{r kernel-density-plot-filled}\ndiamonds %>%\n mutate(cost = ifelse(price > 5500, \"More than $5,500 +\", \"$0 to $5,500\")) %>%\n ggplot(mapping = aes(carat, fill = cost)) +\n geom_density(alpha = 0.25, color = NA) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, NA)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Carat\",\n y = \"Density\")\n```\n\n### Ridgeline Plots\n\nRidgeline plots are partially overlapping smoothed kernel density plots faceted by a categorical variable that pack a lot of information into one elegant plot.\n\n```{r ridgeline-plots}\nlibrary(ggridges)\n\nggplot(diamonds, mapping = aes(x = price, y = cut)) +\n\tgeom_density_ridges(fill = \"#1696d2\") +\n labs(x = \"Price\",\n y = \"Cut\")\n```\n\n### Violin Plots\n\nViolin plots are symmetrical displays of smooth kernel density plots.\n\n```{r violin-plot}\nInsectSprays %>%\n ggplot(mapping = aes(x = spray, y = count, fill = spray)) +\n geom_violin(color = NA) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Type of insect spray\",\n y = \"Number of dead insects\") +\n remove_ticks()\n```\n\n### Bean Plot\n\nIndividual outliers and important summary values are not visible in violin plots or smoothed kernel density plots. 
Bean plots, [created by Peter Kampstra in 2008](https://www.jstatsoft.org/article/view/v028c01), are violin plots with data shown as small lines in a one-dimensional strip plot and larger lines for the mean.\n\n```{r beanplot}\nmsleep %>%\n filter(!is.na(vore)) %>%\n mutate(vore = fct_recode(vore, \n \"Insectivore\" = \"insecti\",\n \"Omnivore\" = \"omni\", \n \"Herbivore\" = \"herbi\", \n \"Carnivore\" = \"carni\"\n )) %>%\n ggplot(aes(x = vore, y = sleep_total, fill = vore)) +\n stat_summary(fun = \"mean\",\n colour = \"black\", \n size = 30,\n shape = 95,\n geom = \"point\") +\n geom_violin(color = NA) +\n geom_jitter(width = 0,\n height = 0.05,\n alpha = 0.4,\n shape = \"-\",\n size = 10,\n \t\t\t\t\t\tcolor = \"grey50\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) + \n labs(x = NULL,\n y = \"Total sleep time (hours)\") +\n theme(legend.position = \"none\") +\n remove_ticks()\n```\n\n## Area Plot\n\n------------------------------------------------------------------------\n\n### Stacked Area\n\n```{r area-plot-stack}\ntxhousing %>%\n filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>%\n group_by(city, year) %>%\n summarize(sales = sum(sales)) %>%\n ggplot(aes(x = year, y = sales, fill = city)) +\n geom_area(position = \"stack\") +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(2000, 2015),\n breaks = 2000 + 0:15) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n labels = scales::comma) +\n labs(x = \"Year\",\n y = \"Home sales\")\n```\n\n### Filled Area\n\n```{r area-plot-fill}\ntxhousing %>%\n filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>%\n group_by(city, year) %>%\n summarize(sales = sum(sales)) %>%\n ggplot(aes(x = year, y = sales, fill = city)) +\n geom_area(position = \"fill\") +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(2000, 2015),\n breaks = 2000 + 0:15) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.02)),\n breaks = c(0, 0.25, 0.5, 0.75, 1),\n labels = scales::percent) +\n labs(x = \"Year\",\n y = \"Home sales\")\n```\n\n## Sankey Plot\n\n------------------------------------------------------------------------\n\nSankey plots visualize flows from one set of variables to another. This can be useful for showing outcomes from the start of a program to the end. You'll need to install the `ggsankey` package to create Sankey plots in R. In this example, we make a dummy data set of housing status prior to program start and at exit to show the flow of people between outcomes. A key step is to transform your data set using the `make_long` function from the package. 
This creates a data frame that specifies each of the initial nodes and how they flow into the next stage.\n\n```{r}\n# load the ggsankey package\n# (install it once with remotes::install_github(\"davidsjoberg/ggsankey\"))\nlibrary(ggsankey)\n\n# create a dummy dataset of housing status\ndf <- tibble(entry_status = c(rep(\"Housed\", 7), rep(\"Unhoused\", 15), rep(\"Staying w/ Family\", 8)), \n exit_status = c(rep(\"Housed\", 15), rep(\"Unhoused\", 2), rep(\"Staying w/ Family\", 13))) %>% \n\t# transform the data frame into the proper format for the sankey plot\n make_long(entry_status, exit_status) %>% \n\t# recode the labels to be cleaner in the plot \n mutate(x = recode(x, entry_status = \"Prior Housing Status\", exit_status = \"Exit Housing Status\"),\n next_x = recode(next_x, entry_status = \"Prior Housing Status\", exit_status = \"Exit Housing Status\"))\n\n# create sankey plot\nggplot(df, aes(x = x, \n next_x = next_x, \n node = node, \n next_node = next_node,\n fill = factor(node), \n label = node)) +\n geom_sankey(flow.alpha = 0.5, node.color = 1, show.legend = FALSE) +\n # add labels to plot and style\n geom_sankey_label(size = 3.5, color = 1, fill = \"white\") +\n theme_sankey(base_size = 16) +\n labs(x = NULL)\n```\n\n## Heat Map\n\n------------------------------------------------------------------------\n\n```{r heat-map}\nlibrary(fivethirtyeight)\n\nbad_drivers %>%\n filter(state %in% c(\"Maine\", \"New Hampshire\", \"Vermont\", \"Massachusetts\", \"Connecticut\", \"New York\")) %>%\n mutate(`Number of\\nDrivers` = scale(num_drivers),\n `Percent\\nSpeeding` = scale(perc_speeding),\n `Percent\\nAlcohol` = scale(perc_alcohol),\n `Percent Not\\nDistracted` = scale(perc_not_distracted),\n `Percent No\\nPrevious` = scale(perc_no_previous),\n state = factor(state, levels = rev(state))\n ) %>%\n select(-insurance_premiums, -losses, -(num_drivers:losses)) %>%\n gather(`Number of\\nDrivers`:`Percent No\\nPrevious`, key = \"variable\", value = \"SD's from Mean\") %>%\n ggplot(aes(variable, state)) +\n geom_tile(aes(fill = `SD's from Mean`)) +\n labs(x = NULL,\n y = NULL) + \n scale_fill_gradientn() +\n theme(legend.position = \"right\",\n legend.direction = \"vertical\",\n axis.line.x = element_blank(),\n panel.grid.major.y = element_blank()) +\n remove_ticks()\n#https://learnr.wordpress.com/2010/01/26/ggplot2-quick-heatmap-plotting/\n```\n\n## Faceting and Small Multiples\n\n------------------------------------------------------------------------\n\n### facet_wrap()\n\nR's faceting system is a powerful way to make \"small multiples\".\n\nSome edits to the theme may be necessary depending upon how many rows and columns are in the plot.\n\n```{r small-multiples, fig.height=2}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n facet_wrap(~cut, ncol = 5) +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 6)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 20000), \n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n```\n\n### facet_grid()\n\n```{r faceting, fig.height=7}\ndiamonds %>%\n filter(color %in% c(\"D\", \"E\", \"F\", \"G\")) %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n facet_grid(color ~ cut) +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 4)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 20000), \n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n 
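# widen the space between the facet panels so the axis labels don't collide\n 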
theme(panel.spacing = unit(20L, \"pt\")) +\n scatter_grid()\n```\n\n## Smoothers\n\n------------------------------------------------------------------------\n\n`geom_smooth()` fits and plots models to data with two or more dimensions.\n\nUnderstanding and manipulating defaults is more important for `geom_smooth()` than other geoms because it contains a number of assumptions. `geom_smooth()` automatically uses loess for datasets with fewer than 1,000 observations and a generalized additive model with `formula = y ~ s(x, bs = \"cs\")` for datasets with more than 1,000 observations. Both default to displaying a 95% confidence interval.\n\nModels are chosen with `method =` and can be set to `lm()`, `glm()`, `gam()`, `loess()`, `rlm()`, and more. Formulas can be specified with `formula =` and `y ~ x` syntax. Plotting the standard error is toggled with `se = TRUE` and `se = FALSE`, and level is specified with `level =`. As always, more information can be seen in RStudio with `?geom_smooth()`.\n\n`geom_point()` adds a scatterplot to `geom_smooth()`. The order of the function calls is important. The function called second will be laid on top of the function called first.\n\n```{r geom_smooth}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n\tgeom_point(alpha = 0.05) +\n\tgeom_smooth(color = \"#ec008b\") +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t limits = c(0, 5),\n\t breaks = 0:5) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 20000), \n labels = scales::dollar) + \n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n```\n\n`geom_smooth` can be subset by categorical and factor variables. This requires subgroups to have a decent number of observations and a fair amount of variability across the x-axis. Confidence intervals often widen at the ends so special care is needed for the chart to be meaningful and readable.\n\nThis example uses loess to model highway MPG as a function of engine displacement.\n\n```{r subset-geom_smooth}\nggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = factor(cyl))) +\n\tgeom_point(alpha = 0.2) +\n\tgeom_smooth() +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t limits = c(0, 7),\n\t breaks = 0:7) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 60)) + \n\tlabs(x = \"Engine displacement\",\n\t\t\t y = \"Highway MPG\") +\n scatter_grid()\n```\n\nThis example uses linear models of highway MPG as a function of engine displacement.\n\n```{r subset-geom-smooth-lm}\nggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = factor(cyl))) +\n\tgeom_point(alpha = 0.2) +\n\tgeom_smooth(method = \"lm\") +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t limits = c(0, 7),\n\t breaks = 0:7) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 60)) + \n\tlabs(x = \"Engine displacement\",\n\t\t\t y = \"Highway MPG\") +\n scatter_grid()\n```\n\n## Highlighting\n\n------------------------------------------------------------------------\n\n[`library(gghighlight)`](https://yutannihilation.github.io/gghighlight/) enables the intuitive highlighting of ggplot2 plots. `gghighlight` modifies existing ggplot2 objects, so no other code should change. All of the highlighting is handled by the function `gghighlight()`, which works with all types of geoms.\n\n*Warning:* R will throw an error if too many colors are highlighted because of the design of `urbnthemes`. 
Simply decrease the number of highlighted geoms to solve this issue.\n\nThere are two main ways to highlight.\n\n### Threshold\n\nThe first way to highlight is with a threshold. Add a logical test to `gghighlight()` to describe which lines should be highlighted. Here, lines with maximum change in per-capita Gross Domestic Product greater than \\$35,000 are highlighted by `gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE)`.\n\n```{r gghighlight-threshold}\nlibrary(gghighlight)\nlibrary(gapminder)\n\ndata <- gapminder %>%\n filter(continent %in% c(\"Europe\")) %>%\n group_by(country) %>%\n mutate(pcgpd_change = ifelse(year == 1952, 0, gdpPercap - lag(gdpPercap))) %>%\n mutate(pcgpd_change = cumsum(pcgpd_change))\n \ndata %>%\n ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\")\n```\n\n### Rank\n\nThe second way to highlight is by rank. Here, the five countries with the highest values for change in per-capita Gross Domestic Product are highlighted with `gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE)`.\n\n```{r gghighlight-rank}\ndata %>%\n ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\")\n```\n\n### Faceting\n\n`gghighlight()` works well with ggplot2's faceting system.\n\n```{r gghighlight-faceting}\ndata %>%\n ggplot(aes(year, pcgpd_change, group = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change), max_highlight = 4, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\") +\n facet_wrap(~ country) +\n theme(panel.spacing = unit(20L, \"pt\"))\n```\n\n## Text and Annotation\n\n------------------------------------------------------------------------\n\nSeveral functions can be used to annotate, label, and highlight different parts of plots. `geom_text()` and `geom_text_repel()` both display variables from data frames. `annotate()`, which has several different uses, displays variables and values included in the function call.\n\n### geom_text()\n\n`geom_text()` turns text variables in data sets into geometric objects. This is useful for labeling data in plots. 
Both functions need `x` values and `y` values to determine placement on the coordinate plane, and a text vector of labels.\n\nThis can be used to label `geom_bar()`.\n\n```{r bar-geom_text}\ndiamonds %>%\n group_by(cut) %>%\n summarize(price = mean(price)) %>%\n ggplot(aes(cut, price)) +\n geom_bar(stat = \"identity\") +\n geom_text(aes(label = scales::dollar(price)), vjust = -1) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2)),\n \t\t\t\t\t\t\t\t\t labels = scales::dollar) +\n labs(title = \"Average Diamond Price by Diamond Cut\",\n x = \"Cut\",\n y = \"Price\") +\n remove_ticks()\n```\n\nIt can also be used to label points in a scatter plot.\n\nIt's rarely useful to label every point in a scatter plot. Use `filter()` to create a second, subsetted data set and pass it into the labelling function.\n\n```{r scatterplot-geom_text}\nlabels <- mtcars %>%\n\trownames_to_column(\"model\") %>%\n\tfilter(model %in% c(\"Toyota Corolla\", \"Merc 240D\", \"Datsun 710\"))\n\nmtcars %>%\n\tggplot() +\n\tgeom_point(mapping = aes(x = wt, y = mpg)) +\n\tgeom_text(data = labels, mapping = aes(x = wt, y = mpg, label = model), nudge_x = 0.38) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 6)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 40)) + \n labs(x = \"Weight (thousands of pounds)\",\n y = \"Miles per gallon (MPG)\") +\n scatter_grid()\n```\n\nText too often overlaps with other text or geoms when using `geom_text()`. `library(ggrepel)` is a `library(ggplot2)` add-on that automatically positions text so it doesn't overlap with geoms or other text. To add this functionality, install and load `library(ggrepel)` and then use `geom_text_repel()` with the same syntax as `geom_text()`.\n\n### geom_text_repel()\n\n```{r scatterplot-geom_text_repel}\nlibrary(ggrepel)\n\nlabels <- mtcars %>%\n\trownames_to_column(\"model\") %>%\n\ttop_n(5, mpg)\n\nmtcars %>%\n\tggplot(mapping = aes(x = wt, y = mpg)) +\n\tgeom_point() +\n\tgeom_text_repel(data = labels, \n\t mapping = aes(label = model), \n\t nudge_x = 0.38) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 6)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 40)) + \n labs(x = \"Weight (thousands of pounds)\",\n y = \"Miles per gallon (MPG)\") +\n scatter_grid()\n```\n\n### annotate()\n\n`annotate()` doesn't use data frames. Instead, it takes values for `x =` and `y =`. 
### annotate()\n\n`annotate()` doesn't use data frames. Instead, it takes values for `x =` and `y =`. It can add text, rectangles, segments, and pointranges.\n\n```{r annotate-point}\nmsleep %>%\n filter(bodywt <= 1000) %>%\n ggplot(aes(bodywt, sleep_total)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(-10, 1000),\n \t\t\t\t\t\t\t\t\t labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 25)) + \n annotate(\"text\", x = 500, y = 12, label = \"These data suggest that heavy \\n animals sleep less than light animals\") +\n labs(x = \"Body weight (kilograms)\",\n y = \"Sleep time (hours)\") +\n scatter_grid() \n```\n\n```{r annotate-rect}\nlibrary(AmesHousing)\n\names <- make_ames()\n\names %>%\n mutate(square_footage = Total_Bsmt_SF - Bsmt_Unf_SF + First_Flr_SF + Second_Flr_SF) %>%\n mutate(Sale_Price = Sale_Price / 1000) %>% \n ggplot(aes(square_footage, Sale_Price)) +\n geom_point(alpha = 0.2) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(-10, 12000),\n \t\t\t\t\t\t\t\t\t labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 800),\n \t\t\t\t\t\t\t\t\t labels = scales::dollar) + \n annotate(\"rect\", xmin = 6800, xmax = 11500, ymin = 145, ymax = 210, alpha = 0.1) +\n annotate(\"text\", x = 8750, y = 230, label = \"Unfinished homes\") +\n labs(x = \"Square footage\", \n y = \"Sale price (thousands)\") +\n scatter_grid() \n```\n\n
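The examples above annotate with text and a rectangle. For completeness, here is a minimal sketch of the segment and pointrange annotations; the coordinates are illustrative and aren't drawn from the data:\n\n```{r annotate-segment}\n# a segment and a pointrange at hand-picked coordinates\nmsleep %>%\n filter(bodywt <= 1000) %>%\n ggplot(aes(bodywt, sleep_total)) +\n geom_point() +\n annotate(\"segment\", x = 600, xend = 750, y = 18, yend = 14) +\n annotate(\"pointrange\", x = 250, y = 10, ymin = 7, ymax = 13) +\n labs(x = \"Body weight (kilograms)\",\n y = \"Sleep time (hours)\") +\n scatter_grid()\n```\n\n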
## Layered Geoms\n\n------------------------------------------------------------------------\n\nGeoms can be layered in `ggplot2`. This is useful for design and analysis.\n\nIt is often useful to add points to line plots with a small number of values across the x-axis. This example from [R for Data Science](http://r4ds.had.co.nz/tidy-data.html) shows how changing the lines to grey can improve a plot's design.\n\n### Design {.tabset}\n\n#### Before\n\n```{r layering-geoms-design}\ntable1 %>%\n\tggplot(aes(x = year, y = cases)) +\n\t\tgeom_line(aes(color = country)) +\n\t\tgeom_point(aes(color = country)) +\n\t\tscale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n\t\t labels = scales::comma) +\n\t\tscale_x_continuous(breaks = c(1999, 2000)) +\n\t\tlabs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n```\n\n#### After\n\n```{r layering-geoms-design-gray}\ntable1 %>%\n\tggplot(aes(year, cases)) +\n\t\tgeom_line(aes(group = country), color = \"grey50\") +\n\t\tgeom_point(aes(color = country)) +\n\t\tscale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n\t\t labels = scales::comma) +\n\t\tscale_x_continuous(breaks = c(1999, 2000)) +\n\t\tlabs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n```\n\n### Centroids\n\n```{r centroids}\nmpg_summary <- mpg %>%\n\tgroup_by(cyl) %>%\n\tsummarize(displ = mean(displ), cty = mean(cty))\n\nmpg %>%\n\tggplot() +\n\tgeom_point(aes(x = displ, y = cty, color = factor(cyl)), alpha = 0.5) +\n\tgeom_point(data = mpg_summary, aes(x = displ, y = cty), size = 5, color = \"#ec008b\") +\n\tgeom_text(data = mpg_summary, aes(x = displ, y = cty, label = cyl)) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 8)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0)), \n limits = c(0, 40)) +\n\tlabs(x = \"Displacement\",\n\t y = \"City MPG\") +\n scatter_grid()\n```\n\n## Saving Plots\n\n------------------------------------------------------------------------\n\n`ggsave()` exports ggplot2 plots. The function can be used in two ways. First, if `plot =` isn't specified in the function call, then `ggsave()` automatically saves the plot that was last displayed in the Viewer window. Second, if `plot =` is specified, then `ggsave()` saves the specified plot. `ggsave()` guesses the type of graphics device to use for the export (.png, .pdf, .svg, etc.) from the file extension in the filename.\n\n``` \nmtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.png\")\n\nplot2 <- mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.png\", plot = plot2)\n```\n\nExported plots rarely look identical to the plots that show up in the Viewer window in RStudio because the overall size and aspect ratio of the Viewer are often different from the defaults for `ggsave()`. Specific sizes, aspect ratios, and resolutions can be controlled with arguments in `ggsave()`. RStudio has a useful [cheatsheet](https://www.rstudio.com/resources/cheatsheets/) called \"How Big is Your Graph?\" that should help with choosing the best size, aspect ratio, and resolution.\n\nFonts are not embedded in PDFs by default. To embed fonts in PDFs, include `device = cairo_pdf` in `ggsave()`.\n\n``` \nplot <- mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.pdf\", plot = plot, width = 6.5, height = 4, device = cairo_pdf)\n```\n\n## Interactive Plots\n\nWe can make any of the previous plots interactive with the powerful and easy-to-use `plotly` library. All we have to do is wrap a ggplot object in the `ggplotly()` function. *Note:* `ggplotly()` can't be added to the end of a ggplot2 pipeline with `+`; save the ggplot as a variable and then wrap that variable in the function call, as shown below.\n\nYou can customize the tooltip text by adding a value to `text` in `aes()` and then specifying `tooltip = \"text\"` in the `ggplotly()` call.\n\n```{r}\nlibrary(plotly)\n\nstock_plot <- as_tibble(EuStockMarkets) %>% \n mutate(date = time(EuStockMarkets)) %>% \n gather(key = \"key\", value = \"value\", -date) %>% \n ggplot(mapping = aes(x = date, y = value, color = key,\n \t\t\t\t\t\t\t\t\t\t # sometimes ggplotly messes with line charts,\n \t\t\t\t\t\t\t\t\t\t # adding a group value usually helps with that\n \t\t\t\t\t\t\t\t\t\t group = key,\n \t\t\t\t\t\t\t\t\t\t # customize the tooltip with the text aes\n \t\t\t\t\t\t\t\t\t\t text = paste0(\"Value: \", round(value, 2), \" \",\n \t\t\t\t\t\t\t\t\t\t \t\t\t\t\t\t\t\"Date: \", round(date, 3), \" \",\n \t\t\t\t\t\t\t\t\t\t \t\t\t\t\t\t\t\"Key: \", key))\n \t\t\t\t\t\t\t\t\t\t ) +\n geom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(1991, 1999), \n breaks = c(1991, 1993, 1995, 1997, 1999)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 2500,\n labels = scales::dollar, \n limits = c(0, 10000)) + \n labs(x = \"Date\",\n y = \"Value\")\n\n# make interactive with ggplotly\n# uncomment the pipe to hide the interactive toolbar in the top right\nggplotly(stock_plot, tooltip = \"text\") # %>% config(displayModeBar = FALSE)\n```\n\n
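`ggsave()` only works for static plots. To save an interactive chart as a standalone HTML file, one option is `htmlwidgets::saveWidget()`. This is a minimal sketch; it assumes the `htmlwidgets` package (a dependency of `plotly`) is available, and the output filename is illustrative:\n\n``` \nlibrary(htmlwidgets)\n\n# write the interactive chart to a self-contained HTML file\nsaveWidget(ggplotly(stock_plot, tooltip = \"text\"),\n file = \"stocks.html\",\n selfcontained = TRUE)\n```\n\n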
## urbnthemes\n\n### Overview\n\n`urbnthemes` is a set of tools for creating Urban Institute-themed plots and maps in R. The package extends `ggplot2` with print and map themes as well as tools that make plotting easier at the Urban Institute. `urbnthemes` replaces the [urban_R_theme](https://github.com/UrbanInstitute/urban_R_theme).\n\nAlways load `library(urbnthemes)` after `library(ggplot2)` or `library(tidyverse)`.\n\n### Usage\n\nUse `set_urbn_defaults(style = \"print\")` to set the default styles. `scatter_grid()`, `remove_ticks()`, `add_axis()`, and `remove_axis()` can all be used to improve graphics.\n\n```{r example, message=FALSE}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nggplot(data = mtcars, mapping = aes(factor(cyl))) +\n geom_bar() + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Number of Cylinders\",\n y = \"Count\") +\n remove_ticks()\n```\n\n### Combining elements\n\n`library(urbnthemes)` contains functions for combining plot elements into graphics. `urbn_plot()` brings all of the elements together.\n\n- `urbn_logo_text()`\n- `remove_ticks()`\n- `remove_axis()`\n- `scatter_grid()`\n- `add_axis()`\n- `urbn_geofacet`\n\n```{r example2}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nplot <- ggplot(data = mtcars, mapping = aes(factor(cyl))) +\n geom_bar() + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Number of Cylinders\",\n y = \"Count\") +\n remove_ticks()\n\nurbn_plot(plot, urbn_logo_text(), ncol = 1, heights = c(30, 1))\n```\n\nSometimes it's important to add the y-axis title horizontally above the plot. `urbn_y_title()` can be used for this task. The following example goes one step further and adds the title between the legend and the plot.\n\n```{r}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults()\n\nplot <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = factor(cyl))) +\n geom_point() + \n\tscale_x_continuous(expand = c(0, 0),\n\t\t\t\t\t\t\t\t\t\t limits = c(0, 8)) +\n scale_y_continuous(expand = c(0, 0),\n \t\t\t\t\t\t\t\t\t limits = c(0, 40)) +\n remove_ticks() +\n\tlabs(\"\") +\n\tscatter_grid()\n\nurbn_plot(get_legend(plot),\n\t\t\t\t\turbn_y_title(\"Miles per gallon\"),\n\t\t\t\t\tremove_legend(plot), \n\t\t\t\t\turbn_logo_text(), \n\t\t\t\t\tncol = 1, \n\t\t\t\t\theights = c(3, 1, 30, 1))\n```\n\n### Palettes\n\n`urbnthemes` contains many quick-access color palettes from the [Urban Institute Data Visualization Style Guide](http://urbaninstitute.github.io/graphics-styleguide/). These palettes can be used to quickly overwrite default color palettes from `urbnthemes`.\n\n- `palette_urbn_main` is the eight-color discrete palette of the Urban Institute with cyan, yellow, black, gray, magenta, green, space gray, and red.\n- `palette_urbn_diverging` is an eight-color diverging palette.\n- `palette_urbn_quintile` is a five-color blue palette that is good for quintiles.\n- `palette_urbn_politics` is a two-color palette with blue for Democrats and red for Republicans.\n\nThere are seven palettes that are continuous palettes of the seven unique colors in the discrete Urban Institute color palette:\n\n- `palette_urbn_cyan`\n- `palette_urbn_gray`\n- `palette_urbn_yellow`\n- `palette_urbn_magenta`\n- `palette_urbn_green`\n- `palette_urbn_spacegray`\n- `palette_urbn_red`\n\nUse `view_palette()` to see a palette:\n\n```{r view-palette}\nview_palette(palette_urbn_magenta)\n```\n\nThe vectors can be subset using base R syntax. This allows for the quick selection of specific colors from a palette.\n\n```{r palette-subset1}\npalette_urbn_main[1:4]\n```\n\n```{r palette-subset2}\npalette_urbn_spacegray[1:5]\n```\n\n
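Subsetted palettes can be passed directly to `ggplot2`'s manual scales. This is a minimal sketch, assuming the palette objects are character vectors of hex colors (which is how `view_palette()` displays them):\n\n```{r palette-in-plot}\n# use the first three colors of the main palette for a three-level factor\nmtcars %>%\n mutate(cyl = factor(cyl)) %>%\n count(cyl) %>%\n ggplot(aes(x = cyl, y = n, fill = cyl)) +\n geom_col() +\n scale_fill_manual(values = palette_urbn_main[1:3]) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) +\n remove_ticks()\n```\n\n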
### Utility functions\n\n`library(urbnthemes)` contains four functions that are helpful for managing font installations:\n\n- `lato_test()`\n- `lato_install()`\n- `fontawesome_test()`\n- `fontawesome_install()`\n\n## Bibliography and Session Information\n\n------------------------------------------------------------------------\n\n*Note:* Examples presented in [this document](https://awunderground.github.io/ggplot2-themes/) by Aaron Williams were created during personal time.\n\nBob Rudis and Dave Gandy (2017). waffle: Create Waffle Chart Visualizations in R. R package version 0.7.0. https://CRAN.R-project.org/package=waffle\n\nChester Ismay and Jennifer Chunn (2017). fivethirtyeight: Data and Code Behind the Stories and Interactives at 'FiveThirtyEight'. R package version 0.3.0. https://CRAN.R-project.org/package=fivethirtyeight\n\nHadley Wickham (2009). ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York.\n\nHadley Wickham (2017). tidyverse: Easily Install and Load the 'Tidyverse'. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\n\nHadley Wickham (2017). forcats: Tools for Working with Categorical Variables (Factors). R package version 0.2.0. https://CRAN.R-project.org/package=forcats\n\nJennifer Bryan (2017). gapminder: Data from Gapminder. R package version 0.3.0. https://CRAN.R-project.org/package=gapminder\n\nKamil Slowikowski (2017). ggrepel: Repulsive Text and Label Geoms for 'ggplot2'. R package version 0.7.0. https://CRAN.R-project.org/package=ggrepel\n\nMax Kuhn (2017). AmesHousing: The Ames Iowa Housing Data. R package version 0.0.3. https://CRAN.R-project.org/package=AmesHousing\n\nPeter Kampstra (2008). Beanplot: A Boxplot Alternative for Visual Comparison of Distributions. Journal of Statistical Software. https://www.jstatsoft.org/article/view/v028c01\n\nR Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. https://www.R-project.org/\n\nWinston Chang (2014). extrafont: Tools for using fonts. R package version 0.17. https://CRAN.R-project.org/package=extrafont\n\nYihui Xie (2018). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.19.\n\n```{r System Info and Package Versioning}\nsessionInfo()\n```\n
Simply decrease the number of highlighted geoms to solve this issue.\n\nThere are two main ways to highlight.\n\n### Threshold\n\nThe first way to highlight is with a threshold. Add a logical test to `gghighlight()` to describe which lines should be highlighted. Here, lines with maximum change in per-capita Gross Domestic Product greater than \\$35,000 are highlighted by `gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE)`.\n\n```{r gghighlight-threshold}\nlibrary(gghighlight)\nlibrary(gapminder)\n\ndata <- gapminder %>%\n filter(continent %in% c(\"Europe\")) %>%\n group_by(country) %>%\n mutate(pcgpd_change = ifelse(year == 1952, 0, gdpPercap - lag(gdpPercap))) %>%\n mutate(pcgpd_change = cumsum(pcgpd_change))\n \ndata %>%\n ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\")\n```\n\n### Rank\n\nThe second way to highlight is by rank. Here, the countries with the first highest values for change in per-capita Gross Domestic Product are highlighted with `gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE)`.\n\n```{r gghighlight-rank}\ndata %>%\n ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\")\n```\n\n### Faceting\n\n`gghighlight()` works well with ggplot2's faceting system.\n\n```{r gghighlight-faceting}\ndata %>%\n ggplot(aes(year, pcgpd_change, group = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change), max_highlight = 4, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\") +\n facet_wrap(~ country) +\n theme(panel.spacing = unit(20L, \"pt\"))\n```\n\n## Text and Annotation\n\n------------------------------------------------------------------------\n\nSeveral functions can be used to annotate, label, and highlight different parts of plots. `geom_text()` and `geom_text_repel()` both display variables from data frames. `annotate()`, which has several different uses, displays variables and values included in the function call.\n\n### geom_text()\n\n`geom_text()` turns text variables in data sets into geometric objects. This is useful for labeling data in plots. 
Both functions need `x` values and `y` values to determine placement on the coordinate plane, and a text vector of labels.\n\nThis can be used to label `geom_bar()`.\n\n```{r bar-geom_text}\ndiamonds %>%\n group_by(cut) %>%\n summarize(price = mean(price)) %>%\n ggplot(aes(cut, price)) +\n geom_bar(stat = \"identity\") +\n geom_text(aes(label = scales::dollar(price)), vjust = -1) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2)),\n \t\t\t\t\t\t\t\t\t labels = scales::dollar) +\n labs(title = \"Average Diamond Price by Diamond Cut\",\n x = \"Cut\",\n y = \"Price\") +\n remove_ticks()\n```\n\nIt can also be used to label points in a scatter plot.\n\nIt's rarely useful to label every point in a scatter plot. Use `filter()` to create a second data set that is subsetted and pass it into the labelling function.\n\n```{r scatterplot-geom_text}\nlabels <- mtcars %>%\n\trownames_to_column(\"model\") %>%\n\tfilter(model %in% c(\"Toyota Corolla\", \"Merc 240D\", \"Datsun 710\"))\n\nmtcars %>%\n\tggplot() +\n\tgeom_point(mapping = aes(x = wt, y = mpg)) +\n\tgeom_text(data = labels, mapping = aes(x = wt, y = mpg, label = model), nudge_x = 0.38) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 6)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 40)) + \n labs(x = \"Weight (Tons)\",\n y = \"Miles per gallon (MPG)\") +\n scatter_grid()\n```\n\nText too often overlaps with other text or geoms when using `geom_text()`. `library(ggrepel)` is a `library(ggplot2)` add-on that automatically positions text so it doesn't overlap with geoms or other text. To add this functionality, install and load `library(ggrepel)` and then use `geom_text_repel()` with the same syntax as `geom_text()`.\n\n### geom_text_repel()\n\n```{r scatterplot-geom_text_repel}\nlibrary(ggrepel)\n\nlabels <- mtcars %>%\n\trownames_to_column(\"model\") %>%\n\ttop_n(5, mpg)\n\nmtcars %>%\n\tggplot(mapping = aes(x = wt, y = mpg)) +\n\tgeom_point() +\n\tgeom_text_repel(data = labels, \n\t mapping = aes(label = model), \n\t nudge_x = 0.38) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 6)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 40)) + \n labs(x = \"Weight (Tons)\",\n y = \"Miles per gallon (MPG)\") +\n scatter_grid()\n```\n\n### annotate()\n\n`annotate()` doesn't use data frames. Instead, it takes values for `x =` and `y =`. 
It can add text, rectangles, segments, and point ranges.\n\n```{r annotate-point}\nmsleep %>%\n filter(bodywt <= 1000) %>%\n ggplot(aes(bodywt, sleep_total)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(-10, 1000),\n \t\t\t\t\t\t\t\t\t labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 25)) + \n annotate(\"text\", x = 500, y = 12, label = \"These data suggest that heavy \\n animals sleep less than light animals\") +\n labs(x = \"Body weight (kilograms)\",\n y = \"Sleep time (hours)\") +\n scatter_grid() \n```\n\n```{r annotate-rect}\nlibrary(AmesHousing)\n\names <- make_ames()\n\names %>%\n mutate(square_footage = Total_Bsmt_SF - Bsmt_Unf_SF + First_Flr_SF + Second_Flr_SF) %>%\n mutate(Sale_Price = Sale_Price / 1000) %>% \n ggplot(aes(square_footage, Sale_Price)) +\n geom_point(alpha = 0.2) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(-10, 12000),\n \t\t\t\t\t\t\t\t\t labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 800),\n \t\t\t\t\t\t\t\t\t labels = scales::dollar) + \n annotate(\"rect\", xmin = 6800, xmax = 11500, ymin = 145, ymax = 210, alpha = 0.1) +\n annotate(\"text\", x = 8750, y = 230, label = \"Unfinished homes\") +\n labs(x = \"Square footage\", \n y = \"Sale price (thousands)\") +\n scatter_grid() \n```\n\n## Layered Geoms\n\n------------------------------------------------------------------------\n\nGeoms can be layered in `ggplot2`. This is useful for design and analysis.\n\nIt is often useful to add points to line plots with a small number of values across the x-axis. This example from [R for Data Science](http://r4ds.had.co.nz/tidy-data.html) shows how changing the lines to grey can be appealing.\n\n### Design {.tabset}\n\n#### Before\n\n```{r layering-geoms-design}\ntable1 %>%\n\tggplot(aes(x = year, y = cases)) +\n\t\tgeom_line(aes(color = country)) +\n\t\tgeom_point(aes(color = country)) +\n\t\tscale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n\t\t labels = scales::comma) +\n\t\tscale_x_continuous(breaks = c(1999, 2000)) +\n\t\tlabs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n```\n\n#### After\n\n```{r layering-geoms-design-gray}\ntable1 %>%\n\tggplot(aes(year, cases)) +\n\t\tgeom_line(aes(group = country), color = \"grey50\") +\n\t\tgeom_point(aes(color = country)) +\n\t\tscale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n\t\t labels = scales::comma) +\n\t\tscale_x_continuous(breaks = c(1999, 2000)) +\n\t\tlabs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n```\n\n### Centroids\n\n```{r centroids}\nmpg_summary <- mpg %>%\n\tgroup_by(cyl) %>%\n\tsummarize(displ = mean(displ), cty = mean(cty))\n\nmpg %>%\n\tggplot() +\n\tgeom_point(aes(x = displ, y = cty, color = factor(cyl)), alpha = 0.5) +\n\tgeom_point(data = mpg_summary, aes(x = displ, y = cty), size = 5, color = \"#ec008b\") +\n\tgeom_text(data = mpg_summary, aes(x = displ, y = cty, label = cyl)) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 8)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0)), \n limits = c(0, 40)) +\n\tlabs(x = \"Displacement\",\n\t y = \"City MPG\") +\n scatter_grid()\n```\n\n## Saving Plots\n\n------------------------------------------------------------------------\n\n`ggsave()` exports ggplot2 plots. The function can be used in two ways. 
First, if `plot =` isn't specified in the function call, then `ggsave()` automatically saves the plot that was last displayed in the Viewer window. Second, if `plot =` is specified, then `ggsave()` saves the specified plot. `ggsave()` guesses the type of graphics device to use in export (.png, .pdf, .svg, etc.) from the file extension in the filename.\n\n``` \nmtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.png\")\n\nplot2 <- mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.png\", plot = plot2)\n```\n\nExported plots rarely look identical to the plots that show up in the Viewer window in RStudio because the overall size and aspect ratio of the Viewer are often different from the defaults for `ggsave()`. Specific sizes, aspect ratios, and resolutions can be controlled with arguments in `ggsave()`. RStudio has a useful [cheatsheet](https://www.rstudio.com/resources/cheatsheets/) called \"How Big is Your Graph?\" that should help with choosing the best size, aspect ratio, and resolution.\n\nFonts are not embedded in PDFs by default. To embed fonts in PDFs, include `device = cairo_pdf` in `ggsave()`.\n\n``` \nplot <- mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.pdf\", plot = plot, width = 6.5, height = 4, device = cairo_pdf)\n```\n\n## Interactive Plots\n\nWe can make any of the previous plots interactive with the powerful and easy `plotly` library. All we have to do is wrap a ggplot object in the `ggplotly()` function. *Note:* You can't add `ggplotly()` to the end of a ggplot call; save the ggplot as a variable and then wrap that variable in the function call, as shown below.\n\nYou can customize the tooltip text by adding a value to `text` in `aes()` and then specifying `tooltip = \"text\"` in the `ggplotly` call.\n\n```{r}\nlibrary(plotly)\n\nstock_plot <- as_tibble(EuStockMarkets) %>% \n mutate(date = time(EuStockMarkets)) %>% \n gather(key = \"key\", value = \"value\", -date) %>% \n ggplot(mapping = aes(x = date, y = value, color = key,\n \t\t\t\t\t\t\t\t\t\t # sometimes ggplotly messes with line charts,\n \t\t\t\t\t\t\t\t\t\t # adding a group value usually helps with that\n \t\t\t\t\t\t\t\t\t\t group = key,\n \t\t\t\t\t\t\t\t\t\t # customize the tooltip with the text aes\n \t\t\t\t\t\t\t\t\t\t text = paste0(\"Value: \", round(value, 2), \" \",\n \t\t\t\t\t\t\t\t\t\t \t\t\t\t\t\t\t\"Date: \", round(date, 3), \" \",\n \t\t\t\t\t\t\t\t\t\t \t\t\t\t\t\t\t\"Key: \", key))\n \t\t\t\t\t\t\t\t\t\t ) +\n geom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(1991, 1999), \n breaks = c(1991, 1993, 1995, 1997, 1999)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 2500,\n labels = scales::dollar, \n limits = c(0, 10000)) + \n labs(x = \"Date\",\n y = \"Value\")\n\n# make interactive with ggplotly\n# uncomment the pipe to hide the interactive toolbar in the top right \nggplotly(stock_plot, tooltip = \"text\") # %>% config(displayModeBar = FALSE)\n```\n\n## urbnthemes\n\n### Overview\n\n`urbnthemes` is a set of tools for creating Urban Institute-themed plots and maps in R. The package extends `ggplot2` with print and map themes as well as tools that make plotting easier at the Urban Institute. 
`urbnthemes` replaces the [urban_R_theme](https://github.com/UrbanInstitute/urban_R_theme).\n\nAlways load `library(urbnthemes)` after `library(ggplot2)` or `library(tidyverse)`.\n\n### Usage\n\nUse `set_urbn_defaults(style = \"print\")` to set the default styles. `scatter_grid()`, `remove_ticks()`, `add_axis()`, and `remove_axis()` can all be used to improve graphics.\n\n```{r example, message=FALSE}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nggplot(data = mtcars, mapping = aes(factor(cyl))) +\n geom_bar() + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Number of Cylinders\",\n y = \"Count\") +\n remove_ticks()\n```\n\n### Combining elements\n\n`library(urbnthemes)` contains functions for combining plot elements into graphics. `urbn_plot()` brings all of the elements together.\n\n- `urbn_logo_text()`\n- `remove_ticks()`\n- `remove_axis()`\n- `scatter_grid()`\n- `add_axis()`\n- `urbn_geofacet`\n\n```{r example2}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nplot <- ggplot(data = mtcars, mapping = aes(factor(cyl))) +\n geom_bar() + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Number of Cylinders\",\n y = \"Count\") +\n remove_ticks()\n\nurbn_plot(plot, urbn_logo_text(), ncol = 1, heights = c(30, 1))\n```\n\nSometimes it's important to add the y-axis title horizontally above the plot. `urbn_y_title()` can be used for this task. The following example goes one step further and adds the title between the legend and the plot.\n\n```{r}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults()\n\nplot <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = factor(cyl))) +\n geom_point() + \n\tscale_x_continuous(expand = c(0, 0),\n\t\t\t\t\t\t\t\t\t\t limits = c(0, 8)) +\n scale_y_continuous(expand = c(0, 0),\n \t\t\t\t\t\t\t\t\t limits = c(0, 40)) +\n remove_ticks() +\n\tlabs(\"\") +\n\tscatter_grid()\n\nurbn_plot(get_legend(plot),\n\t\t\t\t\turbn_y_title(\"Miles per gallon\"),\n\t\t\t\t\tremove_legend(plot), \n\t\t\t\t\turbn_logo_text(), \n\t\t\t\t\tncol = 1, \n\t\t\t\t\theights = c(3, 1, 30, 1))\n```\n\n### Palettes\n\n`urbnthemes` contains many quick-access color palettes from the [Urban Institute Data Visualization Style Guide](http://urbaninstitute.github.io/graphics-styleguide/). These palettes can be used to quickly overwrite default color palettes from `urbnthemes`.\n\n- `palette_urbn_main` is the eight-color discrete palette of the Urban Institute with cyan, yellow, black, gray, magenta, green, space gray, and red.\n- `palette_urbn_diverging` is an eight-color diverging palette.\n- `palette_urbn_quintile` is a five-color blue palette that is good for quintiles.\n- `palette_urbn_politics` is a two-color palette with blue for Democrats and red for Republicans.\n\nThere are also seven continuous palettes, one for each unique color in the discrete Urban Institute color palette:\n\n- `palette_urbn_cyan`\n- `palette_urbn_gray`\n- `palette_urbn_yellow`\n- `palette_urbn_magenta`\n- `palette_urbn_green`\n- `palette_urbn_spacegray`\n- `palette_urbn_red`\n\nUse `view_palette()` to see the palette:\n\n```{r view-palette}\nview_palette(palette_urbn_magenta)\n```\n\nThe vectors can be subset using base R syntax. 
This allows for the quick selection of specific colors from a palette.\n\n```{r palette-subset1}\npalette_urbn_main[1:4]\n```\n\n```{r palette-subset2}\npalette_urbn_spacegray[1:5]\n```\n\n### Utility functions\n\n`library(urbnthemes)` contains four functions that are helpful for managing font installations:\n\n- `lato_test()`\n- `lato_install()`\n- `fontawesome_test()`\n- `fontawesome_install()`\n\n## Bibliography and Session Information\n\n------------------------------------------------------------------------\n\n*Note:* Examples present in [this document](https://awunderground.github.io/ggplot2-themes/) by Aaron Williams were created during personal time.\n\nBob Rudis and Dave Gandy (2017). waffle: Create Waffle Chart Visualizations in R. R package version 0.7.0. https://CRAN.R-project.org/package=waffle\n\nChester Ismay and Jennifer Chunn (2017). fivethirtyeight: Data and Code Behind the Stories and Interactives at 'FiveThirtyEight'. R package version 0.3.0. https://CRAN.R-project.org/package=fivethirtyeight\n\nHadley Wickham (2009). ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York.\n\nHadley Wickham (2017). tidyverse: Easily Install and Load the 'Tidyverse'. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\n\nHadley Wickham (2017). forcats: Tools for Working with Categorical Variables (Factors). R package version 0.2.0. https://CRAN.R-project.org/package=forcats\n\nJennifer Bryan (2017). gapminder: Data from Gapminder. R package version 0.3.0. https://CRAN.R-project.org/package=gapminder\n\nKamil Slowikowski (2017). ggrepel: Repulsive Text and Label Geoms for 'ggplot2'. R package version 0.7.0. https://CRAN.R-project.org/package=ggrepel\n\nMax Kuhn (2017). AmesHousing: The Ames Iowa Housing Data. R package version 0.0.3. https://CRAN.R-project.org/package=AmesHousing\n\nPeter Kampstra (2008). Beanplot: A Boxplot Alternative for Visual Comparison of Distributions. Journal of Statistical Software. https://www.jstatsoft.org/article/view/v028c01\n\nR Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. https://www.R-project.org/\n\nWinston Chang (2014). extrafont: Tools for using fonts. R package version 0.17. https://CRAN.R-project.org/package=extrafont\n\nYihui Xie (2018). knitr: A General-Purpose Package for Dynamic Report Generation in R. 
R package version 1.19.\n\n```{r System Info and Package Versioning}\nsessionInfo()\n```\n"},"formats":{"html":{"identifier":{"display-name":"HTML","target-format":"html","base-format":"html"},"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"inline-includes":false,"preserve-yaml":false,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[],"notebook-links":true,"format-links":true},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"graphics-guide.html"},"language":{"toc-title-document":"Table of contents","toc-title-website":"On this page","related-formats-title":"Other Formats","related-notebooks-title":"Notebooks","source-notebooks-prefix":"Source","section-title-abstract":"Abstract","section-title-appendices":"Appendices","section-title-footnotes":"Footnotes","section-title-references":"References","section-title-reuse":"Reuse","section-title-copyright":"Copyright","section-title-citation":"Citation","appendix-attribution-cite-as":"For attribution, please cite this work as:","appendix-attribution-bibtex":"BibTeX citation:","title-block-author-single":"Author","title-block-author-plural":"Authors","title-block-affiliation-single":"Affiliation","title-block-affiliation-plural":"Affiliations","title-block-published":"Published","title-block-modified":"Modified","callout-tip-title":"Tip","callout-note-title":"Note","callout-warning-title":"Warning","callout-important-title":"Important","callout-caution-title":"Caution","code-summary":"Code","code-tools-menu-caption":"Code","code-tools-show-all-code":"Show All Code","code-tools-hide-all-code":"Hide All Code","code-tools-view-source":"View Source","code-tools-source-code":"Source Code","code-line":"Line","code-lines":"Lines","copy-button-tooltip":"Copy to Clipboard","copy-button-tooltip-success":"Copied!","repo-action-links-edit":"Edit this page","repo-action-links-source":"View source","repo-action-links-issue":"Report an issue","back-to-top":"Back to top","search-no-results-text":"No results","search-matching-documents-text":"matching documents","search-copy-link-title":"Copy link to search","search-hide-matches-text":"Hide additional matches","search-more-match-text":"more match in this document","search-more-matches-text":"more matches in this 
document","search-clear-button-title":"Clear","search-detached-cancel-button-title":"Cancel","search-submit-button-title":"Submit","search-label":"Search","toggle-section":"Toggle section","toggle-sidebar":"Toggle sidebar navigation","toggle-dark-mode":"Toggle dark mode","toggle-reader-mode":"Toggle reader mode","toggle-navigation":"Toggle navigation","crossref-fig-title":"Figure","crossref-tbl-title":"Table","crossref-lst-title":"Listing","crossref-thm-title":"Theorem","crossref-lem-title":"Lemma","crossref-cor-title":"Corollary","crossref-prp-title":"Proposition","crossref-cnj-title":"Conjecture","crossref-def-title":"Definition","crossref-exm-title":"Example","crossref-exr-title":"Exercise","crossref-ch-prefix":"Chapter","crossref-apx-prefix":"Appendix","crossref-sec-prefix":"Section","crossref-eq-prefix":"Equation","crossref-lof-title":"List of Figures","crossref-lot-title":"List of Tables","crossref-lol-title":"List of Listings","environment-proof-title":"Proof","environment-remark-title":"Remark","environment-solution-title":"Solution","listing-page-order-by":"Order By","listing-page-order-by-default":"Default","listing-page-order-by-date-asc":"Oldest","listing-page-order-by-date-desc":"Newest","listing-page-order-by-number-desc":"High to Low","listing-page-order-by-number-asc":"Low to High","listing-page-field-date":"Date","listing-page-field-title":"Title","listing-page-field-description":"Description","listing-page-field-author":"Author","listing-page-field-filename":"File Name","listing-page-field-filemodified":"Modified","listing-page-field-subtitle":"Subtitle","listing-page-field-readingtime":"Reading Time","listing-page-field-categories":"Categories","listing-page-minutes-compact":"{0} min","listing-page-category-all":"All","listing-page-no-matches":"No matching items"},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.3.433","editor_options":{"chunk_output_type":"console"}},"extensions":{"book":{"multiFile":true}}}},"projectFormats":["html"]}
\ No newline at end of file
diff --git a/.quarto/idx/index.qmd.json b/.quarto/idx/index.qmd.json
index 626c16b..d1d8383 100644
--- a/.quarto/idx/index.qmd.json
+++ b/.quarto/idx/index.qmd.json
@@ -1 +1 @@
-{"title":"R Users Group","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"output_dir":"."}}},"headingText":"R Users Group","containsRefs":false,"markdown":"\n\n\n\n
\n\n
\n\n\nThis website contains resources for using R at the Urban Institute for analysis, visualization, mapping, and more. Click on the links above to get started learning about R!\n\n*The Urban Institute R Users Group is committed to exposing researchers to the joy and power of R; developing beginner, intermediate, and advanced R skills; encouraging and supporting novel applications of R to public policy research; and building a diverse and mutually supportive community of R Users.*\n\n\n\n\n\ngif credits: [Allison Horst](https://twitter.com/allison_horst)\n\n## Sign up for List Serv!\n\nPlease fill out the following form to receive email updates about upcoming RUG events and trainings. We promise not to spam your inbox:\n\n###\n\n###\n\n
\n\n
\n\n
\n\nFill out [this Smartsheet form](https://app.smartsheet.com/b/form/0e9d04ced47b489b8d14971ae6c2fb15) to unsubscribe from the RUG List Serv.\n\n
\n\n## Contact Info\n\nPlease don't hesitate to contact Aaron Williams (awilliams@urban.org) or Amy Rogin (arogin@urban.org) with any thoughts or questions about R at the Urban Institute. \n\n## R Lunch Labs\n\nThe Urban Institute R Users Group hosts weekly lunch labs. R Lunch Labs are hands-on trainings for R users of all skill levels and soon-to-be R users. Each meeting begins with a 5-10 minute quick tip. Afterwards, attendees break into small groups and work on a range of topics including introduction to R, data management and plotting, mapping, and machine learning. Most users bring laptops, but there are a few extras for users without laptops. \n\nWe have currently paused R Lunch Labs, but they will be back soon! If you have an idea for a topic you want to present informally at a lunch lab, please let us know!\n\n"},"formats":{"html":{"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"output_dir":"."}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"markdown"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[]},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"index.html"},"language":{},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.2.269"},"extensions":{"book":{"multiFile":true}}}}}
\ No newline at end of file
+{"title":"R Users Group","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"output_dir":"."}}},"headingText":"R Users Group","containsRefs":false,"markdown":"\n\n\n\n
\n\n
\n\n\nThis website contains resources for using R at the Urban Institute for analysis, visualization, mapping, and more. Click on the links above to get started learning about R!\n\n*The Urban Institute R Users Group is committed to exposing researchers to the joy and power of R; developing beginner, intermediate, and advanced R skills; encouraging and supporting novel applications of R to public policy research; and building a diverse and mutually supportive community of R Users.*\n\n\n\n\n\ngif credits: [Allison Horst](https://twitter.com/allison_horst)\n\n## Sign up for List Serv!\n\nPlease fill out the following form to receive email updates about upcoming RUG events and trainings. We promise not to spam your inbox:\n\n###\n\n###\n\n
\n\n
\n\n
\n\nFill out [this Smartsheet form](https://app.smartsheet.com/b/form/0e9d04ced47b489b8d14971ae6c2fb15) to unsubscribe from the RUG List Serv.\n\n
\n\n## Contact Info\n\nPlease don't hesitate to contact Aaron Williams (awilliams@urban.org) or Amy Rogin (arogin@urban.org) with any thoughts or questions about R at the Urban Institute. \n\n## R Lunch Labs\n\nThe Urban Institute R Users Group hosts weekly lunch labs. R Lunch Labs are hands-on trainings for R users of all skill levels and soon-to-be R users. Each meeting begins with a 5-10 minute quick tip. Afterwards, attendees break into small groups and work on a range of topics including introduction to R, data management and plotting, mapping, and machine learning. Most users bring laptops, but there are a few extras for users without laptops. \n\nWe have currently paused R Lunch Labs, but they will be back soon! If you have an idea for a topic you want to present informally at a lunch lab, please let us know!\n\n","srcMarkdownNoYaml":"\n\n\n\n
\n\n
\n\n## R Users Group\n\nThis website contains resources for using R at the Urban Institute for analysis, visualization, mapping, and more. Click on the links above to get started learning about R!\n\n*The Urban Institute R Users Group is committed to exposing researchers to the joy and power of R; developing beginner, intermediate, and advanced R skills; encouraging and supporting novel applications of R to public policy research; and building a diverse and mutually supportive community of R Users.*\n\n\n\n\n\ngif credits: [Allison Horst](https://twitter.com/allison_horst)\n\n## Sign up for List Serv!\n\nPlease fill out the following form to receive email updates about upcoming RUG events and trainings. We promise not to spam your inbox:\n\n###\n\n###\n\n
\n\n
\n\n
\n\nFill out [this Smartsheet form](https://app.smartsheet.com/b/form/0e9d04ced47b489b8d14971ae6c2fb15) to unsubscribe from the RUG List Serv.\n\n
\n\n## Contact Info\n\nPlease don't hesitate to contact Aaron Williams (awilliams@urban.org) or Amy Rogin (arogin@urban.org) with any thoughts or questions about R at the Urban Institute. \n\n## R Lunch Labs\n\nThe Urban Institute R Users Group hosts weekly lunch labs. R Lunch Labs are hands-on trainings for R users of all skill levels and soon-to-be R users. Each meeting begins with a 5-10 minute quick tip. Afterwards, attendees break into small groups and work on a range of topics including introduction to R, data management and plotting, mapping, and machine learning. Most users bring laptops, but there are a few extras for users without laptops. \n\nWe have currently paused R Lunch Labs, but they will be back soon! If you have an idea for a topic you want to present informally at a lunch lab, please let us know!\n\n"},"formats":{"html":{"identifier":{"display-name":"HTML","target-format":"html","base-format":"html"},"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"output_dir":"."}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"markdown"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"inline-includes":false,"preserve-yaml":false,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[],"notebook-links":true,"format-links":true},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"index.html"},"language":{"toc-title-document":"Table of contents","toc-title-website":"On this page","related-formats-title":"Other Formats","related-notebooks-title":"Notebooks","source-notebooks-prefix":"Source","section-title-abstract":"Abstract","section-title-appendices":"Appendices","section-title-footnotes":"Footnotes","section-title-references":"References","section-title-reuse":"Reuse","section-title-copyright":"Copyright","section-title-citation":"Citation","appendix-attribution-cite-as":"For attribution, please cite this work as:","appendix-attribution-bibtex":"BibTeX citation:","title-block-author-single":"Author","title-block-author-plural":"Authors","title-block-affiliation-single":"Affiliation","title-block-affiliation-plural":"Affiliations","title-block-published":"Published","title-block-modified":"Modified","callout-tip-title":"Tip","callout-note-title":"Note","callout-warning-title":"Warning","callout-important-title":"Important","callout-caution-title":"Caution","code-summary":"Code","code-tools-menu-caption":"Code","code-tools-show-all-code":"Show All Code","code-tools-hide-all-code":"Hide All Code","code-tools-view-source":"View Source","code-tools-source-code":"Source 
Code","code-line":"Line","code-lines":"Lines","copy-button-tooltip":"Copy to Clipboard","copy-button-tooltip-success":"Copied!","repo-action-links-edit":"Edit this page","repo-action-links-source":"View source","repo-action-links-issue":"Report an issue","back-to-top":"Back to top","search-no-results-text":"No results","search-matching-documents-text":"matching documents","search-copy-link-title":"Copy link to search","search-hide-matches-text":"Hide additional matches","search-more-match-text":"more match in this document","search-more-matches-text":"more matches in this document","search-clear-button-title":"Clear","search-detached-cancel-button-title":"Cancel","search-submit-button-title":"Submit","search-label":"Search","toggle-section":"Toggle section","toggle-sidebar":"Toggle sidebar navigation","toggle-dark-mode":"Toggle dark mode","toggle-reader-mode":"Toggle reader mode","toggle-navigation":"Toggle navigation","crossref-fig-title":"Figure","crossref-tbl-title":"Table","crossref-lst-title":"Listing","crossref-thm-title":"Theorem","crossref-lem-title":"Lemma","crossref-cor-title":"Corollary","crossref-prp-title":"Proposition","crossref-cnj-title":"Conjecture","crossref-def-title":"Definition","crossref-exm-title":"Example","crossref-exr-title":"Exercise","crossref-ch-prefix":"Chapter","crossref-apx-prefix":"Appendix","crossref-sec-prefix":"Section","crossref-eq-prefix":"Equation","crossref-lof-title":"List of Figures","crossref-lot-title":"List of Tables","crossref-lol-title":"List of Listings","environment-proof-title":"Proof","environment-remark-title":"Remark","environment-solution-title":"Solution","listing-page-order-by":"Order By","listing-page-order-by-default":"Default","listing-page-order-by-date-asc":"Oldest","listing-page-order-by-date-desc":"Newest","listing-page-order-by-number-desc":"High to Low","listing-page-order-by-number-asc":"Low to High","listing-page-field-date":"Date","listing-page-field-title":"Title","listing-page-field-description":"Description","listing-page-field-author":"Author","listing-page-field-filename":"File Name","listing-page-field-filemodified":"Modified","listing-page-field-subtitle":"Subtitle","listing-page-field-readingtime":"Reading Time","listing-page-field-categories":"Categories","listing-page-minutes-compact":"{0} min","listing-page-category-all":"All","listing-page-no-matches":"No matching items"},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.3.433"},"extensions":{"book":{"multiFile":true}}}},"projectFormats":["html"]}
\ No newline at end of file
diff --git a/.quarto/idx/intro-to-r.qmd.json b/.quarto/idx/intro-to-r.qmd.json
index d30fb0c..c1ba4b1 100644
--- a/.quarto/idx/intro-to-r.qmd.json
+++ b/.quarto/idx/intro-to-r.qmd.json
@@ -1 +1 @@
-{"title":"Introduction","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}}},"headingText":"Introduction","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n```{r markdown setup, include=FALSE}\n\nknitr::opts_chunk$set(fig.path = \"intro-to-r/www/images/\")\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\nknitr::opts_chunk$set(warning = FALSE)\n\noptions(scipen = 999)\n```\n\n\nR is one of two premier programming languages for data science and one of the [fastest growing programming languages.](https://stackoverflow.blog/2017/10/10/impressive-growth-r/) Created by researchers for researchers (with some help from software engineers), R offers rich, intuitive tools that make it perfect for visualization, public policy analysis, econometrics, geospatial analysis, and statistics.\n\nR doesn't come in a box. R was never wrapped in cellophane and it definitely isn't purchased at a store. R's pricelessness and open-source development are two of its greatest strengths, but it can often leave new users without the anchor of the box and booklet often provided with proprietary software.\n\nThis guide is meant to be an on-ramp for soon-to-be R Users and a fill-in-the-gap guide for existing R Users. It starts with the most basic question, \"what is R?\" and progresses to advanced topics like organizing analyses. Along the way it even demonstrates how to read XKCD comics in R.\n\nR boasts a strong community in the world and inside the Urban Institute. Please don't hesitate to contact Aaron Williams (awilliams\\@urban.org) or Amy Rogin (arogin\\@urban.org) with thoughts or questions about R.\n\n## What is R?\n\n![](intro-to-r/images/r-logo.png){width=\"15%\"}\n\n[Source](https://www.r-project.org/logo/)\n\nR is a free, open-source software for statistical computing. It is known for intuitive, crisp graphics and an extensive, growing library of statistical and analytic methods. Above all, R boasts an enthusiastic community of developers, instructors, and users.\n\nThe copyright and documentation for R is held by a not-for-profit organization called [The R Foundation](https://www.r-project.org/foundation/).\n\n![](intro-to-r/images/r-studio-logo.png){width=\"15%\"}\n\n[Source, Fair use](https://en.wikipedia.org/w/index.php?curid=48590482)\n\nRStudio is a free, open-source integrated development environment (IDE) that runs on top of R. In practice, R users almost exclusively open RStudio and rarely directly open R.\n\nRStudio is developed by a for-profit company called [RStudio](https://www.rstudio.com/). RStudio, the company, employs some of the R community's most prolific, open-source developers and creates many open-source tools and trainings.\n\nWhile R code can be written in any text editor, the RStudio IDE is a powerful tool with a console, syntax-highlighting, and debugging tools. [This cheatsheet](https://github.com/rstudio/cheatsheets/raw/master/rstudio-ide.pdf) outlines the power of RStudio.\n\n## Installation and Updates\n\n------------------------------------------------------------------------\n\n### When should you update?\n\nAll Urban computers should come pre-installed with R and Rstudio. However your R version may be out of date and require updating. We recommend having at least R version 3.6.0 or higher. 
\n\nIf you're working on a personal computer, you may not have R or RStudio installed. Follow this guide to install both on your computer.\n\n### Updating/Installing R\n\n1) Visit https://cran.r-project.org/bin/windows/base/. The latest R version will be the downloadable link at the top. As of 1/1/2020, that R version is 3.6.2. Click on the link at the top and download the `R-x.x.x-win.exe` file.\n\n2) Open the `R-x.x.x-win.exe` file. Click next, accept all the defaults, and install R. After R has been installed, click the Finish button. You should not need admin privileges for this.\n\n3) Check that your version of R has been updated in RStudio. If RStudio is already open, first close it. Then open RStudio and resubmit `R.Version()$version.string`. You should see an updated version number printed out on the console.\n\n4) Test that R packages are loading as expected. Packages you already had installed should continue to work with newer versions of R. But in some cases, you may need to re-install the packages to work properly with new versions of R.\n\n### Updating/Installing RStudio\n\n1) Open RStudio and go to Help \\> Check for Updates to see if RStudio is up-to-date.\n\n2) If it is out-of-date, download the [appropriate update](https://rstudio.com/products/rstudio/download/#download).\n\n3) Before you run the installer, contact IT at helpdesk\\@urban.org for administrative approval as the program requires admin access.\n\n4) Run the installer and accept all defaults.\n\nMoving forward, RStudio will automatically and regularly update on Windows computers at the Urban Institute.\n\n## Learning R\n\n------------------------------------------------------------------------\n\n### What to Learn\n\nThere is often more than one way to accomplish a goal in R because of the language's flexibility. At first, this flexibility can be overwhelming. That's why it is useful to pick and master one set of tools in R before branching out and learning everything R has to offer.\n\nFortunately, [Hadley Wickham's tidyverse](https://www.tidyverse.org/) offers a comprehensive set of tools for data analysis that are good for both beginners and experts. The tidyverse is self-described as \"an opinionated collection of R packages designed for data science.\" The tidyverse consists of almost two dozen clear and concise tools for every part of an analysis workflow. 
At first, focus on the function `read_csv()` for loading data, the package `dplyr` for manipulating data, and the package `ggplot2` for plotting.\n\nHere's a quick example that reads a .csv, filters the data, and creates a publishable column plot in just fifteen lines of code:\n\n```{r quick example}\n# load packages and source the Urban Institute ggplot2 theme\nlibrary(tidyverse) # contains read_csv, library(dplyr), and library(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\n# read bankdata.csv\nbank <- read_csv(\"intro-to-r/data/bankdata.csv\") \n\nbank_subset <- bank %>%\n\t# filter to observations of unmarried mothers less than age 30\n\tfilter(married == \"NO\" & age < 30) %>%\t\n\t# drop all variables except children and income\n\tselect(children, income)\t\t\t\t\t\t\t\t\n\n# plot!\nbank_subset %>%\n\tggplot(mapping = aes(x = children, y = income)) +\n\tgeom_bar(stat = \"summary\", fun.y = \"mean\") +\n\tscale_y_continuous(expand = c(0, 0), labels = scales::dollar) +\n\tlabs(title = \"Mean income\",\n\t\t\t subtitle = \"Unmarried mothers less than age 30\",\n\t\t\t caption = \"Urban Institute analysis of bank data\",\n\t\t\t x = \"Number of children\",\n\t\t\t y = \"Income\")\n```\n\n### Resources for Learning\n\n*R for Data Science* by Hadley Wickham and Garrett Grolemund is the best print resource for learning R and the tidyverse. The book is available [online](http://r4ds.had.co.nz/index.html) for free and *begins* with visualization, which is motivating and practical. *R for Data Science* contains dozens of worthwhile exercises but no solutions guide. Please check your solutions against the [Urban Institute r4ds solutions guide on GitHub](https://github.com/UI-Research/r4ds-exercises.git) and please contribute if the exercise isn't already in the guide!\n\nRStudio publishes a number of cheat sheets that cover the tidyverse. The main cheat sheets can be accessed in RStudio at Help \\> Cheat Sheets. Additional cheat sheets are accessible on the [RStudio website](https://www.rstudio.com/resources/cheatsheets/).\n\nDavid Robinson, a data scientist from DataCamp, has a [video course](https://www.datacamp.com/instructors/drobinson) about the tidyverse. Few people know as much about R and communicate as effectively as David Robinson.\n\n*Advanced R* by Hadley Wickham is a good resource for new R users that have experience with other programming languages and computer science. It is available [online](http://adv-r.had.co.nz/) for free.\n\n### Library\n\nIt's easy to feel overwhelmed by the frenetic development of the extended R universe. 
Books are an invaluable resource for slowing down and focusing on fully-formed ideas.\n\nAaron Williams (awilliams\\@urban.org) has a number of books that can be checked out:\n\n- [The Art of R Programming](https://www.nostarch.com/artofr.htm)\n- [ggplot2](http://www.springer.com/us/book/9780387981413)\n- [Efficient R Programming](http://shop.oreilly.com/product/0636920047995.do) ([Online!](https://csgillespie.github.io/efficientR/))\n- [Text Mining with R](http://shop.oreilly.com/product/0636920067153.do) ([Online!](https://www.tidytextmining.com/))\n- [Reasoning with Data](https://www.guilford.com/books/Reasoning-with-Data/Jeffrey-Stanton/9781462530267/reviews)\n- [Practical Statistics for Data Scientists](http://shop.oreilly.com/product/0636920048992.do)\n\n### Built-in Data Sets\n\nR has many built-in data sets that are useful for practice, and even more data sets are accessible through R packages.\n\nSubmitting `data()` shows a list of all available data sets. `cars` and `iris` are two classic sets that are used in many examples.\n\n`library(tidyverse)` loads many more \"tidy\" data sets including `diamonds` and `starwars`.\n\n```{r tidyverse}\nlibrary(tidyverse)\nstarwars %>%\n\tcount(species) %>%\n\tarrange(desc(n)) %>%\n\thead()\n```\n\n`library(dslabs)` by [Rafael Irizarry](https://simplystatistics.org/2018/01/22/the-dslabs-package-provides-datasets-for-teaching-data-science/) includes varied, intentionally imperfect data sets that are useful for practice. Students of econometrics will enjoy `library(wooldridge)`. It loads 105 data sets from *Introductory Econometrics: A Modern Approach* by Jeffrey Wooldridge. Now you can practice estimating your hedonic pricing models in R!\n\n```{r psid}\nlibrary(wooldridge)\nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nas_tibble(hprice1) %>%\n\tggplot(aes(x = sqrft, y = price)) +\n\tgeom_point() +\n\tscale_y_continuous(expand = c(0, 0), lim = c(0, 800)) +\n\tlabs(title = '\"hprice1\" data from Wooldridge') \n```\n\n### Getting Help\n\nEven the best R programmers spend hours each week searching the Internet for answers. Here are some of the best ways to find answers:\n\nSubmit `?` and any function name without parentheses (ex. `?mean`) to see the function documentation in RStudio.\n\nWhen Googling, set the search range to the last year to avoid out-of-date solutions and to focus on up-to-date practices.\n\n[Stack Overflow](https://stackoverflow.com/) contains numerous solutions. Add `[r]` to any search to limit results to R. If a problem is particularly perplexing, it is simple to submit questions. Exercise caution when submitting questions because the Stack Overflow community has strict norms about questions and loose norms about respecting novices.\n\n[RStudio Community](https://community.rstudio.com/) is a newer forum for R Users. It has a smaller back catalog than Stack Overflow but users are friendlier than on Stack Overflow.\n\nFinally, Aaron Williams (awilliams\\@urban.org) from IBP and Amy Rogin (arogin\\@urban.org) from METRO are available to solve problems, offer guidance, and share R enthusiasm.\n\n### CRAN Task Views\n\nR has sub-communities, frameworks, and tools focused on different subject-matter and methodological areas. 
[CRAN Task Views](https://cran.r-project.org/web/views/) is invaluable for understanding these communities and finding the best frameworks and tools for different disciplines in R.\n\nCRAN Task Views has 35 pages focused on subcategories of R ranging from [econometrics](https://cran.r-project.org/web/views/Econometrics.html) to natural language processing. Each page is maintained by a subject-matter expert and contains methods, packages, books, and mailing lists that are useful for researchers.\n\nThe econometrics page alone contains detailed information on basic linear regression, microeconometrics, instrumental variables, panel data models, further regression models, time series data and models, data sets, CRAN packages, articles, books, and more.\n\n## R Code\n\n------------------------------------------------------------------------\n\nIt's time to start writing R code. Remember, most R users never open R and exclusively use RStudio. Go ahead and open R once to admire its dated text editor. Then, close R and never directly open it again. Now, open RStudio.\n\n### Submitting Code\n\nRStudio has four main panels: code editor (top left by default), R console (bottom left by default), environment and history (top right by default), and files, plots, packages, help, and viewer pane (bottom right by default).\n\nThere are two main ways to submit code:\n\n1) Type code to the right of ![](intro-to-r/images/code-console.png) in the R console and hit enter. **Note:** R won't create a long-term record of this code.\n2) Click ![](intro-to-r/images/new-script.png) in the top left to create a new R script in the code editor panel. Type code in the script. Highlight desired code and either click Run in the top right of the code editor panel or type Ctrl/command-enter to run code. Scripts can be saved, so they are the best way to write code that will be used again.\n\nFor practice, submit `state.name` in the R console to create a vector with all fifty state names (sorry statehood advocates, no Washington, D.C.). Next, create a script, paste `state.name`, highlight the text, and click Run at the top right of the code editor. You should get the same output both times.\n\n```{r state names}\nstate.name\n```\n\n### Syntax\n\nThere are five fundamental pieces of syntax in R.\n\n- `<-` is the assignment operator. An object created on the right side of an assignment operator is assigned to a name on the left side of an assignment operator. Assignment operators are important for saving the consequences of operations and functions. Operations without assignment operators will typically be printed to the console but not saved.\n- `#` begins a comment. Comments are useful for explaining decisions in scripts. As Hadley Wickham notes in the [Tidyverse styleguide](http://style.tidyverse.org/), 'In code, use comments to explain the \"why\" not the \"what\" or \"how\".'\n- `c()` combines similar vectors into larger vectors. For example, `c(1, 2, 3)` is a numeric vector of length three made up of three numeric vectors of length one.\n- `?` in front of any function name without parentheses returns function documentation. For example, `?mean`.\n- `%>%` from `library(magrittr)` and `library(tidyverse)` is the \"pipe operator\". It passes the output from one function to another function. This is useful because strings of operations can be \"piped\" together instead of each individual operation needing to be assigned to an object. The short sketch after this list uses all five pieces.
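\n\n```\n# assign a vector of the numbers 1, 2, and 3 to the name apples\napples <- c(1, 2, 3)\n\n# pipe apples into mean() (%>% requires library(magrittr) or library(tidyverse))\napples %>% mean()\n\n# open the documentation for mean()\n?mean\n```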
\n\n### Vectors\n\nVectors are the fundamental piece of data in R. R has six vector types: logical, integer, double, character, complex, and raw. A single vector can't mix types. You can check the type of a vector with `typeof()` and the length with `length()`.\n\n### Data frames\n\nData frames are combinations of equal-length vectors. Data analysis in R is built around data frames. As a guiding principle when working with data frames, you want to have \"tidy data\" whenever possible. A tidy data frame means that:\n\n1. Each variable has its own column.\n\n2. Each observation has its own row.\n\n3. Each value has its own cell.\n\n[![\\[Source\\](https://r4ds.had.co.nz/tidy-data.html)](intro-to-r/images/tidy-data.png)](https://r4ds.had.co.nz/tidy-data.html)\n\nHaving data in a tidy format allows R's vectorized nature to shine and many of the `tidyverse` functions are designed for tidy data.\n\n### Missing values\n\nR stores missing values as `NA`. A single `NA` in a calculation can cause the entire result to return as `NA`.\n\n```{r}\nsum(c(2, 2, NA))\n```\n\nThe contagiousness of `NA` is good: it makes users explicitly acknowledge dropping missing values with `na.rm = TRUE`.\n\n```{r}\nsum(c(2, 2, NA), na.rm = TRUE)\n```\n\n`== NA` does not test for missing values. Instead, use `is.na()`. Because the logical vector returned by `is.na()` can be used in math, `sum(is.na(x))` counts the missing values in `x`. `complete.cases()` is similarly useful: it returns `TRUE` for each row of a data frame that contains no missing values.\n\n### Functions\n\nFunctions in R are collections of code that, when called, perform certain actions. R contains hundreds of functions, and thousands more can be accessed through packages.\n\nMost functions take arguments. For example, the function `mean()` has arguments `x`, `trim`, `na.rm`, and `...`. The first argument in most functions, in this case `x`, is an input object. Arguments can be passed to functions by name or position. `mean(c(1, 2, 3))` is equivalent to `mean(x = c(1, 2, 3))`.\n\nNotice how the other three arguments were skipped. Most arguments in functions have default values. The best way to see default values is to submit the function name with a question mark, like `?mean`. In this case, `trim = 0`, `na.rm = FALSE`, and no further arguments were passed through with `...`.\n\nIn the previous example, the `c()` function was nested inside of the `mean()` function. It is also possible to assign a vector of 1, 2, and 3 to a name and pass the name to the mean function.\n\n```{r mean, eval = FALSE}\napples <- c(1, 2, 3)\n\nmean(apples)\n```\n\nR is a [functional programming language](http://adv-r.had.co.nz/Functional-programming.html). In addition to having many pre-made functions like `mean()`, R has powerful tools for creating and manipulating custom functions. This is useful because it:\n\n- avoids tedious and error-prone copying-and-pasting and makes iterating processes simple;\n- is a powerful way to organize sets of operations;\n- is a standardized way to save code for later and to share operations with others.\n\nThis last bullet is key to the package system in R.\n\n### Packages\n\nOpening RStudio automatically loads \"base R\", a fundamental collection of code and functions that handles simple operations like math and system management. R can be extended with collections of code and functions developed by the R community called packages. This sounds wild, but most packages are created and maintained by some of the best statisticians and developers in the world.\n\nMost packages can be installed with `install.packages(\"dplyr\")`, where the string between the quotation marks is the name of the package.
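\n\nAs a quick sketch of the basic pattern, with `dplyr` standing in for any package:\n\n```\n# run once per machine, in the console\ninstall.packages(\"dplyr\")\n\n# run once per session, at the top of a script\nlibrary(dplyr)\n```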
\n\nPackages installed with `install.packages()` come from CRAN and must pass certain checks for performance and documentation. Popular packages on CRAN, like dplyr, have support, standards, and quality that match, if not exceed, those of proprietary software packages like Stata or SAS.\n\nIt is possible, but less common, to install packages from places like GitHub. This is less secure and the functionality of the packages is more likely to change over time. `install.packages()` need only be run once per version of a package per machine and should rarely be included in .R scripts.\n\nPackages are loaded once per R session with the function `library()`. It is a good idea to include `library(package-name)` at the top of scripts for each package used in the script. This way it is obvious at the top of the script which packages are installed and loaded.\n\n**Note:** `install.packages()` uses quoted package names and `library()` uses unquoted package names.\n\nFor practice, submit the following three lines of code to install `RXKCD`, load `library(RXKCD)`, and get a random [XKCD comic](https://www.xkcd.com/).\n\n```{r xkcd, eval=FALSE}\ninstall.packages(\"RXKCD\")\nlibrary(RXKCD)\ngetXKCD(\"random\")\n```\n\n```{r xkcd run, echo=FALSE}\nlibrary(RXKCD)\n# assignment to hide text output\ncomic <- getXKCD(539)\n```\n\nPackages are frequently updated, especially around the time R versions change. The easiest way to update packages is Tools \\> Check for Package Updates in RStudio.\n\nOccasionally, two loaded packages will have functions with identical names. Any conflicts will be announced when loading packages. See how `filter()` and `lag()` from `library(tidyverse)` and `library(stats)` conflict:\n\n![](intro-to-r/images/load-tidyverse.png) In this case, the tidyverse functions are usually favored. If there is ever a conflict or any doubt about which function is used, use the package name and `::` to directly call the function. For example, `dplyr::select(apples)`. `::` can also be used to call a function without loading the entire package.\n\n### CRAN\n\nThe [Comprehensive R Archive Network](https://cran.r-project.org/index.html) (CRAN) contains almost 12,000 packages contributed over the last two decades by a range of developers. New packages are added to CRAN almost every day.\n\nCRAN enables R to have all of the benefits of open-source development and the security and predictability of proprietary statistical packages like SAS and Stata. CRAN weds the benefits of broad-based, real-time package development with certain [standards](https://cran.r-project.org/index.html) for functionality and documentation. Methods and tools make it to R before SAS or Stata, if they ever make it to SAS or Stata, but have standards that generally exceed Python or other open-source languages. (See: [Malicious Libraries Found on Python Package Index (PyPI)](https://www.blog.pythonlibrary.org/2017/09/15/malicious-libraries-found-on-python-package-index-pypi/))\n\nBecause of CRAN's long history and R's place in the statistics community, CRAN contains many methods that can't be accessed, much less duplicated, using proprietary software. 
In addition to being useful now, this also ensures that R isn't a temporary fad and will have staying power because of the challenge of replicating or besting CRAN.\n\nR's extensible design is important, but most tasks can be accomplished with a handful of packages:\n\n- `ggplot2` data visualization\n- `dplyr` data management\n- `tidyr` data tidying\n- `readr` data import\n- `purrr` functional programming\n- `tibble` data frames\n- `hms` times\n- `stringr` character strings\n- `lubridate` dates/times\n- `forcats` factors\n- `DBI` databases\n- `haven` SPSS, SAS, and Stata files\n- `readxl` .xls and .xlsx\n- `modelr` simple modeling within a pipeline\n- `broom` turning models into tidy data\n- `tidyverse` loads all of the packages listed up to this point; see Hadley Wickham's \"[tidyverse](https://blog.rstudio.org/2016/09/15/tidyverse-1-0-0/)\"\n\n## Organizing Analyses\n\n------------------------------------------------------------------------\n\nThis section outlines how to organize an analysis to get the most out of R. Newer users may want to skip this section and work through [R for Data Science](http://r4ds.had.co.nz/) until they understand `library(readr)`, `library(dplyr)`, and `library(ggplot2)`.\n\n### Projects\n\nOrganizing scripts, files, and data is one of the most important steps to creating a clear and reproducible analysis.\n\nR Projects, proper noun, are the best way to organize an analysis. They have several advantages:\n\n- They make it possible to concurrently run multiple RStudio sessions.\n- They allow for project-specific RStudio settings.\n- They integrate well with Git version control.\n- They are the \"node\" of relative file paths. (more on this in a second)\n\nBefore setting up an R Project, go to Tools \\> Global Options and uncheck \"Restore most recently opened project at startup\".\n\n![](intro-to-r/images/restore.png){width=\"50%\"}\n\nEvery new analysis in R should start with an R Project. First, create a directory that holds all data, scripts, and files for the analysis. Storing files and data in sub-directories is encouraged. For example, data can be stored in a folder called data/.\n\nNext, click \"New Project...\" in the top right corner.\n\n![](intro-to-r/images/new-project.png){width=\"50%\"}\n\nWhen prompted, turn your recently created \"Existing Directory\" into a project.\n\n![](intro-to-r/images/existing-directory.png){width=\"50%\"}\n\nUpon completion, the name of the R Project should now be displayed in the top right corner of RStudio where it previously displayed \"Project: (None)\". Once opened, .Rproj files do not need to be saved. Double-clicking .Rproj files in the directory is now the best way to open RStudio. This will allow for the concurrent use of multiple R sessions and ensure the portability of file paths. Once an RStudio project is open, scripts can be opened by double-clicking individual files in the computer directory or clicking files in the \"Files\" tab in the bottom right of RStudio.\n\nR Projects make code highly portable because of the way they handle file paths. Here are a few rules:\n\n#### Filepaths\n\nNever use `\\` in file paths in R. `\\` is the escape character in R, so it will complicate an analysis. Fortunately, RStudio understands `/` in file paths regardless of operating system.\n\nNever use `setwd()` in R. It is unnecessary, it makes code unreproducible across machines, and it is rude to collaborators. R Projects create a better framework for file paths. 
Simply treat the directory where the R Project lives as the working directory and directories inside of that directory as sub-directories.\n\nFor example, say there's a `.Rproj` called `starwars-analysis.Rproj` in a directory called `starwars-analysis`. If there is a .csv in that folder called `jedi.csv`, the file can be loaded with `read_csv(\"jedi.csv\")` instead of `read_csv(\"H:/ibp/analyses/starwars-analysis/jedi.csv\")`. If that file is in a sub-directory of `starwars-analysis` called `data`, it can be loaded with `read_csv(\"data/jedi.csv\")`. The same concepts hold for writing data and graphics.\n\nThis simplifies code and makes it portable because all relative filepaths will be identical on all computers. To share an analysis, simply send the entire directory to a collaborator or share it with GitHub.
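\n\nHere's an example directory, sketched for the hypothetical `starwars-analysis` project above (the script and folder names are just illustrations):\n\n```\nstarwars-analysis/\n|-- starwars-analysis.Rproj\n|-- README.md\n|-- data/\n|   |-- jedi.csv\n|-- 01_clean-data.R\n|-- 02_analyze-data.R\n```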
\n\nIt isn't always possible to avoid absolute file paths because of the many different ways the Urban Institute stores data. Avoid absolute paths when possible and be deliberate about where analyses live in relation to where data live.\n\nFinally, it's good practice to include a README in the same directory as the .Rproj. The README should outline the purpose and the directories and can include information about how to contribute, licenses, dependencies, and acknowledgements. This [GitHub page](https://gist.github.com/PurpleBooth/109311bb0361f32d87a2) is a good README template.\n\nCheck out [R for Data Science](http://r4ds.had.co.nz/workflow-projects.html) by Hadley Wickham and Garrett Grolemund for a more thorough explanation of this workflow. Jenny Bryan also has a good [blogpost](https://www.tidyverse.org/articles/2017/12/workflow-vs-script/) about avoiding `setwd()`.\n\n### Naming Conventions\n\nNaming functions, objects, variables, files, and scripts is one of the toughest and least-taught dimensions of computer programming. Better names can add clarity to code, save time and effort, and minimize errors caused by accidentally overwriting existing functions or other objects.\n\n> There are only two hard things in Computer Science: cache invalidation and naming things. \\~ [Phil Karlton](http://www.meerkat.com/2017/12/naming-things-hard/)\n\n#### Functions and Other Objects\n\nR is case-sensitive.\n\nObjects in R can be named anything - [even unicode characters](https://www.r-bloggers.com/rules-for-naming-objects-in-r/). But just because something *can* be named anything doesn't mean it should.\n\nMost functions and objects in R are lowerCamelCase, period.separated, or underscore_separated. As an individual or team, it's important to pick a style and stick with it, but as [this article](https://journal.r-project.org/archive/2012-2/RJournal_2012-2_Baaaath.pdf) from 2012 shows, there isn't much consistency across the R community. Hadley Wickham's tidyverse uses underscores, so expect to see some consolidation into this style.\n\nIn general, it's good practice to name functions with verbs and other objects with nouns.\n\nVariable and object names that start with numbers, have spaces, or use peculiar syntax require back-ticks.\n\n> select(urban, \\`R Users Group\\`)\n\n> urban\\$\\`R Users Group\\`)\n\nFinally, it's possible to overwrite existing functions and other objects in R with the assignment operator. Don't give vectors or data frames the same names as exisiting functions and don't overwrite existing functions with custom functions.\n\n#### Files\n\nNaming conventions for scripts and files is probably the most overlooked dimension in programming and analysis. The first three bullets from this section come from this [rich slide deck](http://www2.stat.duke.edu/~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf) by Jenny Bryan. This may seem pedantic, but picking a file naming convention now can save a bunch of time and headaches in the future.\n\n**1) Machine readable**\n\nCreate file names that are easily machine readable. Use all lower case letters and skip punctuation other than delimiters. Use underscores as characters for splitting the file name. For example, `stringr::str_split_fixed(\"2018-01-10_r-introduction_machine-readable-example_01.csv\", \"[_\\\\.]\", 5)` splits the file name on underscores and periods and returns date, project, file name, file number, and file type. 
This information can then be stored and sorted in a data frame.\n\n**2) Human readable**\n\nCreate file names that are human readable. The example from above is informative without any machine interpretation.\n\n**3) Plays well with default ordering**\n\nIt is often useful to include date or sequence numbers in script and file names. For example, include 2018-10-01 for data collected on January 10th, 2018 or include 3 for the third script a sequence of five `.R` programs. Starting file names with the date or sequence numbers means files will show up in a logical order by default. Be sure to use ISO 8601 standard for dates (YYYY-MM-DD).\n\n**4) Don't Use File Names for Version Control**\n\nVersion control with file names is unwieldy and usually results in names that are barely human readable and definitely not machine readable.\n\n> \"2018-01-10_r-introduction_machine-readable-example_01_v2_for-aaron_after-review_before-submission.R\"\n\nIterations usually don't iterate sensibly. For example, what was \"v1\", \"v2\" abandoned for \"for-aaron\", \"after-review\", \"before-submission\". Furthermore, version control with file names is poor for concurrent work and merging.\n\nThe next section will outline the optimal tool for version control.\n\n### Version Control\n\nThe workflow outlined above integrates perfectly with version control like Git and distributed version control repository hosting services like GitHub.\n\nVersion control is a system for recording changes to files over time. Version control is built around repositories. In this case, the folder containing the `.Rproj` is the perfect directory to use as a repository. A handful of simple commands are used to track and commit changes to text files (.R, .Rmd, etc.) and data. This record is valuable for testing alternatives, communicating with others and your future self, and documenting progress on projects.\n\nGitHub is a distributed repository system built on top of Git. GitHub has a number of valuable tools for collaboration and project management. In particular, it makes concurrent collaboration on code simpler with branches and has a slick system for issues. Here are the [branches](https://github.com/UrbanInstitute/urban_R_theme/branches) and [issues](https://github.com/UrbanInstitute/urban_R_theme/issues) for the Urban Institute R Graphics Guide. It also has free web hosting for websites like the website you are reading right now. [GitHub has a quick guide that is a good place to start learning Git](https://try.github.io/levels/1/challenges/1).\n\nThe Urban Institute has a number of legacy models and code bases that span years and have been touched by scores of brilliant researchers. The future value of a record of all code changes and development is borderline unthinkable.\n\n### Coding Style\n\n> \"Good coding style is like using correct punctuation. You can manage without it, but it sure makes things easier to read.\" \\~Hadley Wickham (2014)\n\ngood coding style is like using correct punctuation you can manage without it but it sure makes thing easier to read\n\nThe details of a coding style are less important than consistently sticking to that style. 
Be flexible when working with collaborators so the style doesn't change inside an analysis.\n\nHere are three good sources for inspiration:\n\n- [Tidyverse Style Guide](http://style.tidyverse.org/)\n- [Google's R Style Guide](https://google.github.io/styleguide/Rguide.xml)\n- [Hadley Wickham's R Style Guide](http://adv-r.had.co.nz/Style.html)\n\n## Putting it All Together\n\n------------------------------------------------------------------------\n\nR can augment or replace a traditional proprietary statistical packages like SAS or Stata with a few extra bells and whistles, but hopefully this guide and other resources show a fuller vision for developing reproducible, accurate, and collaborative analyses.[^1]\n\n[^1]: The language \"reproducible, accurate, and collaborative analyses\" comes from [Hilary S. Parker's talk](https://www.rstudio.com/resources/videos/opinionated-analysis-development/) at rstudio::conf 2017 about opinionated analysis development.\n\nThis research pipeline, to use the phrase by Roger Peng, Jeff Leek, and Brian Caffo, combines the best of traditional economic and social policy research, computer science/software development, and statistics.[^2] Here are the rules:\n\n[^2]: The basis for [this section](https://www.coursera.org/learn/reproducible-research/lecture/abevs/reproducible-research-concepts-and-ideas-part-2) comes from this Coursera talk by Roger Peng.\n\n#### 1) No steps in an analysis are done by hand and all steps are recorded with executable scripts.\n\nIt is common to use executable scripts to estimate a regression equation or to tabulate weighted summary statistics. But for some reason, other steps like file management, data munging, and visualization are often done \"by hand\". Good science demands that every step of an analysis is recorded - and if possible - with executable scripts.\n\nFortunately, it is possible to script most steps in R from downloading data from the Internet and accessing APIs to visualizations and drafting manuscripts. This may be challenging at first, but it will save time and result in better research in the long run.\n\n#### 2) All code is entirely reproducible and portable.\n\nExecutable scripts are for communicating with other researchers and our future selves. Scripts lose value if they aren't portable and can't be reproduced in the future or by others. Recording every step with execuatble scripts is a start, but scripts aren't valuable if they require expensive proprietary software,or if researchers have to significantly alter scripts to run an analysis.\n\nOpen source software, like R, promotes accessibility, portability, and reproducibility. Also, be sure to avoid `setwd()` and use relative filepaths.\n\n#### 3) Local and collaborative version control is used and all repositories include all code and a README.\n\nUse local version control like Git and a distributed version control repository hosting service like GitHub to track changes and share analyses. The version control should include all scripts and meta information about the analysis in a README.\n\n#### 4) Raw data and tidy analytic data are stored in a collaborative location with a code book.\n\nMany raw data are already stored in collaborative locations like BLS.gov and don't need to be duplicated. Tidy analytic data, like the data used to estimate a regression equation, should be stored in a collaborative location. This is good practice, but is less essential if executable scripts are flawless and reproducible. 
Researcher-entered data and data from less-stable sources should be stored in raw and analytic forms.\n\nSmall data sets can be stored on GitHub without issue. Larger data sets should be stored in collaborative locations accessible by scripting languages. This is only possible for public data and best-practices for private data are less established.\n\nSave codebooks for data sets as text files or PDFs in repositories. Creating codebooks for user-entered data or variables created in executable scripts is often worth the time.\n\n#### 5) Code review and issue tracking are used to improve accuracy and computational efficiency.\n\nGetting stronger programmers and/or methodologists to review code is valuable for limiting programming and analytic mistakes, improving computational efficiency, and learning.\n\n[GitHub issues](https://guides.github.com/features/issues/) is a powerful tool for managing, discussing, and collaborating on code.\n\n#### 6) Projects rely heavily on literate statistical programming and standard means of distribution for execution, validation, and publishing.\n\nLiterate statistical programming is the combination of natural language explanations for humans and executable code in one document. The idea was created by Donald Knuth and is embodied by R Markdown.\n\nR Markdown combines text chunks, code chunks, and output chunks in one script that can be \"knitted\" using `library(knitr)` to created PDFs, books, .htmls, and websites like the website where this guide lives.\n\nThis workflow combines the analytic and narrative process in a tool that is flexible, scalable, reproducible, and less error-prone. R Markdown documents can be used for executing programs, validating models and analyses, and publishing. These documents can be submitted to many academic journals and shared easily with [GitHub pages](https://pages.github.com/).\n\n#### 7) Software versions and dependencies are recorded and all software is cited in publications.\n\n`sessionInfo()` reports the R version, locale, packages used, and other important information about an R session. `citation()` creates a text and BibTex entry of the citation for R. `citation()` creates a text and BibTex entry for R packages. `library(packrat)` (outlined [here](https://rstudio.github.io/packrat/)) is a tool for saving R dependencies.\n\n## Bibliography and References\n\n------------------------------------------------------------------------\n\nHadley Wickham (2017). tidyverse: Easily Install and Load the 'Tidyverse'. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\n\nHadley Wickham and Garrett Grolemund (2017). R For Data Science http://r4ds.had.co.nz/\n\nHadley Wickham (2014). Advanced R http://adv-r.had.co.nz/Style.html\n\nHilary S. Parker (2017. Opinionated Analysis Development https://www.rstudio.com/resources/videos/opinionated-analysis-development/\n\nJenny Bryan (2017).\\\nProject-oriented workflow https://www.tidyverse.org/articles/2017/12/workflow-vs-script/\n\nJenny Bryan (2015). naming things. http://www2.stat.duke.edu/\\~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf\n\nJJ Allaire, Yihui Xie, Jonathan McPherson, Javier Luraschi, Kevin Ushey, Aron Atkins, Hadley Wickham, Joe Cheng and Winston Chang (2017). rmarkdown: Dynamic Documents for R. R package version 1.8. https://CRAN.R-project.org/package=rmarkdown\n\nJustin M. Shea (2017). wooldridge: 105 Data Sets from \"Introductory Econometrics: A Modern Approach\" by Jeffrey M. Wooldridge. R package version 1.2.0. 
https://CRAN.R-project.org/package=wooldridge\n\nRoger Peng Reproducible Research Part 2 https://www.coursera.org/learn/reproducible-research/lecture/abevs/reproducible-research-concepts-and-ideas-part-2\n\nYihui Xie (2017). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.18.\n\n```{r session info}\nsessionInfo()\n```\n"},"formats":{"html":{"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[]},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"intro-to-r.html"},"language":{},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.2.269"},"extensions":{"book":{"multiFile":true}}}}}
\ No newline at end of file
+{"title":"Introduction","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}}},"headingText":"Introduction","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n```{r markdown setup, include=FALSE}\n\nknitr::opts_chunk$set(fig.path = \"intro-to-r/www/images/\")\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\nknitr::opts_chunk$set(warning = FALSE)\n\noptions(scipen = 999)\n```\n\n\nR is one of two premier programming languages for data science and one of the [fastest growing programming languages.](https://stackoverflow.blog/2017/10/10/impressive-growth-r/) Created by researchers for researchers (with some help from software engineers), R offers rich, intuitive tools that make it perfect for visualization, public policy analysis, econometrics, geospatial analysis, and statistics.\n\nR doesn't come in a box. R was never wrapped in cellophane and it definitely isn't purchased at a store. R's pricelessness and open-source development are two of its greatest strengths, but it can often leave new users without the anchor of the box and booklet often provided with proprietary software.\n\nThis guide is meant to be an on-ramp for soon-to-be R Users and a fill-in-the-gap guide for existing R Users. It starts with the most basic question, \"what is R?\" and progresses to advanced topics like organizing analyses. Along the way it even demonstrates how to read XKCD comics in R.\n\nR boasts a strong community in the world and inside the Urban Institute. Please don't hesitate to contact Aaron Williams (awilliams\\@urban.org) or Amy Rogin (arogin\\@urban.org) with thoughts or questions about R.\n\n## What is R?\n\n![](intro-to-r/images/r-logo.png){width=\"15%\"}\n\n[Source](https://www.r-project.org/logo/)\n\nR is a free, open-source software for statistical computing. It is known for intuitive, crisp graphics and an extensive, growing library of statistical and analytic methods. Above all, R boasts an enthusiastic community of developers, instructors, and users.\n\nThe copyright and documentation for R is held by a not-for-profit organization called [The R Foundation](https://www.r-project.org/foundation/).\n\n![](intro-to-r/images/r-studio-logo.png){width=\"15%\"}\n\n[Source, Fair use](https://en.wikipedia.org/w/index.php?curid=48590482)\n\nRStudio is a free, open-source integrated development environment (IDE) that runs on top of R. In practice, R users almost exclusively open RStudio and rarely directly open R.\n\nRStudio is developed by a for-profit company called [RStudio](https://www.rstudio.com/). RStudio, the company, employs some of the R community's most prolific, open-source developers and creates many open-source tools and trainings.\n\nWhile R code can be written in any text editor, the RStudio IDE is a powerful tool with a console, syntax-highlighting, and debugging tools. [This cheatsheet](https://github.com/rstudio/cheatsheets/raw/master/rstudio-ide.pdf) outlines the power of RStudio.\n\n## Installation and Updates\n\n------------------------------------------------------------------------\n\n### When should you update?\n\nAll Urban computers should come pre-installed with R and Rstudio. However your R version may be out of date and require updating. We recommend having at least R version 3.6.0 or higher. 
You can check what version of R you have installed by opening RStudio and submitting the following line of code to the console: `R.Version()$version.string`.\n\nIf you're working on a personal computer, you may not have R or RStudio installed, so follow this guide to install both on your computer.\n\n### Updating/Installing R\n\n1) Visit https://cran.r-project.org/bin/windows/base/. The latest R version will be the downloadable link at the top. As of 1/1/2020, that R version is 3.6.2. Click on the link at the top and download the `R-x.x.x-win.exe` file.\n\n2) Open the `R-x.x.x-win.exe` file. Click Next, accept all the defaults, and install R. After R has been installed, click the Finish button. You should not need admin privileges for this.\n\n3) Check that your version of R has been updated in RStudio. If RStudio is already open, first close it. Then open RStudio and resubmit `R.Version()$version.string`. You should see an updated version number printed out on the console.\n\n4) Test that R packages are loading as expected. Packages you already had installed should continue to work with newer versions of R. But in some cases, you may need to re-install packages so they work properly with new versions of R.\n\n### Updating/Installing RStudio\n\n1) Open RStudio and go to Help \\> Check for Updates to see if RStudio is up-to-date.\n\n2) If it is out-of-date, download the [appropriate update](https://rstudio.com/products/rstudio/download/#download).\n\n3) Before you run the installer, contact IT at helpdesk\\@urban.org for administrative approval, as the program requires admin access.\n\n4) Run the installer and accept all defaults.\n\nMoving forward, RStudio will automatically and regularly update on Windows computers at the Urban Institute.\n\n## Learning R\n\n------------------------------------------------------------------------\n\n### What to Learn\n\nThere is often more than one way to accomplish a goal in R because of the language's flexibility. At first, this flexibility can be overwhelming. That's why it is useful to pick and master one set of tools in R before branching out and learning everything R has to offer.\n\nFortunately, [Hadley Wickham's tidyverse](https://www.tidyverse.org/) offers a comprehensive set of tools for data analysis that are good for both beginners and experts. The tidyverse is self-described as \"an opinionated collection of R packages designed for data science.\" The tidyverse consists of almost two dozen clear and concise tools for every part of an analysis workflow. 
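Getting the tidyverse takes two lines of code (a minimal sketch; as covered later in this guide, installation happens once per machine while loading happens once per session):\n\n```{r install tidyverse, eval=FALSE}\n# install once per machine\ninstall.packages(\"tidyverse\")\n\n# load once per session\nlibrary(tidyverse)\n```\n\n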
At first, focus on the function `read_csv()` for loading data, the package `dplyr` for manipulating data, and the package `ggplot2` for plotting.\n\nHere's a quick example that reads a .csv, filters the data, and creates a publishable column plot in just fifteen lines of code:\n\n```{r quick example}\n# load packages and source the Urban Institute ggplot2 theme\nlibrary(tidyverse) # contains read_csv, library(dplyr), and library(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\n# read bankdata.csv\nbank <- read_csv(\"intro-to-r/data/bankdata.csv\")\n\nbank_subset <- bank %>%\n\t# filter to observations of unmarried mothers less than age 30\n\tfilter(married == \"NO\" & age < 30) %>%\n\t# drop all variables except children and income\n\tselect(children, income)\n\n# plot!\nbank_subset %>%\n\tggplot(mapping = aes(x = children, y = income)) +\n\tgeom_bar(stat = \"summary\", fun = \"mean\") +\n\tscale_y_continuous(expand = c(0, 0), labels = scales::dollar) +\n\tlabs(title = \"Mean income\",\n\t\t\t subtitle = \"Unmarried mothers less than age 30\",\n\t\t\t caption = \"Urban Institute analysis of bank data\",\n\t\t\t x = \"Number of children\",\n\t\t\t y = \"Income\")\n```\n\n### Resources for Learning\n\n*R for Data Science* by Hadley Wickham and Garrett Grolemund is the best print resource for learning R and the tidyverse. The book is available [online](http://r4ds.had.co.nz/index.html) for free and *begins* with visualization, which is motivating and practical. *R for Data Science* contains dozens of worthwhile exercises but no solutions guide. Please check your solutions against the [Urban Institute r4ds solutions guide on GitHub](https://github.com/UI-Research/r4ds-exercises.git) and please contribute if the exercise isn't already in the guide!\n\nRStudio publishes a number of cheat sheets that cover the tidyverse. The main cheat sheets can be accessed in RStudio at Help \\> Cheat Sheets. Additional cheat sheets are accessible here on the [RStudio website](https://www.rstudio.com/resources/cheatsheets/).\n\nDavid Robinson, a data scientist from DataCamp, has a new [video course](https://www.datacamp.com/instructors/drobinson) about the tidyverse. Few people know as much about R and communicate as effectively as David Robinson.\n\n*Advanced R* by Hadley Wickham is a good resource for new R users who have experience with other programming languages and computer science. It is available [online](http://adv-r.had.co.nz/) for free.\n\n### Library\n\nIt's easy to feel overwhelmed by the frenetic development of the extended R universe. 
Books are an invaluable resource for slowing down and focusing on fully-formed ideas.\n\nAaron Williams (awilliams\\@urban.org) has a number of books that can be checked out:\n\n- [The Art of R Programming](https://www.nostarch.com/artofr.htm)\n- [ggplot2](http://www.springer.com/us/book/9780387981413)\n- [Efficient R Programming](http://shop.oreilly.com/product/0636920047995.do) ([Online!](https://csgillespie.github.io/efficientR/))\n- [Text Mining with R](http://shop.oreilly.com/product/0636920067153.do) ([Online!](https://www.tidytextmining.com/))\n- [Reasoning with Data](https://www.guilford.com/books/Reasoning-with-Data/Jeffrey-Stanton/9781462530267/reviews)\n- [Practical Statistics for Data Scientists](http://shop.oreilly.com/product/0636920048992.do)\n\n### Built-in Data Sets\n\nR has many built-in data sets that are useful for practice, and even more data sets are accessible through R packages.\n\nSubmitting `data()` shows a list of all available data sets. `cars` and `iris` are two classic sets that are used in many examples.\n\n`library(tidyverse)` loads many more \"tidy\" data sets including `diamonds` and `starwars`.\n\n```{r tidyverse}\nlibrary(tidyverse)\nstarwars %>%\n\tcount(species) %>%\n\tarrange(desc(n)) %>%\n\thead()\n```\n\n`library(dslabs)` by [Rafael Irizarry](https://simplystatistics.org/2018/01/22/the-dslabs-package-provides-datasets-for-teaching-data-science/) includes varied, intentionally imperfect data sets that are useful for practice. Students of econometrics will enjoy `library(wooldridge)`. It loads 105 data sets from *Introductory Econometrics: A Modern Approach* by Jeffrey Wooldridge. Now you can practice estimating your hedonic pricing models in R!\n\n```{r psid}\nlibrary(wooldridge)\nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nas_tibble(hprice1) %>%\n\tggplot(aes(x = sqrft, y = price)) +\n\tgeom_point() +\n\tscale_y_continuous(expand = c(0, 0), lim = c(0, 800)) +\n\tlabs(title = '\"hprice1\" data from Wooldridge')\n```\n\n### Getting Help\n\nEven the best R programmers spend hours each week searching the Internet for answers. Here are some of the best ways to find answers:\n\nSubmit `?` and any function name without parentheses (ex. `?mean`) to see the function documentation in RStudio.\n\nWhen Googling, set the search range to the last year to avoid out-of-date solutions and to focus on up-to-date practices.\n\n[Stack Overflow](https://stackoverflow.com/) contains numerous solutions. Add `[r]` to any search to limit results to R. If a problem is particularly perplexing, it is simple to submit questions. Exercise caution when submitting questions because the Stack Overflow community has strict norms about questions and loose norms about respecting novices.\n\n[RStudio Community](https://community.rstudio.com/) is a new forum for R Users. It has a smaller back catalog than Stack Overflow, but its users are friendlier.\n\nFinally, Aaron Williams (awilliams\\@urban.org) from IBP and Amy Rogin (arogin\\@urban.org) from METRO are available to solve problems, offer guidance, and share R enthusiasm.\n\n### CRAN Task Views\n\nR has sub-communities, frameworks, and tools focused on different subject-matter and methodological areas. 
[CRAN Task Views](https://cran.r-project.org/web/views/) is invaluable for understanding these communities and finding the best frameworks and tools for different disciplines in R.\n\nCRAN Task Views has 35 pages focused on subcategories of R ranging from [econometrics](https://cran.r-project.org/web/views/Econometrics.html) to natural language processing. Each page is maintained by a subject-matter expert and contains methods, packages, books, and mailing lists that are useful for researchers.\n\nThe econometrics page alone contains detailed information on basic linear regression, microeconometrics, instrumental variables, panel data models, further regression models, time series data and models, data sets, CRAN packages, articles, books, and more.\n\n## R Code\n\n------------------------------------------------------------------------\n\nIt's time to start writing R code. Remember, most R users never open R and exclusively use RStudio. Go ahead and open R once to admire its dated text editor. Then, close R and never directly open it again. Now, open RStudio.\n\n### Submitting Code\n\nRStudio has four main panels: code editor (top left by default), R console (bottom left by default), environment and history (top right by default), and files, plots, packages, help, and viewer pane (bottom right by default).\n\nThere are two main ways to submit code:\n\n1) Type code to the right of ![](intro-to-r/images/code-console.png) in the R console and hit enter. **Note:** R won't create a long-term record of this code.\n2) Click ![](intro-to-r/images/new-script.png) in the top left to create a new R script in the code editor panel. Type code in the script. Highlight desired code and either click Run in the top right of the code editor panel or type Ctrl/command-enter to run code. Scripts can be saved, so they are the best way to write code that will be used again.\n\nFor practice, submit `state.name` in the R console to create a vector with all fifty state names (sorry statehood advocates, no Washington, D.C.). Next, create a script, paste `state.name`, highlight the text, and click Run at the top right of the code editor. You should get the same output both times.\n\n```{r state names}\nstate.name\n```\n\n### Syntax\n\nThere are five fundamental pieces of syntax in R.\n\n- `<-` is the assignment operator. An object created on the right side of an assignment operator is assigned to a name on the left side of an assignment operator. Assignment operators are important for saving the consequences of operations and functions. Operations without assignment operators will typically be printed to the console but not saved.\n- `#` begins a comment. Comments are useful for explaining decisions in scripts. As Hadley Wickham notes in the [tidyverse style guide](http://style.tidyverse.org/), 'In code, use comments to explain the \"why\" not the \"what\" or \"how\".'\n- `c()` combines similar vectors into larger vectors. For example, `c(1, 2, 3)` is a numeric vector of length three made up of three numeric vectors of length one.\n- `?` in front of any function name without parentheses returns function documentation. For example, `?mean`.\n- `%>%` from `library(magrittr)` and `library(tidyverse)` is the \"pipe operator\". It passes the output from one function to another function. This is useful because strings of operations can be \"piped\" together instead of each individual operation needing to be assigned to an object.\n\n### Vectors\n\nVectors are the fundamental piece of data in R. 
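For example, the assignment operator and `c()` from the list above are all it takes to create one (a quick sketch to submit in the console):\n\n```{r vector sketch}\n# combine three character vectors of length one into one vector of length three\nfruits <- c(\"apple\", \"banana\", \"cherry\")\n\nfruits\n```\n\n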
R has six vector types (a vector can't mix types): logical, integer, double, character, complex, and raw. You can check the type of a vector with `typeof()` and its length with `length()`.\n\n### Data frames\n\nData frames are combinations of equal length vectors. Data analysis in R is built around data frames. As a guiding principle when working with data frames, you want to have \"tidy data\" whenever possible. A tidy data frame means that:\n\n1. Each variable has its own column.\n\n2. Each observation has its own row.\n\n3. Each value has its own cell.\n\n[![\\[Source\\](https://r4ds.had.co.nz/tidy-data.html)](intro-to-r/images/tidy-data.png)](https://r4ds.had.co.nz/tidy-data.html)\n\nHaving data in a tidy format allows R's vectorized nature to shine and many of the `tidyverse` functions are designed for tidy data.\n\n### Missing values\n\nR stores missing values as `NA`. A single `NA` in a calculation can cause the entire result to return as `NA`.\n\n```{r}\nsum(c(2, 2, NA))\n```\n\nThe contagiousness of `NA` is good: it makes users explicitly acknowledge dropping missing values with `na.rm = TRUE`.\n\n```{r}\nsum(c(2, 2, NA), na.rm = TRUE)\n```\n\n`== NA` does not test for missing values. Instead, use `is.na()`, which returns `TRUE` wherever a value is missing. Because R treats `TRUE` as 1 and `FALSE` as 0, `sum(is.na(x))` counts the missing values in a vector. Relatedly, `complete.cases()` returns `TRUE` for each row of a data frame with no missing values.\n\n### Functions\n\nFunctions in R are collections of code that perform actions when called. R contains hundreds of functions, and thousands more can be accessed through packages.\n\nMost functions take arguments. For example, the function `mean()` has arguments `x`, `trim`, `na.rm`, and `...`. The first argument in most functions, in this case `x`, is an input object. Arguments can be passed to functions by name or position. `mean(c(1, 2, 3))` is equivalent to `mean(x = c(1, 2, 3))`.\n\nNotice how the other three arguments were skipped. Most arguments in functions have default values. The best way to see default values is to submit the function name with a question mark, like `?mean`. In this case, `trim = 0`, `na.rm = FALSE`, and no further arguments were passed through with `...`.\n\nIn the previous example, the `c()` function was nested inside of the `mean()` function. It is also possible to assign a vector of 1, 2, and 3 to a name and pass the name to the mean function.\n\n```{r mean, eval = FALSE}\napples <- c(1, 2, 3)\n\nmean(apples)\n```\n\nR is a [functional programming language](http://adv-r.had.co.nz/Functional-programming.html). In addition to having many pre-made functions like `mean()`, R has powerful tools for creating and manipulating custom functions. This is useful because:\n\n- It avoids tedious and error-prone copying-and-pasting and makes iterating processes simple;\n- It is a powerful way to organize sets of operations;\n- It is a standardized way to save code for later and to share operations with others.\n\nThis last bullet is key to the package system in R.\n\n### Packages\n\nOpening RStudio automatically loads \"base R\", a fundamental collection of code and functions that handles simple operations like math and system management. R can be extended with collections of code and functions developed by the R community called packages. This sounds wild, but most packages are created and maintained by some of the best statisticians and developers in the world.\n\nMost packages can be installed with `install.packages(\"dplyr\")`, where the string between the quotation marks is the name of the package. 
Packages installed with `install.packages()` come from CRAN and must pass certain checks for performance and documentation. Popular packages on CRAN, like dplyr, have as much support, documentation, and quality as code in proprietary software packages like Stata or SAS, if not more.\n\nIt is possible, but less common, to install packages from places like GitHub. This is less secure and the functionality of the packages is more likely to change over time. `install.packages()` need only be run once per package version per machine and should rarely be included in .R scripts.\n\nPackages are loaded once per R session with the function `library()`. It is a good idea to include `library(package-name)` at the top of scripts for each package used in the script. This way it is obvious at the top of the script which packages are installed and loaded.\n\n**Note:** `install.packages()` uses quoted package names and `library()` uses unquoted package names.\n\nFor practice, submit the following three lines of code to install `RXKCD`, load `library(RXKCD)`, and get a random [XKCD comic](https://www.xkcd.com/).\n\n```{r xkcd, eval=FALSE}\ninstall.packages(\"RXKCD\")\nlibrary(RXKCD)\ngetXKCD(\"random\")\n```\n\n```{r xkcd run, echo=FALSE}\nlibrary(RXKCD)\n# assignment to hide text output\ncomic <- getXKCD(539)\n```\n\nPackages are frequently updated, especially around the time R versions change. The easiest way to update packages is Tools \\> Check for Package Updates in RStudio.\n\nOccasionally, two loaded packages will have functions with identical names. Any conflicts will be announced when loading packages. See how `filter()` and `lag()` from `library(tidyverse)` and `library(stats)` conflict:\n\n![](intro-to-r/images/load-tidyverse.png) In this case, the tidyverse functions are usually favored. If there is ever a conflict or any doubt about which function is used, use the package name and `::` to directly call the function. For example, `dplyr::select(apples)`. `::` can also be used to call a function without loading the entire package.\n\n### CRAN\n\nThe [Comprehensive R Archive Network](https://cran.r-project.org/index.html) (CRAN) contains almost 12,000 packages contributed over the last two decades by a range of developers. New packages are added to CRAN almost every day.\n\nCRAN enables R to have all of the benefits of open-source development and the security and predictability of proprietary statistical packages like SAS and Stata. CRAN weds the benefits of broad-based, real-time package development with certain [standards](https://cran.r-project.org/index.html) for functionality and documentation. Methods and tools often make it to R before SAS or Stata - if they ever make it to SAS or Stata at all - and CRAN's standards generally exceed those of Python and other open-source languages. (See: [Malicious Libraries Found on Python Package Index (PyPI)](https://www.blog.pythonlibrary.org/2017/09/15/malicious-libraries-found-on-python-package-index-pypi/))\n\nBecause of CRAN's long history and R's place in the statistics community, CRAN contains many methods that can't be accessed, much less duplicated, using proprietary software. 
In addition to being useful now, this also ensures that R isn't a temporary fad and will have staying power because of the challenge of replicating or besting CRAN.\n\nR's extensible design is important, but most tasks can be accomplished with a handful of packages:\n\n- `ggplot2` data visualization\n- `dplyr` data management\n- `tidyr` data tidying\n- `readr` data import\n- `purrr` functional programming\n- `tibble` data frames\n- `hms` times\n- `stringr` character strings\n- `lubridate` dates/times\n- `forcats` factors\n- `DBI` databases\n- `haven` SPSS, SAS, and Stata files\n- `readxl` .xls and .xlsx files\n- `modelr` simple modeling within a pipeline\n- `broom` turning models into tidy data\n- `tidyverse` loads all of the packages listed up to this point; see Hadley Wickham's \"[tidyverse](https://blog.rstudio.org/2016/09/15/tidyverse-1-0-0/)\"\n\n## Organizing Analyses\n\n------------------------------------------------------------------------\n\nThis section outlines how to organize an analysis to get the most out of R. Newer users may want to skip this section and work through [R for Data Science](http://r4ds.had.co.nz/) until they understand `library(readr)`, `library(dplyr)`, and `library(ggplot2)`.\n\n### Projects\n\nOrganizing scripts, files, and data is one of the most important steps to creating a clear and reproducible analysis.\n\nR Projects (proper noun) are the best way to organize an analysis. They have several advantages:\n\n- They make it possible to concurrently run multiple RStudio sessions.\n- They allow for project-specific RStudio settings.\n- They integrate well with Git version control.\n- They are the \"node\" of relative file paths. (more on this in a second)\n\nBefore setting up an R Project, go to Tools \\> Global Options and uncheck \"Restore most recently opened project at startup\".\n\n![](intro-to-r/images/restore.png){width=\"50%\"}\n\nEvery new analysis in R should start with an R Project. First, create a directory that holds all data, scripts, and files for the analysis. Storing files and data in sub-directories is encouraged. For example, data can be stored in a folder called data/.\n\nNext, click \"New Project...\" in the top right corner.\n\n![](intro-to-r/images/new-project.png){width=\"50%\"}\n\nWhen prompted, turn your recently created \"Existing Directory\" into a project.\n\n![](intro-to-r/images/existing-directory.png){width=\"50%\"}\n\nUpon completion, the name of the R Project should now be displayed in the top right corner of RStudio where it previously displayed \"Project: (None)\". Once opened, .Rproj files do not need to be saved. Double-clicking .Rproj files in the directory is now the best way to open RStudio. This will allow for the concurrent use of multiple R sessions and ensure the portability of file paths. Once an RStudio project is open, scripts can be opened by double-clicking individual files in the computer directory or clicking files in the \"Files\" tab in the top right of RStudio.\n\nR Projects make code highly portable because of the way they handle file paths. Here are a few rules:\n\n#### Filepaths\n\nNever use `\\` in file paths in R. `\\` is an escape character in R and will complicate an analysis. Fortunately, RStudio understands `/` in file paths regardless of operating system.\n\nNever use `setwd()` in R. It is unnecessary, it makes code unreproducible across machines, and it is rude to collaborators. R Projects create a better framework for file paths. 
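The difference in practice looks like this (a sketch, assuming `library(readr)` is loaded and the `starwars-analysis` layout described next):\n\n```{r relative paths, eval=FALSE}\n# fragile: this absolute path only works on one machine\n# setwd(\"H:/ibp/analyses/starwars-analysis\")\n\n# portable: a relative path from the directory that holds the .Rproj\njedi <- read_csv(\"data/jedi.csv\")\n```\n\n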
Simply treat the directory where the R Project lives as the working directory and directories inside of that directory as sub-directories.\n\nFor example, say there's a `.Rproj` called `starwars-analysis.Rproj` in a directory called `starwars-analysis`. If there is a .csv in that folder called `jedi.csv`, the file can be loaded with `read_csv(\"jedi.csv\")` instead of `read_csv(\"H:/ibp/analyses/starwars-analysis/jedi.csv\")`. If that file is in a sub-directory of `starwars-analysis` called `data`, it can be loaded with `read_csv(\"data/jedi.csv\")`. The same concepts hold for writing data and graphics.\n\nThis simplifies code and makes it portable because all relative filepaths will be identical on all computers. To share an analysis, simply send the entire directory to a collaborator or share it with GitHub.\n\nHere's an example directory (a hypothetical layout; the script names are illustrative):\n\n
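```\nstarwars-analysis/\n    starwars-analysis.Rproj\n    README.md\n    data/\n        jedi.csv\n    scripts/\n        01_read-data.R\n        02_analysis.R\n```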
\n\nIt isn't always possible to avoid absolute file paths because of the many different ways the Urban Institute stores data. Avoid absolute paths when possible and be deliberate about where analyses live in relation to where data live.\n\nFinally, it's good practice to include a README in the same directory as the .Rproj. The README should outline the purpose of the project and its directories, and can include information about how to contribute, licenses, dependencies, and acknowledgements. This [GitHub page](https://gist.github.com/PurpleBooth/109311bb0361f32d87a2) is a good README template.\n\nCheck out [R for Data Science](http://r4ds.had.co.nz/workflow-projects.html) by Hadley Wickham and Garrett Grolemund for a more thorough explanation of this workflow. Jenny Bryan also has a good [blogpost](https://www.tidyverse.org/articles/2017/12/workflow-vs-script/) about avoiding `setwd()`.\n\n### Naming Conventions\n\nNaming functions, objects, variables, files, and scripts is one of the toughest and least-taught dimensions of computer programming. Better names can add clarity to code, save time and effort, and minimize errors caused by accidentally overwriting existing functions or other objects.\n\n> There are only two hard things in Computer Science: cache invalidation and naming things. \\~ [Phil Karlton](http://www.meerkat.com/2017/12/naming-things-hard/)\n\n#### Functions and Other Objects\n\nR is case-sensitive.\n\nObjects in R can be named anything - [even unicode characters](https://www.r-bloggers.com/rules-for-naming-objects-in-r/). But just because something *can* be named anything doesn't mean it should.\n\nMost functions and objects in R are lowerCamelCase, period.separated, or underscore_separated. As an individual or team, it's important to pick a style and stick with it, but as [this article](https://journal.r-project.org/archive/2012-2/RJournal_2012-2_Baaaath.pdf) from 2012 shows, there isn't much consistency across the R community. Hadley Wickham's tidyverse uses underscores, so expect to see some consolidation into this style.\n\nIn general, it's good practice to name functions with verbs and other objects with nouns.\n\nVariable and object names that start with numbers, have spaces, or use peculiar syntax require back-ticks.\n\n> select(urban, \\`R Users Group\\`)\n\n> urban\\$\\`R Users Group\\`\n\nFinally, it's possible to overwrite existing functions and other objects in R with the assignment operator. Don't give vectors or data frames the same names as existing functions and don't overwrite existing functions with custom functions.\n\n#### Files\n\nNaming conventions for scripts and files are probably the most overlooked dimension in programming and analysis. The first three bullets from this section come from this [rich slide deck](http://www2.stat.duke.edu/~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf) by Jenny Bryan. This may seem pedantic, but picking a file naming convention now can save a bunch of time and headaches in the future.\n\n**1) Machine readable**\n\nCreate file names that are easily machine readable. Use all lower case letters and skip punctuation other than delimiters. Use underscores as characters for splitting the file name. For example, `stringr::str_split_fixed(\"2018-01-10_r-introduction_machine-readable-example_01.csv\", \"[_\\\\.]\", 5)` splits the file name on underscores and periods and returns date, project, file name, file number, and file type. 
This information can then be stored and sorted in a data frame.\n\n**2) Human readable**\n\nCreate file names that are human readable. The example from above is informative without any machine interpretation.\n\n**3) Plays well with default ordering**\n\nIt is often useful to include date or sequence numbers in script and file names. For example, include 2018-01-10 for data collected on January 10th, 2018, or include 3 for the third script in a sequence of five `.R` programs. Starting file names with the date or sequence numbers means files will show up in a logical order by default. Be sure to use the ISO 8601 standard for dates (YYYY-MM-DD).\n\n**4) Don't Use File Names for Version Control**\n\nVersion control with file names is unwieldy and usually results in names that are barely human readable and definitely not machine readable.\n\n> \"2018-01-10_r-introduction_machine-readable-example_01_v2_for-aaron_after-review_before-submission.R\"\n\nIterations usually don't iterate sensibly. For example, why were \"v1\" and \"v2\" abandoned for \"for-aaron\", \"after-review\", and \"before-submission\"? Furthermore, version control with file names is poor for concurrent work and merging.\n\nThe next section will outline the optimal tool for version control.\n\n### Version Control\n\nThe workflow outlined above integrates perfectly with version control like Git and distributed version control repository hosting services like GitHub.\n\nVersion control is a system for recording changes to files over time. Version control is built around repositories. In this case, the folder containing the `.Rproj` is the perfect directory to use as a repository. A handful of simple commands are used to track and commit changes to text files (.R, .Rmd, etc.) and data. This record is valuable for testing alternatives, communicating with others and your future self, and documenting progress on projects.\n\nGitHub is a distributed repository system built on top of Git. GitHub has a number of valuable tools for collaboration and project management. In particular, it makes concurrent collaboration on code simpler with branches and has a slick system for issues. Here are the [branches](https://github.com/UrbanInstitute/urban_R_theme/branches) and [issues](https://github.com/UrbanInstitute/urban_R_theme/issues) for the Urban Institute R Graphics Guide. It also has free web hosting for websites like the website you are reading right now. [GitHub has a quick guide that is a good place to start learning Git](https://try.github.io/levels/1/challenges/1).\n\nThe Urban Institute has a number of legacy models and code bases that span years and have been touched by scores of brilliant researchers. The future value of a record of all code changes and development is difficult to overstate.\n\n### Coding Style\n\n> \"Good coding style is like using correct punctuation. You can manage without it, but it sure makes things easier to read.\" \\~Hadley Wickham (2014)\n\ngood coding style is like using correct punctuation you can manage without it but it sure makes things easier to read\n\nThe details of a coding style are less important than consistently sticking to that style. 
Be flexible when working with collaborators so the style doesn't change inside an analysis.\n\nHere are three good sources for inspiration:\n\n- [Tidyverse Style Guide](http://style.tidyverse.org/)\n- [Google's R Style Guide](https://google.github.io/styleguide/Rguide.xml)\n- [Hadley Wickham's R Style Guide](http://adv-r.had.co.nz/Style.html)\n\n## Putting it All Together\n\n------------------------------------------------------------------------\n\nR can augment or replace traditional proprietary statistical packages like SAS or Stata with a few extra bells and whistles, but hopefully this guide and other resources show a fuller vision for developing reproducible, accurate, and collaborative analyses.[^1]\n\n[^1]: The language \"reproducible, accurate, and collaborative analyses\" comes from [Hilary S. Parker's talk](https://www.rstudio.com/resources/videos/opinionated-analysis-development/) at rstudio::conf 2017 about opinionated analysis development.\n\nThis research pipeline, to use the phrase from Roger Peng, Jeff Leek, and Brian Caffo, combines the best of traditional economic and social policy research, computer science/software development, and statistics.[^2] Here are the rules:\n\n[^2]: The basis for [this section](https://www.coursera.org/learn/reproducible-research/lecture/abevs/reproducible-research-concepts-and-ideas-part-2) comes from this Coursera talk by Roger Peng.\n\n#### 1) No steps in an analysis are done by hand and all steps are recorded with executable scripts.\n\nIt is common to use executable scripts to estimate a regression equation or to tabulate weighted summary statistics. But for some reason, other steps like file management, data munging, and visualization are often done \"by hand\". Good science demands that every step of an analysis is recorded - and if possible - with executable scripts.\n\nFortunately, it is possible to script most steps in R, from downloading data from the Internet and accessing APIs to visualizations and drafting manuscripts. This may be challenging at first, but it will save time and result in better research in the long run.\n\n#### 2) All code is entirely reproducible and portable.\n\nExecutable scripts are for communicating with other researchers and our future selves. Scripts lose value if they aren't portable and can't be reproduced in the future or by others. Recording every step with executable scripts is a start, but scripts aren't valuable if they require expensive proprietary software, or if researchers have to significantly alter scripts to run an analysis.\n\nOpen source software, like R, promotes accessibility, portability, and reproducibility. Also, be sure to avoid `setwd()` and use relative filepaths.\n\n#### 3) Local and collaborative version control is used and all repositories include all code and a README.\n\nUse local version control like Git and a distributed version control repository hosting service like GitHub to track changes and share analyses. The version control should include all scripts and meta information about the analysis in a README.\n\n#### 4) Raw data and tidy analytic data are stored in a collaborative location with a code book.\n\nMany raw data are already stored in collaborative locations like BLS.gov and don't need to be duplicated. Tidy analytic data, like the data used to estimate a regression equation, should be stored in a collaborative location. This is good practice, but is less essential if executable scripts are flawless and reproducible. 
Researcher-entered data and data from less-stable sources should be stored in raw and analytic forms.\n\nSmall data sets can be stored on GitHub without issue. Larger data sets should be stored in collaborative locations accessible by scripting languages. This is only possible for public data, and best-practices for private data are less established.\n\nSave codebooks for data sets as text files or PDFs in repositories. Creating codebooks for user-entered data or variables created in executable scripts is often worth the time.\n\n#### 5) Code review and issue tracking are used to improve accuracy and computational efficiency.\n\nGetting stronger programmers and/or methodologists to review code is valuable for limiting programming and analytic mistakes, improving computational efficiency, and learning.\n\n[GitHub issues](https://guides.github.com/features/issues/) is a powerful tool for managing, discussing, and collaborating on code.\n\n#### 6) Projects rely heavily on literate statistical programming and standard means of distribution for execution, validation, and publishing.\n\nLiterate statistical programming is the combination of natural language explanations for humans and executable code in one document. The idea was created by Donald Knuth and is embodied by R Markdown.\n\nR Markdown combines text chunks, code chunks, and output chunks in one script that can be \"knitted\" using `library(knitr)` to create PDFs, books, .html files, and websites like the website where this guide lives.\n\nThis workflow combines the analytic and narrative process in a tool that is flexible, scalable, reproducible, and less error-prone. R Markdown documents can be used for executing programs, validating models and analyses, and publishing. These documents can be submitted to many academic journals and shared easily with [GitHub pages](https://pages.github.com/).\n\n#### 7) Software versions and dependencies are recorded and all software is cited in publications.\n\n`sessionInfo()` reports the R version, locale, packages used, and other important information about an R session. `citation()` creates a text and BibTeX entry of the citation for R, and `citation(\"package-name\")` creates a text and BibTeX entry for an R package. `library(packrat)` (outlined [here](https://rstudio.github.io/packrat/)) is a tool for saving R dependencies.\n\n## Bibliography and References\n\n------------------------------------------------------------------------\n\nHadley Wickham (2017). tidyverse: Easily Install and Load the 'Tidyverse'. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\n\nHadley Wickham and Garrett Grolemund (2017). R For Data Science. http://r4ds.had.co.nz/\n\nHadley Wickham (2014). Advanced R. http://adv-r.had.co.nz/Style.html\n\nHilary S. Parker (2017). Opinionated Analysis Development. https://www.rstudio.com/resources/videos/opinionated-analysis-development/\n\nJenny Bryan (2017). Project-oriented workflow. https://www.tidyverse.org/articles/2017/12/workflow-vs-script/\n\nJenny Bryan (2015). Naming things. http://www2.stat.duke.edu/\\~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf\n\nJJ Allaire, Yihui Xie, Jonathan McPherson, Javier Luraschi, Kevin Ushey, Aron Atkins, Hadley Wickham, Joe Cheng and Winston Chang (2017). rmarkdown: Dynamic Documents for R. R package version 1.8. https://CRAN.R-project.org/package=rmarkdown\n\nJustin M. Shea (2017). wooldridge: 105 Data Sets from \"Introductory Econometrics: A Modern Approach\" by Jeffrey M. Wooldridge. R package version 1.2.0. 
https://CRAN.R-project.org/package=wooldridge\n\nRoger Peng. Reproducible Research Concepts and Ideas, Part 2. https://www.coursera.org/learn/reproducible-research/lecture/abevs/reproducible-research-concepts-and-ideas-part-2\n\nYihui Xie (2017). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.18.\n\n```{r session info}\nsessionInfo()\n```\n"}
We recommend R version 3.6.0 or higher. You can check what version of R you have installed by opening RStudio and submitting the following line of code to the console: `R.Version()$version.string`.\n\nIf you're working on a personal computer, you may not have R or RStudio installed. Follow the steps below to install both on your computer.\n\n### Updating/Installing R\n\n1) Visit https://cran.r-project.org/bin/windows/base/. The latest R version will be the downloadable link at the top. As of 1/1/2020, that R version is 3.6.2. Click on the link at the top and download the `R-x.x.x-win.exe` file.\n\n2) Open the `R-x.x.x-win.exe` file. Click next, accept all the defaults, and install R. After R has been installed, click the Finish button. You should not need admin privileges for this.\n\n3) Check that your version of R has been updated in RStudio. If RStudio is already open, first close it. Then open RStudio and resubmit `R.Version()$version.string`. You should see an updated version number printed out on the console.\n\n4) Test that R packages are loading as expected. Packages you already had installed should continue to work with newer versions of R. But in some cases, you may need to re-install the packages to work properly with new versions of R.\n\n### Updating/Installing RStudio\n\n1) Open RStudio and go to Help \> Check for Updates to see if RStudio is up-to-date.\n\n2) If it is out-of-date, download the [appropriate update](https://rstudio.com/products/rstudio/download/#download).\n\n3) Before you run the installer, contact IT at helpdesk\@urban.org for administrative approval as the program requires admin access.\n\n4) Run the installer and accept all defaults.\n\nMoving forward, RStudio will automatically and regularly update on Windows computers at the Urban Institute.\n\n## Learning R\n\n------------------------------------------------------------------------\n\n### What to Learn\n\nThere is often more than one way to accomplish a goal in R because of the language's flexibility. At first, this flexibility can be overwhelming. That's why it is useful to pick and master one set of tools in R before branching out into the rest of the language.\n\nFortunately, [Hadley Wickham's tidyverse](https://www.tidyverse.org/) offers a comprehensive set of tools for data analysis that are good for both beginners and experts. The tidyverse is self-described as \"an opinionated collection of R packages designed for data science.\" The tidyverse consists of almost two dozen clear and concise tools for every part of an analysis workflow. 
At first, focus on the function `read_csv()` for loading data, the package `dplyr` for manipulating data, and the package `ggplot2` for plotting.\n\nHere's a quick example that reads a .csv, filters the data, and creates a publishable column plot in just fifteen lines of code:\n\n```{r quick example}\n# load packages and source the Urban Institute ggplot2 theme\nlibrary(tidyverse) # contains read_csv, library(dplyr), and library(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\n# read bankdata.csv\nbank <- read_csv(\"intro-to-r/data/bankdata.csv\")\n\nbank_subset <- bank %>%\n\t# filter to observations of unmarried mothers less than age 30\n\tfilter(married == \"NO\" & age < 30) %>%\n\t# drop all variables except children and income\n\tselect(children, income)\n\n# plot!\nbank_subset %>%\n\tggplot(mapping = aes(x = children, y = income)) +\n\tgeom_bar(stat = \"summary\", fun.y = \"mean\") +\n\tscale_y_continuous(expand = c(0, 0), labels = scales::dollar) +\n\tlabs(title = \"Mean income\",\n\t\t\t subtitle = \"Unmarried mothers less than age 30\",\n\t\t\t caption = \"Urban Institute analysis of bank data\",\n\t\t\t x = \"Number of children\",\n\t\t\t y = \"Income\")\n```\n\n### Resources for Learning\n\n*R for Data Science* by Hadley Wickham and Garrett Grolemund is the best print resource for learning R and the tidyverse. The book is available [online](http://r4ds.had.co.nz/index.html) for free and *begins* with visualization, which is motivating and practical. *R for Data Science* contains dozens of worthwhile exercises but no solutions guide. Please check your solutions against the [Urban Institute r4ds solutions guide on GitHub](https://github.com/UI-Research/r4ds-exercises.git) and please contribute if the exercise isn't already in the guide!\n\nRStudio publishes a number of cheat sheets that cover the tidyverse. The main cheat sheets can be accessed in RStudio at Help \> Cheat Sheets. Additional cheat sheets are accessible on the [RStudio website](https://www.rstudio.com/resources/cheatsheets/).\n\nDavid Robinson, a data scientist from DataCamp, has a [video course](https://www.datacamp.com/instructors/drobinson) about the tidyverse. Few people know as much about R and communicate as effectively as David Robinson.\n\n*Advanced R* by Hadley Wickham is a good resource for new R users who have experience with other programming languages and computer science. It is available [online](http://adv-r.had.co.nz/) for free.\n\n### Library\n\nIt's easy to feel overwhelmed by the frenetic development of the extended R universe. 
Books are an invaluable resource for slowing down and focusing on fully-formed ideas.\n\nAaron Williams (awilliams\@urban.org) has a number of books that can be checked out:\n\n- [The Art of R Programming](https://www.nostarch.com/artofr.htm)\n- [ggplot2](http://www.springer.com/us/book/9780387981413)\n- [Efficient R Programming](http://shop.oreilly.com/product/0636920047995.do) ([Online!](https://csgillespie.github.io/efficientR/))\n- [Text Mining with R](http://shop.oreilly.com/product/0636920067153.do) ([Online!](https://www.tidytextmining.com/))\n- [Reasoning with Data](https://www.guilford.com/books/Reasoning-with-Data/Jeffrey-Stanton/9781462530267/reviews)\n- [Practical Statistics for Data Scientists](http://shop.oreilly.com/product/0636920048992.do)\n\n### Built-in Data Sets\n\nR has many built-in data sets that are useful for practice, and even more data sets are accessible through R packages.\n\nSubmitting `data()` shows a list of all available data sets. `cars` and `iris` are two classic sets that are used in many examples.\n\n`library(tidyverse)` loads many more \"tidy\" data sets including `diamonds` and `starwars`.\n\n```{r tidyverse}\nlibrary(tidyverse)\nstarwars %>%\n\tcount(species) %>%\n\tarrange(desc(n)) %>%\n\thead()\n```\n\n`library(dslabs)` by [Rafael Irizarry](https://simplystatistics.org/2018/01/22/the-dslabs-package-provides-datasets-for-teaching-data-science/) includes varied, intentionally imperfect data sets that are useful for practice. Students of econometrics will enjoy `library(wooldridge)`. It loads 105 data sets from *Introductory Econometrics: A Modern Approach* by Jeffrey Wooldridge. Now you can practice estimating your hedonic pricing models in R!\n\n```{r psid}\nlibrary(wooldridge)\nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nas_tibble(hprice1) %>%\n\tggplot(aes(x = sqrft, y = price)) +\n\tgeom_point() +\n\tscale_y_continuous(expand = c(0, 0), lim = c(0, 800)) +\n\tlabs(title = '\"hprice1\" data from Wooldridge')\n```\n\n### Getting Help\n\nEven the best R programmers spend hours each week searching the Internet for answers. Here are some of the best ways to find answers:\n\nSubmit `?` and any function name without parentheses (ex. `?mean`) to see the function documentation in RStudio.\n\nWhen Googling, set the search range to the last year to avoid out-of-date solutions and to focus on up-to-date practices.\n\n[Stack Overflow](https://stackoverflow.com/) contains numerous solutions. Add `[r]` to any search to limit results to R. If a problem is particularly perplexing, it is simple to submit questions. Exercise caution when submitting questions because the Stack Overflow community has strict norms about questions and loose norms about respecting novices.\n\n[RStudio Community](https://community.rstudio.com/) is a newer forum for R Users. It has a smaller back catalog than Stack Overflow, but its users are friendlier.\n\nFinally, Aaron Williams (awilliams\@urban.org) from IBP and Amy Rogin (arogin\@urban.org) from METRO are available to solve problems, offer guidance, and share R enthusiasm.\n\n### CRAN Task Views\n\nR has sub-communities, frameworks, and tools focused on different subject-matter and methodological areas. 
[CRAN Task Views](https://cran.r-project.org/web/views/) is invaluable for understanding these communities and finding the best frameworks and tools for different disciplines in R.\n\nCRAN Task Views has 35 pages focused on subcategories of R ranging from [econometrics](https://cran.r-project.org/web/views/Econometrics.html) to natural language processing. Each page is maintained by a subject-matter expert and contains methods, packages, books, and mailing lists that are useful for researchers.\n\nThe econometrics page alone contains detailed information on basic linear regression, microeconometrics, instrumental variables, panel data models, further regression models, time series data and models, data sets, CRAN packages, articles, books, and more.\n\n## R Code\n\n------------------------------------------------------------------------\n\nIt's time to start writing R code. Remember, most R users never open R and exclusively use RStudio. Go ahead and open R once to admire its dated text editor. Then, close R and never directly open it again. Now, open RStudio.\n\n### Submitting Code\n\nRStudio has four main panels: code editor (top left by default), R console (bottom left by default), environment and history (top right by default), and files, plots, packages, help, and viewer pane (bottom right by default).\n\nThere are two main ways to submit code:\n\n1) Type code to the right of ![](intro-to-r/images/code-console.png) in the R console and hit enter. **Note:** R won't create a long-term record of this code.\n2) Click ![](intro-to-r/images/new-script.png) in the top left to create a new R script in the code editor panel. Type code in the script. Highlight desired code and either click Run in the top right of the code editor panel or type Ctrl/Cmd + Enter to run the code. Scripts can be saved, so they are the best way to write code that will be used again.\n\nFor practice, submit `state.name` in the R console to create a vector with all fifty state names (sorry statehood advocates, no Washington, D.C.). Next, create a script, paste `state.name`, highlight the text, and click run at the top right of the code editor. You should get the same output both times.\n\n```{r state names}\nstate.name\n```\n\n### Syntax\n\nThere are five fundamental pieces of syntax in R.\n\n- `<-` is the assignment operator. An object created on the right side of an assignment operator is assigned to a name on the left side of an assignment operator. Assignment operators are important for saving the consequences of operations and functions. Operations without assignment operators will typically be printed to the console but not saved.\n- `#` begins a comment. Comments are useful for explaining decisions in scripts. As Hadley Wickham notes in the [tidyverse style guide](http://style.tidyverse.org/), 'In code, use comments to explain the \"why\" not the \"what\" or \"how\".'\n- `c()` combines similar vectors into larger vectors. For example, `c(1, 2, 3)` is a numeric vector of length three made up of three numeric vectors of length one.\n- `?` in front of any function name without parentheses returns function documentation. For example, `?mean`.\n- `%>%` from `library(magrittr)` and `library(tidyverse)` is the \"pipe operator\". It passes the output from one function to another function. This is useful because strings of operations can be \"piped\" together instead of each individual operation needing to be assigned to an object (see the example below).\n\n
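Here is a minimal sketch of the pipe in action (the numbers are purely illustrative):\n\n```{r pipe example, eval = FALSE}\n# without the pipe, function calls are nested inside each other\nround(mean(c(1.5, 2.5, 3.6)))\n\n# with the pipe, the same operations read left to right, step by step\nc(1.5, 2.5, 3.6) %>%\n\tmean() %>%\n\tround()\n```\n\n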
### Vectors\n\nVectors are the fundamental piece of data in R. R has six vector types: logical, integer, double, character, complex, and raw. Vector types can't be mixed within a single vector; combining different types coerces everything to one type. You can check the type of a vector with `typeof()` and its length with `length()`.\n\n### Data frames\n\nData frames are combinations of equal-length vectors. Data analysis in R is built around data frames. As a guiding principle when working with data frames, aim for \"tidy data\" whenever possible. A tidy data frame means that:\n\n1. Each variable has its own column.\n\n2. Each observation has its own row.\n\n3. Each value has its own cell.\n\n[![Source: R for Data Science](intro-to-r/images/tidy-data.png)](https://r4ds.had.co.nz/tidy-data.html)\n\nHaving data in a tidy format allows R's vectorized nature to shine, and many of the `tidyverse` functions are designed for tidy data.\n\n### Missing values\n\nR stores missing values as `NA`. A single `NA` in a calculation can cause the entire result to return as `NA`.\n\n```{r}\nsum(c(2, 2, NA))\n```\n\nThe contagiousness of `NA` is good: it makes users explicitly acknowledge dropping missing values with `na.rm = TRUE`.\n\n```{r}\nsum(c(2, 2, NA), na.rm = TRUE)\n```\n\n`== NA` does not test for missing values. Instead, use `is.na()`. Because `is.na()` returns `TRUE`/`FALSE` values, and because booleans can be used in math (`TRUE` counts as 1 and `FALSE` as 0), `sum(is.na(x))` counts the missing values in a vector `x`. Relatedly, `complete.cases()` returns `TRUE` for each row of a data frame with no missing values.\n\n### Functions\n\nFunctions in R are collections of code that perform actions when called. R contains hundreds of functions, and thousands more can be accessed through packages.\n\nMost functions take arguments. For example, the function `mean()` has arguments `x`, `trim`, `na.rm`, and `...`. The first argument in most functions, in this case `x`, is an input object. Arguments can be passed to functions by name or position. `mean(c(1, 2, 3))` is equivalent to `mean(x = c(1, 2, 3))`.\n\nNotice how the other three arguments were skipped. Most arguments in functions have default values. The best way to see default values is to submit the function name with a question mark, like `?mean`. In this case, `trim = 0`, `na.rm = FALSE`, and no further arguments were passed through with `...`.\n\nIn the previous example, the `c()` function was nested inside of the `mean()` function. It is also possible to assign a vector of 1, 2, and 3 to a name and pass the name to the mean function.\n\n```{r mean, eval = FALSE}\napples <- c(1, 2, 3)\n\nmean(apples)\n```\n\nR is a [functional programming language](http://adv-r.had.co.nz/Functional-programming.html). In addition to having many pre-made functions like `mean()`, R has powerful tools for creating and manipulating custom functions. This is useful because:\n\n- It avoids tedious and error-prone copying-and-pasting and makes iterating processes simple;\n- It is a powerful way to organize sets of operations;\n- It is a standardized way to save code for later and to share operations with others.\n\nThis last bullet is key to the package system in R.\n\n
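Here is a minimal sketch of creating and calling a custom function (the function name and rescaling task are only illustrative):\n\n```{r rescale function, eval = FALSE}\n# rescale01() rescales a numeric vector so it ranges from 0 to 1\nrescale01 <- function(x, na.rm = TRUE) {\n\trng <- range(x, na.rm = na.rm)\n\t(x - rng[1]) / (rng[2] - rng[1])\n}\n\nrescale01(c(0, 5, 10))\n```\n\n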
### Packages\n\nOpening RStudio automatically loads \"base R\", a fundamental collection of code and functions that handles simple operations like math and system management. R can be extended with collections of code and functions developed by the R community called packages. This sounds wild, but most packages are created and maintained by some of the best statisticians and developers in the world.\n\nMost packages can be installed with `install.packages(\"dplyr\")`, where the string between the quotation marks is the name of the package. Packages installed with `install.packages()` come from CRAN and must pass certain checks for performance and documentation. Popular packages on CRAN, like `dplyr`, have support, standards, and quality that match, if not exceed, code in proprietary software packages like Stata or SAS.\n\nIt is possible, but less common, to install packages from places like GitHub. This is less secure, and the functionality of the packages is more likely to change over time. `install.packages()` need only be run once per package version per machine and should rarely be included in .R scripts.\n\nPackages are loaded once per R session with the function `library()`. It is a good idea to include `library(package-name)` at the top of a script for each package used in the script. This way, it is obvious at the top of the script which packages are loaded.\n\n**Note:** `install.packages()` uses quoted package names and `library()` uses unquoted package names.\n\nFor practice, submit the following three lines of code to install `RXKCD`, load `library(RXKCD)`, and get a random [XKCD comic](https://www.xkcd.com/).\n\n```{r xkcd, eval=FALSE}\ninstall.packages(\"RXKCD\")\nlibrary(RXKCD)\ngetXKCD(\"random\")\n```\n\n```{r xkcd run, echo=FALSE}\nlibrary(RXKCD)\n# assignment to hide text output\ncomic <- getXKCD(539)\n```\n\nPackages are frequently updated, especially around the time R versions change. The easiest way to update packages is Tools \> Check for Package Updates in RStudio.\n\nOccasionally, two loaded packages will have functions with identical names. Any conflicts will be announced when loading packages. See how `filter()` and `lag()` from `library(tidyverse)` and `library(stats)` conflict:\n\n![](intro-to-r/images/load-tidyverse.png) In this case, the tidyverse functions are usually favored. If there is ever a conflict or any doubt about which function is used, use the package name and `::` to directly call the function. For example, `dplyr::select(apples)`. `::` can also be used to call a function without loading the entire package.\n\n
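Here is a minimal sketch of both uses of `::` (assuming `library(tidyverse)` is loaded and using the built-in `mtcars` data):\n\n```{r namespace example, eval = FALSE}\n# explicitly call dplyr's filter() even though stats::filter() also exists\nmtcars %>%\n\tdplyr::filter(cyl == 6)\n\n# call one function from a package without loading the entire package\nlubridate::ymd(\"2018-01-10\")\n```\n\n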
### CRAN\n\nThe [Comprehensive R Archive Network](https://cran.r-project.org/index.html) (CRAN) contains almost 12,000 packages contributed over the last two decades by a range of developers. New packages are added to CRAN almost every day.\n\nCRAN enables R to have all of the benefits of open-source development and the security and predictability of proprietary statistical packages like SAS and Stata. CRAN weds the benefits of broad-based, real-time package development with certain [standards](https://cran.r-project.org/index.html) for functionality and documentation. New methods and tools typically make it to R before SAS or Stata, if they ever make it there at all, and CRAN's standards generally exceed those of Python and other open-source languages. (See: [Malicious Libraries Found on Python Package Index (PyPI)](https://www.blog.pythonlibrary.org/2017/09/15/malicious-libraries-found-on-python-package-index-pypi/))\n\nBecause of CRAN's long history and R's place in the statistics community, CRAN contains many methods that can't be accessed, much less duplicated, using proprietary software. In addition to being useful now, this also ensures that R isn't a temporary fad: the challenge of replicating or besting CRAN gives R staying power.\n\nR's extensible design is important, but most tasks can be accomplished with a handful of packages:\n\n- `ggplot2` data visualization\n- `dplyr` data management\n- `tidyr` data tidying\n- `readr` data import\n- `purrr` functional programming\n- `tibble` data frames\n- `hms` times\n- `stringr` character strings\n- `lubridate` dates/times\n- `forcats` factors\n- `DBI` databases\n- `haven` SPSS, SAS, and Stata files\n- `readxl` .xls and .xlsx files\n- `modelr` simple modeling within a pipeline\n- `broom` turning models into tidy data\n- `tidyverse` loads all of the packages listed up to this point; see Hadley Wickham's \"[tidyverse](https://blog.rstudio.org/2016/09/15/tidyverse-1-0-0/)\"\n\n## Organizing Analyses\n\n------------------------------------------------------------------------\n\nThis section outlines how to organize an analysis to get the most out of R. Newer users may want to skip this section and work through [R for Data Science](http://r4ds.had.co.nz/) until they understand `library(readr)`, `library(dplyr)`, and `library(ggplot2)`.\n\n### Projects\n\nOrganizing scripts, files, and data is one of the most important steps to creating a clear and reproducible analysis.\n\nR Projects, proper noun, are the best way to organize an analysis. They have several advantages:\n\n- They make it possible to concurrently run multiple RStudio sessions.\n- They allow for project-specific RStudio settings.\n- They integrate well with Git version control.\n- They are the \"node\" of relative file paths. (more on this below)\n\nBefore setting up an R Project, go to Tools \> Global Options and uncheck \"Restore most recently opened project at startup\".\n\n![](intro-to-r/images/restore.png){width=\"50%\"}\n\nEvery new analysis in R should start with an R Project. First, create a directory that holds all data, scripts, and files for the analysis. Storing files and data in sub-directories is encouraged. For example, data can be stored in a folder called data/.\n\nNext, click \"New Project...\" in the top right corner.\n\n![](intro-to-r/images/new-project.png){width=\"50%\"}\n\nWhen prompted, turn your recently created \"Existing Directory\" into a project.\n\n![](intro-to-r/images/existing-directory.png){width=\"50%\"}\n\nUpon completion, the name of the R Project should now be displayed in the top right corner of RStudio where it previously displayed \"Project: (None)\". Once opened, .Rproj files do not need to be saved. Double-clicking .Rproj files in the directory is now the best way to open RStudio. This will allow for the concurrent use of multiple R sessions and ensure the portability of file paths. Once an RStudio project is open, scripts can be opened by double-clicking individual files in the computer directory or clicking files in the \"Files\" tab in the bottom right of RStudio.\n\nR Projects make code highly portable because of the way they handle file paths. Here are a few rules:\n\n#### Filepaths\n\nNever use `\\` in file paths in R. `\\` is an escape character in R and will complicate an analysis. Fortunately, RStudio understands `/` in file paths regardless of operating system.\n\nNever use `setwd()` in R. It is unnecessary, it makes code unreproducible across machines, and it is rude to collaborators. R Projects create a better framework for file paths. 
Simply treat the directory where the R Project lives as the working directory and directories inside of that directory as sub-directories.\n\nFor example, say there's a `.Rproj` called `starwars-analysis.Rproj` in a directory called `starwars-analysis`. If there is a .csv in that folder called `jedi.csv`, the file can be loaded with `read_csv(\"jedi.csv\")` instead of `read_csv(\"H:/ibp/analyses/starwars-analysis/jedi.csv\")`. If that file is in a sub-directory of `starwars-analysis` called `data`, it can be loaded with `read_csv(\"data/jedi.csv\")`. The same concepts hold for writing data and graphics.\n\n
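Here is a minimal sketch of that workflow (the `results/` sub-directory is purely illustrative):\n\n```{r relative paths, eval = FALSE}\nlibrary(tidyverse)\n\n# with starwars-analysis.Rproj open, paths are relative to the project root\njedi <- read_csv(\"data/jedi.csv\")\n\n# writing output works the same way\nwrite_csv(jedi, \"results/jedi-summary.csv\")\n```\n\n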
This simplifies code and makes it portable because all relative filepaths will be identical on all computers. To share an analysis, simply send the entire directory to a collaborator or share it with GitHub.\n\nIt isn't always possible to avoid absolute file paths because of the many different ways the Urban Institute stores data. Avoid absolute paths when possible and be deliberate about where analyses live in relation to where data live.\n\nFinally, it's good practice to include a README in the same directory as the .Rproj. The README should outline the purpose of the project and its directories, and it can include information about how to contribute, licenses, dependencies, and acknowledgements. This [GitHub page](https://gist.github.com/PurpleBooth/109311bb0361f32d87a2) is a good README template.\n\nCheck out [R for Data Science](http://r4ds.had.co.nz/workflow-projects.html) by Hadley Wickham and Garrett Grolemund for a more thorough explanation of this workflow. Jenny Bryan also has a good [blogpost](https://www.tidyverse.org/articles/2017/12/workflow-vs-script/) about avoiding `setwd()`.\n\n### Naming Conventions\n\nNaming functions, objects, variables, files, and scripts is one of the toughest and least-taught dimensions of computer programming. Better names can add clarity to code, save time and effort, and minimize errors caused by accidentally overwriting existing functions or other objects.\n\n> There are only two hard things in Computer Science: cache invalidation and naming things. \~ [Phil Karlton](http://www.meerkat.com/2017/12/naming-things-hard/)\n\n#### Functions and Other Objects\n\nR is case-sensitive.\n\nObjects in R can be named anything - [even unicode characters](https://www.r-bloggers.com/rules-for-naming-objects-in-r/). But just because something *can* be named anything doesn't mean it should be.\n\nMost functions and objects in R are lowerCamelCase, period.separated, or underscore_separated. As an individual or team, it's important to pick a style and stick with it, but as [this article](https://journal.r-project.org/archive/2012-2/RJournal_2012-2_Baaaath.pdf) from 2012 shows, there isn't much consistency across the R community. Hadley Wickham's tidyverse uses underscores, so expect to see some consolidation into this style.\n\nIn general, it's good practice to name functions with verbs and other objects with nouns.\n\nVariable and object names that start with numbers, have spaces, or use peculiar syntax require back-ticks.\n\n> select(urban, \`R Users Group\`)\n\n> urban\$\`R Users Group\`\n\nFinally, it's possible to overwrite existing functions and other objects in R with the assignment operator. Don't give vectors or data frames the same names as existing functions and don't overwrite existing functions with custom functions.\n\n#### Files\n\nNaming conventions for scripts and files are probably the most overlooked dimension in programming and analysis. The first three points in this section come from this [rich slide deck](http://www2.stat.duke.edu/~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf) by Jenny Bryan. This may seem pedantic, but picking a file naming convention now can save a bunch of time and headaches in the future.\n\n**1) Machine readable**\n\nCreate file names that are easily machine readable. Use all lower case letters and skip punctuation other than delimiters. Use underscores as characters for splitting the file name. For example, `stringr::str_split_fixed(\"2018-01-10_r-introduction_machine-readable-example_01.csv\", \"[_\\\.]\", 5)` splits the file name on underscores and periods and returns date, project, file name, file number, and file type. This information can then be stored and sorted in a data frame.\n\n
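Here is that same call as a runnable sketch:\n\n```{r split file names, eval = FALSE}\nlibrary(stringr)\n\nstr_split_fixed(\n\t\"2018-01-10_r-introduction_machine-readable-example_01.csv\",\n\t\"[_\\\.]\",\n\t5\n)\n```\n\n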
**2) Human readable**\n\nCreate file names that are human readable. The example from above is informative without any machine interpretation.\n\n**3) Plays well with default ordering**\n\nIt is often useful to include dates or sequence numbers in script and file names. For example, include 2018-01-10 for data collected on January 10th, 2018, or include 3 for the third script in a sequence of five `.R` programs. Starting file names with the date or sequence number means files will show up in a logical order by default. Be sure to use the ISO 8601 standard for dates (YYYY-MM-DD).\n\n**4) Don't Use File Names for Version Control**\n\nVersion control with file names is unwieldy and usually results in names that are barely human readable and definitely not machine readable.\n\n> \"2018-01-10_r-introduction_machine-readable-example_01_v2_for-aaron_after-review_before-submission.R\"\n\nIterations usually don't iterate sensibly. For example, why were \"v1\" and \"v2\" abandoned for \"for-aaron\", \"after-review\", and \"before-submission\"? Furthermore, version control with file names is poor for concurrent work and merging.\n\nThe next section outlines the optimal tool for version control.\n\n### Version Control\n\nThe workflow outlined above integrates perfectly with version control systems like Git and distributed version control repository hosting services like GitHub.\n\nVersion control is a system for recording changes to files over time. Version control is built around repositories. In this case, the folder containing the `.Rproj` is the perfect directory to use as a repository. A handful of simple commands are used to track and commit changes to text files (.R, .Rmd, etc.) and data. This record is valuable for testing alternatives, communicating with others and your future self, and documenting progress on projects.\n\nGitHub is a distributed repository system built on top of Git. GitHub has a number of valuable tools for collaboration and project management. In particular, it makes concurrent collaboration on code simpler with branches and has a slick system for issues. Here are the [branches](https://github.com/UrbanInstitute/urban_R_theme/branches) and [issues](https://github.com/UrbanInstitute/urban_R_theme/issues) for the Urban Institute R Graphics Guide. It also has free web hosting for websites like the website you are reading right now. [GitHub has a quick guide that is a good place to start learning Git](https://try.github.io/levels/1/challenges/1).\n\nThe Urban Institute has a number of legacy models and code bases that span years and have been touched by scores of brilliant researchers. The future value of a record of all code changes and development is borderline incalculable.\n\n### Coding Style\n\n> \"Good coding style is like using correct punctuation. You can manage without it, but it sure makes things easier to read.\" \~Hadley Wickham (2014)\n\ngood coding style is like using correct punctuation you can manage without it but it sure makes things easier to read\n\nThe details of a coding style are less important than consistently sticking to that style. 
Be flexible when working with collaborators so the style doesn't change inside an analysis.\n\nHere are three good sources for inspiration:\n\n- [Tidyverse Style Guide](http://style.tidyverse.org/)\n- [Google's R Style Guide](https://google.github.io/styleguide/Rguide.xml)\n- [Hadley Wickham's R Style Guide](http://adv-r.had.co.nz/Style.html)\n\n## Putting it All Together\n\n------------------------------------------------------------------------\n\nR can augment or replace traditional proprietary statistical packages like SAS or Stata with a few extra bells and whistles, but hopefully this guide and other resources show a fuller vision for developing reproducible, accurate, and collaborative analyses.[^1]\n\n[^1]: The language \"reproducible, accurate, and collaborative analyses\" comes from [Hilary S. Parker's talk](https://www.rstudio.com/resources/videos/opinionated-analysis-development/) at rstudio::conf 2017 about opinionated analysis development.\n\nThis research pipeline, to use the phrase by Roger Peng, Jeff Leek, and Brian Caffo, combines the best of traditional economic and social policy research, computer science/software development, and statistics.[^2] Here are the rules:\n\n[^2]: The basis for [this section](https://www.coursera.org/learn/reproducible-research/lecture/abevs/reproducible-research-concepts-and-ideas-part-2) comes from this Coursera talk by Roger Peng.\n\n#### 1) No steps in an analysis are done by hand and all steps are recorded with executable scripts.\n\nIt is common to use executable scripts to estimate a regression equation or to tabulate weighted summary statistics. But for some reason, other steps like file management, data munging, and visualization are often done \"by hand\". Good science demands that every step of an analysis is recorded - and if possible - with executable scripts.\n\nFortunately, it is possible to script most steps in R, from downloading data from the Internet and accessing APIs to visualizations and drafting manuscripts. This may be challenging at first, but it will save time and result in better research in the long run.\n\n#### 2) All code is entirely reproducible and portable.\n\nExecutable scripts are for communicating with other researchers and our future selves. Scripts lose value if they aren't portable and can't be reproduced in the future or by others. Recording every step with executable scripts is a start, but scripts aren't valuable if they require expensive proprietary software, or if researchers have to significantly alter scripts to run an analysis.\n\nOpen-source software, like R, promotes accessibility, portability, and reproducibility. Also, be sure to avoid `setwd()` and use relative filepaths.\n\n#### 3) Local and collaborative version control is used and all repositories include all code and a README.\n\nUse local version control like Git and a distributed version control repository hosting service like GitHub to track changes and share analyses. The version control should include all scripts and meta information about the analysis in a README.\n\n#### 4) Raw data and tidy analytic data are stored in a collaborative location with a code book.\n\nMany raw data are already stored in collaborative locations like BLS.gov and don't need to be duplicated. Tidy analytic data, like the data used to estimate a regression equation, should be stored in a collaborative location. This is good practice, but is less essential if executable scripts are flawless and reproducible. 
Researcher-entered data and data from less-stable sources should be stored in raw and analytic forms.\n\nSmall data sets can be stored on GitHub without issue. Larger data sets should be stored in collaborative locations accessible by scripting languages. This is only possible for public data, and best practices for private data are less established.\n\nSave codebooks for data sets as text files or PDFs in repositories. Creating codebooks for user-entered data or variables created in executable scripts is often worth the time.\n\n#### 5) Code review and issue tracking are used to improve accuracy and computational efficiency.\n\nGetting stronger programmers and/or methodologists to review code is valuable for limiting programming and analytic mistakes, improving computational efficiency, and learning.\n\n[GitHub issues](https://guides.github.com/features/issues/) is a powerful tool for managing, discussing, and collaborating on code.\n\n#### 6) Projects rely heavily on literate statistical programming and standard means of distribution for execution, validation, and publishing.\n\nLiterate statistical programming is the combination of natural language explanations for humans and executable code in one document. The idea was created by Donald Knuth and is embodied by R Markdown.\n\nR Markdown combines text chunks, code chunks, and output chunks in one script that can be \"knitted\" using `library(knitr)` to create PDFs, books, .html files, and websites like the website where this guide lives.\n\nThis workflow combines the analytic and narrative process in a tool that is flexible, scalable, reproducible, and less error-prone. R Markdown documents can be used for executing programs, validating models and analyses, and publishing. These documents can be submitted to many academic journals and shared easily with [GitHub pages](https://pages.github.com/).\n\n#### 7) Software versions and dependencies are recorded and all software is cited in publications.\n\n`sessionInfo()` reports the R version, locale, packages used, and other important information about an R session. `citation()` creates a text and BibTeX entry of the citation for R, and `citation(\"package-name\")` creates a text and BibTeX entry for an R package. `library(packrat)` (outlined [here](https://rstudio.github.io/packrat/)) is a tool for saving R dependencies.\n\n## Bibliography and References\n\n------------------------------------------------------------------------\n\nHadley Wickham (2017). tidyverse: Easily Install and Load the 'Tidyverse'. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\n\nHadley Wickham and Garrett Grolemund (2017). R for Data Science. http://r4ds.had.co.nz/\n\nHadley Wickham (2014). Advanced R. http://adv-r.had.co.nz/Style.html\n\nHilary S. Parker (2017). Opinionated Analysis Development. https://www.rstudio.com/resources/videos/opinionated-analysis-development/\n\nJenny Bryan (2017). Project-oriented workflow. https://www.tidyverse.org/articles/2017/12/workflow-vs-script/\n\nJenny Bryan (2015). Naming things. http://www2.stat.duke.edu/\~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf\n\nJJ Allaire, Yihui Xie, Jonathan McPherson, Javier Luraschi, Kevin Ushey, Aron Atkins, Hadley Wickham, Joe Cheng and Winston Chang (2017). rmarkdown: Dynamic Documents for R. R package version 1.8. https://CRAN.R-project.org/package=rmarkdown\n\nJustin M. Shea (2017). wooldridge: 105 Data Sets from \"Introductory Econometrics: A Modern Approach\" by Jeffrey M. Wooldridge. R package version 1.2.0. 
https://CRAN.R-project.org/package=wooldridge\n\nRoger Peng. Reproducible Research Part 2. https://www.coursera.org/learn/reproducible-research/lecture/abevs/reproducible-research-concepts-and-ideas-part-2\n\nYihui Xie (2017). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.18.\n\n```{r session info}\nsessionInfo()\n```\n"},"formats":{"html":{"identifier":{"display-name":"HTML","target-format":"html","base-format":"html"},"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"inline-includes":false,"preserve-yaml":false,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[],"notebook-links":true,"format-links":true},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"intro-to-r.html"},"language":{"toc-title-document":"Table of contents","toc-title-website":"On this page","related-formats-title":"Other Formats","related-notebooks-title":"Notebooks","source-notebooks-prefix":"Source","section-title-abstract":"Abstract","section-title-appendices":"Appendices","section-title-footnotes":"Footnotes","section-title-references":"References","section-title-reuse":"Reuse","section-title-copyright":"Copyright","section-title-citation":"Citation","appendix-attribution-cite-as":"For attribution, please cite this work as:","appendix-attribution-bibtex":"BibTeX citation:","title-block-author-single":"Author","title-block-author-plural":"Authors","title-block-affiliation-single":"Affiliation","title-block-affiliation-plural":"Affiliations","title-block-published":"Published","title-block-modified":"Modified","callout-tip-title":"Tip","callout-note-title":"Note","callout-warning-title":"Warning","callout-important-title":"Important","callout-caution-title":"Caution","code-summary":"Code","code-tools-menu-caption":"Code","code-tools-show-all-code":"Show All Code","code-tools-hide-all-code":"Hide All Code","code-tools-view-source":"View Source","code-tools-source-code":"Source Code","code-line":"Line","code-lines":"Lines","copy-button-tooltip":"Copy to Clipboard","copy-button-tooltip-success":"Copied!","repo-action-links-edit":"Edit this page","repo-action-links-source":"View source","repo-action-links-issue":"Report an issue","back-to-top":"Back to top","search-no-results-text":"No results","search-matching-documents-text":"matching documents","search-copy-link-title":"Copy link to 
search","search-hide-matches-text":"Hide additional matches","search-more-match-text":"more match in this document","search-more-matches-text":"more matches in this document","search-clear-button-title":"Clear","search-detached-cancel-button-title":"Cancel","search-submit-button-title":"Submit","search-label":"Search","toggle-section":"Toggle section","toggle-sidebar":"Toggle sidebar navigation","toggle-dark-mode":"Toggle dark mode","toggle-reader-mode":"Toggle reader mode","toggle-navigation":"Toggle navigation","crossref-fig-title":"Figure","crossref-tbl-title":"Table","crossref-lst-title":"Listing","crossref-thm-title":"Theorem","crossref-lem-title":"Lemma","crossref-cor-title":"Corollary","crossref-prp-title":"Proposition","crossref-cnj-title":"Conjecture","crossref-def-title":"Definition","crossref-exm-title":"Example","crossref-exr-title":"Exercise","crossref-ch-prefix":"Chapter","crossref-apx-prefix":"Appendix","crossref-sec-prefix":"Section","crossref-eq-prefix":"Equation","crossref-lof-title":"List of Figures","crossref-lot-title":"List of Tables","crossref-lol-title":"List of Listings","environment-proof-title":"Proof","environment-remark-title":"Remark","environment-solution-title":"Solution","listing-page-order-by":"Order By","listing-page-order-by-default":"Default","listing-page-order-by-date-asc":"Oldest","listing-page-order-by-date-desc":"Newest","listing-page-order-by-number-desc":"High to Low","listing-page-order-by-number-asc":"Low to High","listing-page-field-date":"Date","listing-page-field-title":"Title","listing-page-field-description":"Description","listing-page-field-author":"Author","listing-page-field-filename":"File Name","listing-page-field-filemodified":"Modified","listing-page-field-subtitle":"Subtitle","listing-page-field-readingtime":"Reading Time","listing-page-field-categories":"Categories","listing-page-minutes-compact":"{0} min","listing-page-category-all":"All","listing-page-no-matches":"No matching items"},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.3.433"},"extensions":{"book":{"multiFile":true}}}},"projectFormats":["html"]}
\ No newline at end of file
diff --git a/.quarto/idx/mapping.qmd.json b/.quarto/idx/mapping.qmd.json
index d506e2e..9f5ae65 100644
--- a/.quarto/idx/mapping.qmd.json
+++ b/.quarto/idx/mapping.qmd.json
@@ -1 +1 @@
-{"title":"Introduction","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"editor_options":{"markdown":{"wrap":72}}},"headingText":"Introduction","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n```{r markdown-setup, include=FALSE}\nknitr::opts_chunk$set(fig.path = \"mapping/www/images/\")\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\nknitr::opts_chunk$set(warning = FALSE)\n\noptions(scipen = 999)\n```\n\n```{r setup, include=FALSE}\nlibrary(tidyverse)\nlibrary(knitr)\nlibrary(kableExtra)\nlibrary(here)\nlibrary(sf)\n```\n\n\nThis guide will teach you the concepts and code you will need for\nmapping and geospatial analysis in R. **This is a long guide, so if you\nneed something specific, we encourage you to scroll to the appropriate\nsection using the Table of Contents on the left.** If you just want copy\nand pasteable code to create different kinds of maps, head to the\n[`Map Gallery`](#map_gallery).\n\nNow let's start mapping!\n\n![](mapping/www/images/yay_maps.gif)\n\n## Geospatial Workflow\n\nThis picture below outlines what we think are the main steps in a\ngeospatial workflow. This guide will be split into sections describing\neach of the steps.\n\n![](mapping/www/images/geospatial_workflow.png)\n\n## Should this be a map?\n\nThe [Urban Institute Data Visualization Style\nGuide](http://urbaninstitute.github.io/graphics-styleguide/) offers some\nblunt but useful suggestions for maps:\n\n> Just because you've got geographic data, doesn't mean that you have to\n> make a map. Many times, there are more efficient storyforms that will\n> get your point across more clearly. If your data shows a very clear\n> geographic trend or if the absolute location of a place or event\n> matters, maps might be the best approach, but sometimes the reflexive\n> impulse to map the data can make you forget that showing the data in\n> another form might answer other---and sometimes more\n> important---questions.\n\nSo we would encourage you to think critically before making a map.\n\n## Why map with R?\n\nR can have a steeper learning curve than point-and-click tools - like\nQGIS or ArcGIS - for geospatial analysis and mapping. But creating maps\nin R has many advantages including:\n\n1) **Reproducibility**: By creating maps with R code, you can easily\n share the outputs and the code that generated the output with\n collaborators, allowing them to replicate your work and catch errors\n easily.\n\n2) **Iteration**: With point and click software like ArcGIS, making 50\n maps would be 50 times the work/time. But using R, we can easily\n make make many iterations of the same map with a few changes to the\n code.\n\n3) **Easy Updates**: Writing code provides a roadmap for others (and\n future you!) to quickly update parts of the map as needed. Say for\n example a collaborator wanted to change the legend colors of 50\n state maps. With R, this is possible in just a few seconds!\n\n4) **An Expansive ecosystem**: There are several R packages that make\n it very easy to get spatial data, create static and interactive\n maps, and perform spatial analyses. This feature rich package\n ecosystem which all play nice together is frankly unmatched by other\n programming languages and even point and click tools like QGIS and\n ArcGIS. 
Some of these R packages include:\n\n - `sf`: For managing and analyzing spatial dataframes\n - `tigris`: For downloading Census geographies\n - `ggplot2`: For making publication-ready static maps\n - `urbnmapr`: For automatically adding Urban styling to static\n maps\n - `mapview`: For making exploratory interactive maps\n\n5) **Cost**: Most point-and-click tools for geospatial analysis are\n proprietary and expensive. R is free open-source software. The\n software and most of its packages can be used for free by anyone for\n almost any use case.\n\n## Helpful Learning Resources\n\nIn addition to this guide, you may want to look at these other helpful\nresources:\n\n- The Urban Institute [mapping training\n series](https://ui-research.github.io/urbn101-mapping/) (with video\n lectures and notes)\n- Chapters\n [5](https://walker-data.com/census-r/census-geographic-data-and-applications-in-r.html),\n [6](https://walker-data.com/census-r/mapping-census-data-with-r.html),\n and\n [7](https://walker-data.com/census-r/spatial-analysis-with-us-census-data.html)\n from Kyle Walker's Analyzing US Census Data\n [book](https://walker-data.com/census-r/index.html).\n- Andrew Heiss' fantastic mapping\n [guide](https://datavizm20.classes.andrewheiss.com/example/12-example/)\n- All of the vignettes for the [`sf`\n package](https://cran.r-project.org/web/packages/sf/sf.pdf)\n- [Geocomputation with\n R](https://geocompr.robinlovelace.net/index.html): A book by Robin\n Lovelace and others\n- UChicago's R Spatial Workshops:\n \n\n# Get Spatial Data {#get_spatial_data}\n\n------------------------------------------------------------------------\n\n## library(sf) {.tabset .tabset-pills}\n\n### The short version\n\n`library(sf)` stores geospatial data, which are\n**points** (a single longitude/latitude),\n**lines** (a series of connected points), or\n**polygons** (a collection of points which\nmake a polygon) in a `geometry` column within R dataframes.\n\n![](mapping/www/images/amtrak_points_lines_polygons.jpg)\n\nThis is what an `sf` dataframe looks like in the console:\n\n```{r print-sf-dataframe}\ndc_parks <- st_read(\"mapping/data/dc_parks.geojson\",\n\t\t\t\t\t\t\t\t\t\tquiet = TRUE)\n\n# Print just the NAME and geometry column\ndc_parks %>%\n select(NAME) %>%\n head(2)\n```\n\n### The long version\n\nThe `sf` library is a key tool for reading in, managing, and working\nwith spatial data in R. `sf` stands for simple features (not San\nFrancisco, you Bay Area folks) and denotes a way to describe the spatial\nattributes of real life objects. The R object you will be working with\nmost frequently for mapping is an `sf` dataframe. An `sf` dataframe is\nessentially a regular R dataframe, with a couple of extra features for\nuse in mapping. These extra features exclusive to `sf` dataframes\ninclude:\n\n- sticky `geometry` columns\n- attached coordinate reference systems\n- some other spatial metadata\n\nThe most important of the above list is the sticky `geometry` column,\nwhich is a magical column that contains all of the geographic\ninformation for each row of data. Say for example you had an `sf`\ndataframe of all DC census tracts. Then the `geometry` column would\ncontain all of the geographic points used to define DC census tract\npolygons. The stickiness of this column means that no matter what data\nmunging/filtering you do, you will not be able to drop or delete the\n`geometry` column. 
Below is a graphic to help you understand this:\n\n![](mapping/www/images/sf_sticky_geometry.png)\n\ncredits: @allisonhorst\n\nThis is what an `sf` dataframe looks like in the console:\n\n```{r print_sf}\n# Read in spatial data about DC parks from DC Open Data Portal\ndc_parks <- st_read(\"https://opendata.arcgis.com/api/v3/datasets/287eaa2ecbff4d699762bbc6795ffdca_9/downloads/data?format=geojson&spatialRefId=4326\",\n\t\t\t\t\t\t\t\t\t\tquiet = TRUE)\n\n# dc_parks <- st_read(\"mapping/data/dc_parks.geojson\")\n\n# Select just a few columns for readability\ndc_parks <- dc_parks %>%\n select(NAME, geometry)\n\n# Print to the console\ndc_parks\n```\n\nNote that there is some spatial metadata such as the `Geometry Type`,\n`Bounding Box`, and `CRS` which shows up as a header before the actual\ncontents of the dataframe.\n\nSince `sf` dataframes operate similarly to regular dataframes, we can\nuse all our familiar `tidyverse` functions for data wrangling, including\n`select`, `filter`, `rename`, `mutate`, `group_by` and `summarize`. The\n`sf` package also has many functions that provide easy ways to replicate\ncommon tasks done in other GIS software like spatial joins, clipping,\nand buffering. Almost all of the mapping and geospatial analysis methods\ndescribed in this guide rely on you having an `sf` dataframe. So let's\ntalk about how to get one!\n\n## Importing spatial data {.tabset .tabset-pills}\n\nGetting an `sf` dataframe is always the first step in the geospatial\nworkflow. Here's how to import spatial data for...\n\n### States and counties\n\nWe highly recommend using the `library(urbnmapr)` package, which was\ncreated by folks here at Urban to easily create state and county level\nmaps. The `get_urbn_map()` function in the package allows you to read in\nspatial data on states and counties, with options to include\nterritories. Importantly, it will also display AK and HI as insets on\nthe map in accordance with the Urban Institute Data Visualization Style\nGuide. For information on how to install `urbnmapr`, see the [GitHub\nrepository](https://github.com/UrbanInstitute/urbnmapr).\n\nBelow is an example of how you would use `urbnmapr` to get an `sf`\ndataframe of all the states or counties in the US.\n\n```{r urbnmapr-1, eval=FALSE}\nlibrary(urbnmapr)\n\n# Get state data\nstates <- get_urbn_map(\"states\", sf = TRUE)\n\n# Can also get county data\ncounties <- get_urbn_map(\"counties\", sf = TRUE)\n```\n\n### Other Census geographies\n\nUse the `library(tigris)` package, which allows you to easily download\nTIGER and other cartographic boundaries from the US Census Bureau. In\norder to automatically load in the boundaries as `sf` objects, run\n`options(tigris_class = \"sf\")` once per R session.\n\n`library(tigris)` has all the standard census geographies, including\ncensus tracts, counties, CBSAs, ZCTAs, congressional districts, tribal\nareas, and more. It also includes other elements such as water, roads,\nand military bases.\n\nBy default, `library(tigris)` will download very large and\ndetailed TIGER/Line boundary files. For thematic mapping, the smaller\ncartographic boundary files are a better choice, as they are clipped to\nthe shoreline, generalized, and therefore usually smaller in size\nwithout losing too much accuracy. To load cartographic boundaries, use\nthe `cb = TRUE` argument. 
If you are doing detailed geospatial analysis\nand need the most detailed shapefiles, then you should use the detailed\nTIGER/Line boundary files and set `cb = FALSE`.\n\nBelow is an example of how you would use `library(tigris)` to get an `sf`\ndataframe of all Census tracts in DC for 2019.\n\n```{r tigris-1, eval=FALSE}\nlibrary(tigris)\n\n# Only need to set once per script\noptions(tigris_class = \"sf\")\n\ndc_tracts <- tracts(\n state = \"DC\",\n cb = TRUE,\n year = 2019\n)\n```\n\nUnlike `library(urbnmapr)`, different functions are used to get\ngeographic data for different geographic levels. For instance, the\n`blocks()` function will load census block data, and the\n`tracts()` function will load tract data. Other functions include\n`block_groups()`, `zctas()`, and `core_based_statistical_areas()`. For\nthe full list of supported geographies and functions, see the [package\nvignette](https://cran.r-project.org/web/packages/tigris/tigris.pdf).\n\nFor folks interested in pulling in Census demographic information along\nwith Census geographies, we recommend checking out the sister package to\n`library(tigris)`: `library(tidycensus)`. That package allows you to\ndownload Census variables and Census geographic data simultaneously.\n\n### Countries\n\nWe recommend using the `library(rnaturalearth)` package, which is\nsimilar to `library(tigris)` but allows you to download and use\nboundaries beyond the US. Instead of setting class to `sf` one time per\nsession as we did with `library(tigris)`, you must set the\n`returnclass = \"sf\"` argument each time you use a function from the\npackage. Below is an example of downloading an `sf` dataframe of all\nthe countries in the world.\n\n```{r natural-earth, eval = FALSE}\n\nlibrary(rnaturalearth)\n\nworld <- ne_countries(returnclass = \"sf\")\n\nggplot() +\n geom_sf(data = world, mapping = aes())\n```\n\n### Your own files\n\n#### Shapefiles/GeoJSONs\n\nShapefiles and GeoJSONs are two common spatial file formats you will\nfind out in the wild. `library(sf)` has a function called `st_read`\nwhich allows you to easily read in these files as `sf` dataframes. The\nonly required argument is `dsn` or data source name. This is the\nfilepath of the `.shp` file or the `.geojson` file on your local\ncomputer. For geojsons, `dsn` can also be a URL.\n\nBelow is an example of reading in a shapefile of fire stations in DC\nwhich is stored in `mapping/data/shapefiles/`. Note that shapefiles are\nactually stored as 6+ different files inside a folder. You need to\nprovide the filepath to the file ending in `.shp`.\n\n```{r list-files}\nlibrary(sf)\n\n# Print out all files in the directory\nlist.files(\"mapping/data/shapefiles\")\n\n# Read in .shp file\ndc_firestations <- st_read(\n dsn = \"mapping/data/shapefiles/Fire_Stations.shp\",\n quiet = TRUE\n)\n```\n\nAnd now `dc_firestations` is an `sf` dataframe you can use for all your\nmapping needs! `st_read` supports reading in a wide variety of other\nspatial file formats, including geodatabases, KML files, and over 200\nothers. For an incomplete list, please see this `sf`\n[vignette](https://r-spatial.github.io/sf/articles/sf2.html).\n\n#### CSVs or dataframes with lat/lons\n\nIf you have a CSV with geographic information stored in columns, you\nwill need to read in the CSV as a regular R dataframe and then convert\nto an `sf` dataframe. `library(sf)` contains the `st_as_sf()` function\nfor converting regular R dataframes into an `sf` dataframe. 
The two\narguments you must specify for this function are:\n\n- `coords`: A length 2 vector with the names of the columns\n corresponding to longitude and latitude (in that order!). For\n example, `c(\"lon\", \"lat\")`.\n- `crs`: The CRS (coordinate reference system) for your\n longitude/latitude coordinates. Remember you need to specify both\n the authority and the SRID code, for example `\"EPSG:4326\"`. For more\n information on finding and setting CRS codes, please see the\n [`CRS`](#crs) section.\n\nBelow is an example of reading in data from a CSV and converting it to\nan `sf` dataframe.\n\n```{r make-sf}\nlibrary(sf)\n\n# Read in dataset of state capitals which is stored as a csv\nstate_capitals <- read_csv(\"mapping/data/state-capitals.csv\")\n\nstate_capitals <- state_capitals %>%\n # Specify names of the lon/lat columns in the CSV to use to make geometry col\n st_as_sf(\n coords = c(\"longitude\", \"latitude\"),\n crs = 4326\n )\n```\n\nOne common mistake: before converting to an `sf` dataframe, you must\ndrop any rows that have `NA` values for latitude or longitude. If your\ndata contains `NA` values in those columns, then the `st_as_sf()`\nfunction will throw an error.\n\n
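Here is a minimal sketch of dropping those rows first (assuming the same\n`longitude` and `latitude` columns as above):\n\n```{r drop-na-coords, eval = FALSE}\nstate_capitals <- read_csv(\"mapping/data/state-capitals.csv\") %>%\n # drop rows with missing coordinates before converting to sf\n filter(!is.na(longitude), !is.na(latitude)) %>%\n st_as_sf(\n coords = c(\"longitude\", \"latitude\"),\n crs = 4326\n )\n```\n\n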
## Appending spatial info to your data

Oftentimes, the data you are working with will just have state or
county identifiers - like FIPS codes or state abbreviations - but will
not contain any geographic information. In this case, you must do the
extra work of downloading the geographic data as an `sf` dataframe and
then joining your non-spatial data to the spatial data. Generally this
involves 3 steps:

1) Reading in your own data as a data frame
2) Reading in the geographic data as an `sf` dataframe
3) Using `left_join()` to merge the geographic data with your own
   non-spatial data and create a new expanded `sf` dataframe

Let's say we had a dataframe on CHIP enrollment by state with state
abbreviations.

```{r readin-chip-data}
# read the state CHIP data
chip_by_state <- read_csv("mapping/data/chip-enrollment.csv") %>%
  # clean column names so there are no random spaces/uppercase letters
  janitor::clean_names()

# print to the console
chip_by_state %>% head()
```

In order to convert this to an `sf` dataframe, we need to read in the
spatial boundaries for each state and append them to our dataframe.
Here is how we do that with `get_urbn_map()` and `left_join()`.

```{r append-spatial-info, cache = FALSE}
library(urbnmapr)

# read in state geographic data from urbnmapr
states <- get_urbn_map(map = "states", sf = TRUE)

# left join state geographies to chip data
chip_with_geographies <- states %>%
  left_join(
    chip_by_state,
    # Specify the join columns, which are named slightly differently in
    # states and chip respectively
    by = c("state_abbv" = "state_abbreviation")
  )

chip_with_geographies %>%
  select(state_fips, state_abbv, chip_enrollment)
```

```{r append-state-pops, include = FALSE, eval = TRUE, echo = FALSE}
# Read in data on state populations from 2010
state_pops <-
  read_csv("https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv",
    # Set this to disable printing column info to console
    col_types = cols()
  ) %>%
  filter(ages == "total", year == "2010") %>%
  select(state_abbv = `state/region`, population)

chip_with_geographies <- chip_with_geographies %>%
  # Specify left_join from tidylog to print summary messages
  tidylog::left_join(state_pops, by = "state_abbv") %>%
  # Calculate the chip enrollment percentage and append as a column
  mutate(chip_pct = chip_enrollment / population)
```

# Project

## Coordinate Reference Systems {#crs .tabset .tabset-pills}

### The short version

Just watch [this video](https://www.youtube.com/watch?v=vVX-PrBRtTY)
and know the following:

- All spatial data has a CRS, which specifies how to identify a
  location on earth.

- It's important that all spatial datasets you are working with be in
  the same CRS. You can find the CRS with `st_crs()` and change the
  CRS with `st_transform()`.

- The Urban Institute Style Guide requires the use of the Atlas Equal
  Earth Projection (`"ESRI:102003"`) for national maps. For state and
  local maps, use [this](https://github.com/veltman/d3-stateplane)
  handy guide to find an appropriate State Plane projection.

### The long version

Coordinate reference systems (CRS) specify the 3d shape of the earth
and, optionally, how we project that 3d shape onto a 2d surface. They
are an important part of working with spatial data, as you need to
ensure that all the data you are working with are in the same CRS in
order for spatial operations and maps to be accurate.

A CRS can be specified either by name (ie Maryland State Plane) or by
**S**patial **R**eference System **ID**entifier (SRID). The SRID is a
numeric identifier that uniquely identifies a coordinate reference
system. Generally when referring to an SRID, you need to refer to an
authority (ie the data source) and a unique ID. An example is
`EPSG:26985`, which refers to the Maryland State Plane projection from
the EPSG, or `ESRI:102003`, which refers to the Atlas Equal Earth
projection from ESRI. Most CRS codes will be from the EPSG, and some
from ESRI and others. A good resource for finding/validating CRS codes
is [epsg.io](https://epsg.io).

Sidenote - EPSG stands for the now-defunct European Petroleum Survey
Group. And while oil companies have generally been terrible for the
earth, the one nice thing they did for the earth was to set up common
standards for coordinate reference systems.
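If you want to check what a given code refers to without leaving R,
here's a quick sketch using `st_crs()`; printing the result shows the
full well-known text, including the CRS name and units:

```{r inspect-crs, eval = FALSE}
library(sf)

# Look up the Maryland State Plane CRS by its authority:SRID code
st_crs("EPSG:26985")
```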
You might be thinking: well, isn't the earth just a sphere? Why do we
need all this complicated stuff? The answer is that the earth is [kind
of](https://oceanservice.noaa.gov/facts/earth-round.html) a sphere, but
it's really more of a misshapen ellipsoid which is pudgier at the
equator than at the poles. To visualize how coordinate reference
systems work, imagine that the earth is a (lumpy) orange. Now peel the
skin off the orange and try to flatten it. There are many ways to do
it, but all will create
[distortions](https://twitter.com/neilrkaye/status/1050740679008296967)
of some kind. The CRS gives us the formula we've used to specify the
shape of the orange (usually a sphere or ellipsoid of some kind) and,
optionally, how we flattened the orange into 2d.

Broadly, there are two kinds of coordinate reference systems:

1) [**Geographic coordinate
   systems**](https://www.ibm.com/support/knowledgecenter/en/SSGU8G_12.1.0/com.ibm.spatial.doc/ids_spat_407.html)

   - (sometimes called unprojected coordinate systems)
   - Specifies a 3d shape for the earth
   - Uses a spheroid/ellipsoid to approximate the shape of the earth
   - Usually uses decimal degree units (ie latitude/longitude) to
     identify locations on earth

![](mapping/www/images/gcs_image.png)

2) [**Projected coordinate
   systems**](https://mgimond.github.io/Spatial/chp09-0.html#projected-coordinate-systems)

   - Specifies a 3d shape for the earth + a 2d mapping

   - Is a geographic coordinate system + a *projection*

   ![](mapping/www/images/projecting_xkcd.png)

   credit: [xkcd](https://imgs.xkcd.com/comics/projecting.png)

   - **projection**: a mathematical formula used to convert a 3d
     coordinate system to a flat 2d coordinate system

   - There are many different kinds of projections, including Equal
     Area, Equidistant, Conformal, etc.

   - All projections distort the true shape of the earth in some way,
     either in terms of shape, area, or angle. Required
     [xkcd comic](https://xkcd.com/977/)

   - Usually uses linear units (ie feet, meters) and is therefore
     useful for distance-based spatial operations (ie creating
     buffers)

## Finding the CRS

If you are lucky, your data will have embedded CRS data that will be
automatically detected when the file is read in. This is usually the
case for GeoJSONs (`.geojson`) and shapefiles (`.shp`). When you use
`st_read()` on these files, you should see the CRS displayed in the
metadata:

![](mapping/www/images/sf_crs_pic.png)

You can also use the `st_crs()` function to find the CRS. The CRS code
is located at the end, in `ID[authority, SRID]`.

```{r st_crs}
st_crs(dc_firestations)
```

Sometimes, the CRS will be blank or `NA`, as the dataset did not
specify the CRS. In that case you **MUST find and set the CRS for your
data before proceeding** with analysis.
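You can check for a missing CRS programmatically; here is a minimal
sketch (`some_sf_dataframe` is a placeholder):

```{r check-missing-crs, eval = FALSE}
# is.na() on an st_crs() result tells you whether the CRS is missing
if (is.na(st_crs(some_sf_dataframe))) {
  warning("CRS is missing - find and set it before doing any analysis")
}
```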
Below are some good rules of thumb for finding out what the CRS for
your data is:

- For GeoJSONs, the CRS should always be `EPSG:4326` (or WGS 84). The
  official GeoJSON specification states that this is the only valid
  CRS for GeoJSONs, but in the wild, this may not be true 100% of the
  time.
- For shapefiles, there should be a file that ends in `.prj` in the
  same directory as the `.shp` file. This file contains the projection
  information for that file and should be used automatically when
  reading in shapefiles.
- For CSVs with latitude/longitude columns, the CRS is usually
  `EPSG:4326` (or WGS 84).
- Look at the metadata and any accompanying documentation to see if
  the coordinate reference system for the data is specified.

If none of the above rules of thumb apply to you, check out the
`crsuggest` R [package](https://github.com/walkerke/crsuggest).

Once you've identified the appropriate CRS, you can set the CRS for
your data with `st_crs()`:

```{r set_crs, eval = FALSE}
# If you are certain that your data contains coordinates in the Atlas
# Equal Earth projection
st_crs(some_sf_dataframe) <- st_crs("ESRI:102003")
```

## Transforming the CRS

Often you will need to change the CRS for your `sf` dataframe so that
all the datasets you are using have the same CRS, or to use a projected
CRS for performing more accurate spatial operations. You can do this
with `st_transform()`:

```{r transform-crs}
# Transform the CRS from WGS 84 to the Urban-required Atlas Equal Earth
# projection
state_capitals <- state_capitals %>% st_transform("ESRI:102003")
```

`st_transform()` also allows you to just use the CRS of another `sf`
dataframe when transforming.

```{r transform-crs-with-another-sf-object}
# Transform the CRS of chip_with_geographies to be the same as the CRS
# of state_capitals
chip_with_geographies <- chip_with_geographies %>%
  st_transform(crs = st_crs(state_capitals))
```

If you are working with local data, you should use an appropriate State
Plane projection instead of the Atlas Equal Earth projection, which is
meant for national maps. `library(crsuggest)` can simplify the process
of picking an appropriate State Plane CRS.

```{r crsuggest-ex, cache = TRUE}
library(crsuggest)

suggest_crs(dc_firestations) %>%
  # Use the value in the "crs_code" column to transform CRS's
  head(4)
```
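And here is a sketch of actually applying the top suggestion from
`suggest_crs()`; always sanity-check the suggested projection before
adopting it:

```{r apply-suggested-crs, eval = FALSE}
# Pull the top suggested code and use it to transform
best_crs <- suggest_crs(dc_firestations) %>%
  slice(1) %>%
  pull(crs_code)

dc_firestations_projected <- dc_firestations %>%
  st_transform(as.numeric(best_crs))
```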
# Map

In order to start mapping, you need an `sf` dataframe. If you don't
have one, see the [`Get Spatial Data`](#get_spatial_data) section
above.

## The basics

### library(ggplot2)

Most mapping in R fits the same theoretical framework as plotting in R
using `library(ggplot2)`. To learn more about ggplot2, visit the Data
Viz
[page](https://urbaninstitute.github.io/r-at-urban/graphics-guide.html#Grammar_of_Graphics_and_Conventions)
or read the official ggplot2 [book](https://ggplot2-book.org/).

The key function for mapping is **the special `geom_sf()` function**,
which works with `sf` dataframes. This function magically detects
whether you have point or polygon spatial data and displays the results
on a map.

### A simple map

To make a simple map, add `geom_sf()` to a `ggplot()` and set
`data = an_sf_dataframe`. Below is code for making a map of all 50
states using `library(urbnmapr)`:

```{r first-map, cache = TRUE}
library(urbnmapr)

states <- get_urbn_map("states", sf = TRUE)

ggplot() +
  geom_sf(
    data = states,
    mapping = aes()
  )
```

## Styling

### `library(urbnthemes)`

`library(urbnthemes)` automatically styles maps in accordance with the
[Urban Institute Data Visualization Style
Guide](http://urbaninstitute.github.io/graphics-styleguide/). By using
`library(urbnthemes)`, you can create publication-ready maps you can
immediately drop into Urban research briefs or blog posts.

To install `urbnthemes`, visit the package's [GitHub
repository](https://github.com/UrbanInstitute/urbnthemes) and follow
the instructions. There are two ways to use the `urbnthemes` functions:

```{r urbnthemes}
library(urbnthemes)

# You can either run this once per script to automatically style all
# maps with the Urban theme
set_urbn_defaults(style = "map")

# Or you can add `+ theme_urbn_map()` to the end of every map you make
ggplot() +
  geom_sf(states, mapping = aes()) +
  theme_urbn_map()
```

### Layering

You can layer multiple points/lines/polygons on top of each other using
the `+` operator from `library(ggplot2)`. The shapes will appear from
bottom to top (ie the last mapped object will show up on top). It is
important that all layers are in the same CRS (coordinate reference
system).

```{r layers, cache = TRUE}
state_capitals <- state_capitals %>%
  # This will change the CRS to ESRI:102003 and shift the AK and HI
  # state capital point locations to the appropriate locations on the
  # inset maps.
  tigris::shift_geometry() %>%
  # For now filter out AK and HI as their state capitals will be
  # slightly off.
  filter(!state %in% c("Alaska", "Hawaii"))

ggplot() +
  geom_sf(
    data = states,
    mapping = aes()
  ) +
  # Note we change the data argument
  geom_sf(
    data = state_capitals,
    mapping = aes(),
    # The urbnthemes library has Urban color palettes built in.
    color = palette_urbn_main["yellow"],
    size = 2.0
  ) +
  theme_urbn_map()
```

### Fill and Outline Colors

The same commands used to change colors, opacity, lines, size, etc. in
charts can be used for maps too. To change the colors of the map, just
use the `fill =` and `color =` parameters in `geom_sf()`. `fill` will
change the fill color of polygons; `color` will change the color of
polygon outlines, lines, and points.

Generally, maps that show the magnitude of a variable use the blue
sequential ramp and maps that display positives and negatives use the
diverging color ramp. `library(urbnthemes)` contains built-in helper
variables (like `palette_urbn_main`) for accessing color palettes from
the Urban Data Viz Style Guide. If, for example, you want states to be
Urban's magenta color:

```{r urbnthemes-pink}
ggplot() +
  geom_sf(states,
    mapping = aes(),
    # Adjust polygon fill color
    fill = palette_urbn_main["magenta"],
    # Adjust polygon outline color
    color = "white"
  ) +
  theme_urbn_map()
```

### Adding text

You can also add text, like state abbreviations, directly to your map
using `geom_sf_text()` and the helper function `get_urbn_labels()`.

```{r geom_sf_text}
library(urbnmapr)

ggplot() +
  geom_sf(states,
    mapping = aes(),
    color = "white"
  ) +
  theme_urbn_map() +
  # Generates dataframe of state abbvs and appropriate locations to
  # plot them
  geom_sf_text(
    data = get_urbn_labels(
      map = "states",
      sf = TRUE
    ),
    aes(label = state_abbv),
    size = 3
  )
```

There's also `geom_sf_label()` if you want labels with a border.
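For example, here is a sketch of the same map with `geom_sf_label()`
swapped in:

```{r geom-sf-label-ex, eval = FALSE}
ggplot() +
  geom_sf(states, mapping = aes(), color = "white") +
  theme_urbn_map() +
  # Same as geom_sf_text(), but each label gets a filled box behind it
  geom_sf_label(
    data = get_urbn_labels(map = "states", sf = TRUE),
    aes(label = state_abbv),
    size = 3
  )
```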
# Map Gallery {#map_gallery}

Below are copy-and-pasteable examples of maps you can make, after you
have an `sf` dataframe.

## Choropleth Maps

Choropleth maps display geographic areas with shades, colors, or
patterns in proportion to a variable or variables. Choropleth maps can
represent massive geographies like the entire world and small
geographies like Census tracts. To make a choropleth map, you need to
set `geom_sf(aes(fill = some_variable_name))`. Below are examples.

### Continuous color scale

```{r choropleth_continuous}
# Map of CHIP enrollment percentage by state
chip_with_geographies_map <- chip_with_geographies %>%
  ggplot() +
  geom_sf(aes(
    # Color in states by the chip_pct variable
    fill = chip_pct
  ))

# Below add-ons to the map are optional, but make the map look prettier.
chip_with_geographies_map +
  # scale_fill_gradientn adds colors with more interpolation and
  # reverses the color scale
  scale_fill_gradientn(
    # Convert legend from decimal to percentages
    labels = scales::percent_format(),
    # Make legend title more readable
    name = "CHIP Enrollment %",
    # Manually add 0 to lower limit to include it in legend. NA = use
    # maximum value in data
    limits = c(0, NA),
    # Set number of breaks on legend = 3
    n.breaks = 3
  )
```

### Discrete color scale

The quick and dirty way is with `scale_fill_steps()`, which creates
discretized bins for continuous variables:

```{r choropleth_discrete}
chip_with_geographies %>%
  ggplot() +
  geom_sf(aes(
    # Color in states by the chip_pct variable
    fill = chip_pct
  )) +
  scale_fill_steps(
    # Convert legend from decimal to percentages
    labels = scales::percent_format(),
    # Make legend title more readable
    name = "CHIP Enrollment %",
    # Show top and bottom limits on legend
    show.limits = TRUE,
    # Roughly set number of bins. Won't be exact as R uses algorithms
    # under the hood for pretty looking breaks.
    n.breaks = 4
  )
```

Often you will want to manually generate the bins yourself to give you
more fine-grained control over the exact legend text (ie `1% - 1.8%`,
`1.8% - 2.5%`, etc.). Below is an example of discretizing the
continuous `chip_pct` variable yourself using `cut_interval()` and a
helper function to get nice-looking interval labels:

```{r format_intervals}
# Helper function to clean up R-generated intervals into nice-looking
# interval labels
format_interval <- function(interval_text) {
  text <- interval_text %>%
    # Remove open and close brackets, which are R-generated math notation
    str_remove_all("\\(") %>%
    str_remove_all("\\)") %>%
    str_remove_all("\\[") %>%
    str_remove_all("\\]") %>%
    str_replace_all(",", " — ")

  # Convert decimal ranges to percent ranges
  text <- text %>%
    str_split(" — ") %>%
    map(~ as.numeric(.x) %>%
      scales::percent() %>%
      paste0(collapse = " — ")) %>%
    unlist() %>%
    # By default character vectors are plotted in alphabetical order. We
    # want factors in reverse alphabetical order to get correct colors
    # in ggplot
    fct_rev()

  return(text)
}

chip_with_geographies <- chip_with_geographies %>%
  # cut_interval into n groups with equal range
  mutate(chip_pct_interval = cut_interval(chip_pct, n = 5)) %>%
  # Generate nice looking interval labels
  mutate(chip_pct_interval = format_interval(chip_pct_interval))
```

And now we can map the discretized `chip_pct_interval` variable using
`geom_sf()`:
```{r make_discrete_map}
chip_with_geographies %>%
  ggplot() +
  geom_sf(aes(
    # Color in states by the chip_pct_interval variable
    fill = chip_pct_interval
  )) +
  # The default is the main Urban palette, which assumes unrelated
  # groups. We adjust colors manually to use the Urban cyan palette
  scale_fill_manual(
    values = palette_urbn_cyan[c(8, 7, 5, 3, 1)],
    name = "CHIP Enrollment %"
  )
```

In addition to `cut_interval()`, there are [similar
functions](https://ggplot2.tidyverse.org/reference/cut_interval.html)
for creating intervals/bins with slightly different rules. When
creating bins, be careful, as changing the number of bins can
drastically change how the map looks.

## Bubble Maps

This is just a layered map with one polygon layer and one point layer,
where the points are sized in accordance with a variable in your data.

```{r bubble_maps, cache = TRUE}
set_urbn_defaults(style = "map")

# Get sf dataframe of DC tracts
library(tigris)
dc_tracts <- tracts(
  state = "DC",
  year = 2019,
  progress_bar = FALSE
)

# Add bubbles for firestations
ggplot() +
  geom_sf(data = dc_tracts, fill = palette_urbn_main["gray"]) +
  geom_sf(
    data = dc_firestations,
    # Size bubbles by number of trucks at each station
    aes(size = TRUCK),
    color = palette_urbn_main["yellow"],
    # Adjust transparency for readability
    alpha = 0.8
  )
```

## Dot-density Maps

These maps scatter dots within a geographic area. Typically each dot
represents a unit (like 100 people, or 1000 houses). To create this
kind of map, you need to start with an `sf` dataframe that is of
`geometry` type `POLYGON` or `MULTIPOLYGON` and then sample points
within each polygon.

The below code generates a dot-density map representing people of
different races within Washington, DC tracts. The code may look a
little complicated, but the key workhorse function is `st_sample()`,
which samples points within each polygon to use in the dot-density map:

```{r dot_density_maps, cache = TRUE}
library(tidycensus)

# Get counts by race of DC tracts
dc_pop <- get_acs(
  geography = "tract",
  state = "DC",
  year = 2019,
  variables = c(
    Hispanic = "DP05_0071",
    White = "DP05_0077",
    Black = "DP05_0078",
    Asian = "DP05_0080"
  ),
  geometry = TRUE,
  progress_bar = FALSE
)

# Get unique groups (ie races)
groups <- unique(dc_pop$variable)

# For each unique group (ie race), generate sampled points
dc_race_dots <- map_dfr(groups, ~ {
  dc_pop %>%
    # .x = the group used in the loop
    filter(variable == .x) %>%
    # Use the projected MD state plane for accuracy
    st_transform(crs = "EPSG:6487") %>%
    # Have every dot represent 100 people
    mutate(est100 = as.integer(estimate / 100)) %>%
    st_sample(size = .$est100, exact = TRUE) %>%
    st_sf() %>%
    # Add group (ie race) as a column so we can use it when plotting
    mutate(group = .x)
})

ggplot() +
  # Plot tracts, then dots on top of tracts
  geom_sf(
    data = dc_pop,
    # Make interior of tracts transparent and boundaries black
    fill = "transparent",
    color = "black"
  ) +
  geom_sf(
    data = dc_race_dots,
    # Color in dots by racial group
    aes(color = group),
    # Adjust transparency and size to be more readable
    alpha = 0.5,
    size = 1.1,
    stroke = FALSE
  )
```
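Note that `st_sample()` places dots randomly, so every run produces a
slightly different map. A small sketch if you need reproducible output:

```{r dot-density-seed, eval = FALSE}
# Set a seed before sampling so the dot pattern is identical on every
# render
set.seed(1234)
```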
## Geofacets

Geofaceting arranges sub-geography-specific plots into a grid that
resembles a larger geography (usually the US). This can be a useful
alternative to choropleth maps, which tend to overemphasize
geographically large but sparsely populated areas. To make geofaceted
charts, use the `facet_geo()` function from the `geofacet` library,
which can be thought of as the equivalent of ggplot2's `facet_wrap()`.
For this example, we'll use the built-in `state_ranks` data.

```{r geofacet-data}
library(geofacet)

head(state_ranks %>% as_tibble())
```

```{r geofacet-ex, cache = TRUE}
set_urbn_defaults(style = "print")

state_ranks %>%
  filter(variable %in% c("education", "employment")) %>%
  ggplot(aes(x = rank, y = variable)) +
  geom_col() +
  facet_geo(
    facets = "state",
    # Use the custom Urban geofacet grid which is built into urbnthemes.
    # For now we need to rename a few columns until urbnthemes is
    # updated
    grid = urbnthemes::urbn_geofacet %>%
      rename(
        code = state_code,
        name = state_name
      )
  )
```

Interactive geofacets of the United States have been used in Urban
Features like [A Matter of
Time](https://apps.urban.org/features/long-prison-terms/trends.html),
which included geofaceted line charts showing trends in incarceration
by state. Static geofacets of the United States were included in
[Barriers to Accessing Homeownership Down Payment, Credit, and
Affordability](https://www.urban.org/sites/default/files/publication/94801/barriers-to-homeownership-down-payments-credit-access-and-affordability_3.pdf)
by the Housing Finance Policy Center.

### Tile grid map

You can select predefined grids, or create your own at
<https://hafen.github.io/grid-designer/>.

```{r tile-grid-map}
# create a grid with all 50 US states plus DC
mygrid <- data.frame(
  code = c("ME", "AK", "WI", "VT", "NH", "IL", "ID", "WA", "MN", "MT", "ND", "MI", "NY", "MA", "IA", "IN", "CT", "RI", "NJ", "PA", "OH", "SD", "WY", "NV", "OR", "CA", "NE", "DE", "MD", "VA", "WV", "KY", "MO", "CO", "UT", "AZ", "KS", "AR", "DC", "SC", "NC", "TN", "NM", "LA", "AL", "GA", "MS", "OK", "HI", "FL", "TX"),
  row = c(1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8),
  col = c(12, 2, 7, 11, 12, 7, 3, 2, 6, 4, 5, 8, 10, 11, 6, 7, 11, 12, 10, 9, 8, 5, 4, 3, 2, 2, 5, 11, 10, 9, 8, 7, 6, 4, 3, 3, 5, 6, 10, 9, 8, 7, 4, 6, 8, 9, 7, 5, 2, 10, 5),
  stringsAsFactors = FALSE
)

# Combine data into geo_grid for tiling
geo_grid_data <- mygrid %>%
  left_join(chip_with_geographies, by = c("code" = "state_abbv"))

# plot tile grid
geo_grid_data %>%
  ggplot(aes(x = col, y = row, fill = chip_pct_interval)) +
  scale_fill_manual(
    values = palette_urbn_cyan[c(8, 7, 5, 3, 1)],
    name = "CHIP Enrollment %"
  ) +
  geom_tile(color = "white", linewidth = 1) +
  geom_text(aes(label = code), color = "white", size = 4) +
  scale_y_reverse() +
  coord_equal() +
  labs(fill = NULL)
```

## Cartograms

Cartograms are a modified form of choropleth map with intentionally
distorted sizes that map to a variable in your data. Below we create a
cartogram with `library(cartogram)` where the state sizes are
proportional to the population.

```{r cartogram-example, cache = TRUE}
library(cartogram)

set_urbn_defaults(style = "map")

chip_with_geographies_weighted <- chip_with_geographies %>%
  # Note the column name needs to be in quotes for this package
  cartogram_cont(weight = "population")

ggplot() +
  geom_sf(
    data = chip_with_geographies_weighted,
    # Color in states by chip percentages
    aes(fill = chip_pct)
  )
```
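The same package offers other cartogram styles too. Here is a sketch of
a Dorling cartogram, which replaces each state's shape with a circle
sized by the weight variable (this assumes, as above, that
`chip_with_geographies` is in a projected CRS):

```{r dorling-example, eval = FALSE}
chip_with_geographies_dorling <- chip_with_geographies %>%
  # Replace each state polygon with a circle sized by population
  cartogram_dorling(weight = "population")

ggplot() +
  geom_sf(
    data = chip_with_geographies_dorling,
    aes(fill = chip_pct)
  )
```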
## Interactive Maps

Interactive maps can be a great tool for exploring and understanding
your data. And luckily there are a lot of new R packages that make it
really easy to create them. Interactive maps are powerful, but **we do
not recommend them for official use in Urban publications**, as getting
them in Urban styles and appropriate basemaps can be tricky (reach out
to [anarayanan\@urban.org](mailto:anarayanan@urban.org){.email} if you
really want to include them).

### `library(mapview)`

`library(mapview)` is probably the most user-friendly of the
interactive mapping R libraries. All you have to do to create an
interactive map is:

```{r show-mapview}
library(mapview)

chip_with_geographies_for_interactive_mapping <- chip_with_geographies %>%
  # Filter out AK and HI bc they would appear in Mexico. If you want AK
  # and HI in the correct place in interactive maps, make sure to use
  # tigris::states()
  filter(!state_abbv %in% c("AK", "HI"))

mapview(chip_with_geographies_for_interactive_mapping)
```

When you click on an object, you get a popup table of all its
attributes. And when you hover over an object, you get a popup with an
object ID.

Each of the above behaviors can be changed if desired. As you'll see in
the below section, the syntax for `library(mapview)` is significantly
different from `library(ggplot2)`, so be careful!

#### Coloring in points/polygons

In order to create a choropleth map where we color in the
points/polygons by a variable, we need to feed a column name *in
quotes* to the `zcol` argument inside the `mapview()` function:

```{r mapview_zcol}
# Create interactive state map colored in by chip enrollment
mapview(chip_with_geographies_for_interactive_mapping, zcol = "chip_enrollment")
```

If you want more granular control over the color palette for the
legend, you can also feed a vector of color hex codes to `col.regions`
along with a column name to `zcol`. This will create a continuous color
range along the provided colors. Be careful, though, as the color
interpolation is not perfect.

```{r mapview-colors-granular}
mapview(chip_with_geographies_for_interactive_mapping,
  col.regions = c(
    palette_urbn_green[6],
    "white",
    palette_urbn_cyan[6]
  ),
  zcol = "chip_enrollment"
)
```

If you want to color in all points/polygons with the same color, just
feed a single color hex code to the `col.regions` argument:

```{r mapview-colors}
mapview(chip_with_geographies_for_interactive_mapping,
  col.regions = palette_urbn_green[5]
)
```

#### Adding layers

You can add multiple `sf` objects to the same map by using the `+`
operator. This is very useful when comparing 2 or more spatial
datasets.

```{r mapview-layers}
mapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) +
  mapview(state_capitals, col.regions = palette_urbn_cyan[5])
```

You can even create slider maps by using the `|` operator!

```{r mapview-sliders}
mapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) |
  mapview(state_capitals, col.regions = palette_urbn_cyan[5])
```
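One more option worth knowing: by default, the legend and layer
selector display the dataframe name, which can get unwieldy. Here is a
sketch using the `layer.name` argument to clean that up:

```{r mapview-layer-name, eval = FALSE}
mapview(chip_with_geographies_for_interactive_mapping,
  zcol = "chip_enrollment",
  # Use a human-readable name in the legend and layer selector
  layer.name = "CHIP enrollment"
)
```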
### More details

To learn more about advanced options for `mapview` maps, check out the
[documentation](https://r-spatial.github.io/mapview/articles/articles/mapview_02-advanced.html)
page and the [reference
manual](https://cran.r-project.org/web/packages/mapview/mapview.pdf).

There are also other interactive map-making packages in R, like
`leaflet` (which `mapview` is a more user-friendly wrapper of), `tmap`,
and `mapdeck`. To learn about these other packages, [this book
chapter](https://geocompr.robinlovelace.net/adv-map.html#interactive-maps)
is a good starting point.

# Spatial Operations

## Cropping

Cropping (or clipping) is geographically filtering an `sf` dataframe to
just the area we are interested in. Say we wanted to look at the roads
around Fire Station 24 in DC.

```{r roads_cropping_before, cache = TRUE}
library(tigris)
library(units)

dc_firestations <- dc_firestations %>%
  st_transform("EPSG:6487")

# Draw a 500 meter circle around one fire station
fire_station_24_buffered <- dc_firestations %>%
  filter(NAME == "Engine 24 Station") %>%
  st_buffer(set_units(500, "meter"))

# Get listing of all roads in DC
dc_roads <- roads(
  state = "DC",
  county = "District of Columbia",
  class = "sf",
  progress_bar = FALSE
) %>%
  st_transform("EPSG:6487")

# View roads on top of the fire station buffer
ggplot() +
  # Order matters! We plot the buffer first and the roads on top so
  # the roads remain visible where they overlap the buffer
  geom_sf(
    data = fire_station_24_buffered,
    fill = palette_urbn_cyan[1],
    color = palette_urbn_cyan[7]
  ) +
  geom_sf(
    data = dc_roads,
    color = palette_urbn_gray[7]
  ) +
  theme_urbn_map()
```

We can clip the larger roads dataframe to just the roads that overlap
the circle around the fire station with `st_intersection()`.

```{r roads_cropping_after}
# Use st_intersection() to crop the roads data to just roads within
# the fire station radius
dc_roads_around_fire_station_24_buffered <- fire_station_24_buffered %>%
  st_intersection(dc_roads)

ggplot() +
  geom_sf(
    data = fire_station_24_buffered,
    fill = palette_urbn_cyan[1],
    color = palette_urbn_cyan[7]
  ) +
  geom_sf(
    data = dc_roads_around_fire_station_24_buffered,
    color = palette_urbn_gray[7]
  ) +
  theme_urbn_map()
```

**More Coming Soon!**

## Calculating Distance
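While this section is being written, here is a minimal sketch:
`st_distance()` computes pairwise distances, and because we transformed
`dc_firestations` to the projected MD State Plane CRS above, the
results come back in meters.

```{r distance-sketch, eval = FALSE}
# Pairwise distance matrix between all DC fire stations, in meters
fire_station_distances <- st_distance(dc_firestations)
```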
## Spatial Joins

### Point to Polygon

### Polygon to Polygon

## Aggregating

## Drive/Transit times

## Geocoding

Geocoding is the process of turning text (usually addresses) into
geographic coordinates (usually latitudes/longitudes) for use in
mapping. For Urban researchers, we highly recommend using the [Urban
geocoder](https://tech-tools.urban.org/geocoding/), as it is fast,
accurate, designed to work with sensitive/confidential data, and, most
importantly, free to use for Urban researchers! To learn about how we
set up and chose the geocoder for the Urban Institute, you can read our
[Data\@Urban
blog](https://medium.com/@urban_institute/choosing-a-geocoder-for-the-urban-institute-86192f656c5f).

### Cleaning Addresses

The single most important factor in getting accurate geocoded data is
having cleaned, well-structured address data. This can prove difficult,
as address data out in the wild is often messy and unstandardized.
While the rules for cleaning addresses are very data-specific, below
are some examples of clean addresses you should aim for in your data
cleaning process:

```{r cleaned-addr, cache = TRUE, echo = FALSE}
library(gt)
cleaned_address_table <- tribble(
  ~"f_address", ~"Type of address",
  "123 Troy Drive, Pillowtown, CO, 92432", "residential address",
  "789 Abed Avenue, Apt 666, Blankesburg, CO, 92489", "residential apartment address",
  "Shirley Boulevard and Britta Drive, Blanketsburg, CO, 92489", "street intersection",
  "Pillowtown, CO", "city",
  "92489, CO", "Zip Code"
)

gt(cleaned_address_table) %>%
  opt_row_striping(row_striping = TRUE) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold")
    ),
    locations = cells_column_labels(
      columns = vars(f_address, `Type of address`)
    )
  ) %>%
  opt_align_table_header(align = c("left")) %>%
  tab_options(
    container.width = "100%",
    container.height = "400px",
    table.border.top.width = 0,
    table.border.bottom.width = 0,
    column_labels.border.bottom.width = 0
  )
```

All that being said, our geocoder is pretty tolerant of different
address formats, typos/spelling errors, and missing states, zip codes,
etc. So don't spend too much time cleaning every address in the data.
Also note that while our geocoder is able to geocode cities and zip
codes, it will return the lat/lon of the center of the city/zip code,
which may not be what you want.

### Instructions

To use the [Urban geocoder](https://tech-tools.urban.org/geocoding/),
you will need to:

1) Generate a CSV with a column named `f_address` which contains the
   addresses in single-line format (ie
   `123 Abed Avenue, Blanketsburg, CO, 94328`). This means that if you
   have the addresses split across multiple columns (ie `Address`,
   `City`, `State`, and `Zip` columns), you will need to concatenate
   them into one column; see the sketch after this list. Also see our
   address cleaning section above.

2) Go to the Urban geocoder and answer the initial questions. This
   will tell you whether your data is confidential or non-confidential
   and allow you to upload your CSV for geocoding.

3) Wait for an email telling you your results are ready. If your data
   is non-confidential, this email will contain a link to your
   geocoded results. This link expires in 24 hours, so make sure to
   download your data before then. If your data is confidential, the
   email will contain a link to the location on the Y Drive where your
   confidential geocoded data is stored. You can specify this output
   folder when submitting the CSV in step 1.
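For step 1, here is a minimal sketch of building the `f_address`
column; the `addresses_to_geocode` dataframe and its `address`, `city`,
`state`, and `zip` columns are hypothetical:

```{r build-f-address, eval = FALSE}
# Concatenate the address parts into a single f_address column. Note
# that str_c() returns NA if any piece is NA, so clean missing values
# first.
addresses_to_geocode <- addresses_to_geocode %>%
  mutate(f_address = str_c(address, city, state, zip, sep = ", "))

write_csv(addresses_to_geocode, "addresses_to_geocode.csv")
```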
### Geocoder outputs

The geocoded file will be your original data, plus a few new columns
(including latitude and longitude). Below is a description of each of
the new columns that are appended to your original data. [It's very
important that you take a look at the Addr_type
column]{style="background-color: #FFFF00; font-weight: bold"} in the
CSV before doing further analysis to check the accuracy of the
geocoding process.
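Here is a quick sketch of that check, assuming you've read your
geocoded results back in (the filename is hypothetical):

```{r check-addr-type, eval = FALSE}
geocoded <- read_csv("geocoded_results.csv")

# Count match types; many medium or low accuracy values warrant a
# manual review
geocoded %>%
  count(Addr_type, sort = TRUE)
```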
- **Match_addr**: The actual address that the inputted address was
  matched to. This is the address that the geocoder used to get
  latitudes/longitudes. If there are potentially many typos or
  non-standard address formats in your data file, you will want to
  take a close look at this column to confirm that the matched address
  correctly handled typos and badly formatted addresses.

- **Longitude**: The WGS 84 datum longitude (EPSG code 4326).

- **Latitude**: The WGS 84 datum latitude (EPSG code 4326).

- **Addr_type**: The match level for a geocode request. This should be
  used as an indicator of the precision of geocode results. Generally,
  Subaddress, PointAddress, StreetAddress, and StreetInt represent
  accurate matches. The list below contains all possible values for
  this field. **Green values represent high accuracy matches, yellow
  represents medium accuracy matches, and red represents low
  accuracy/inaccurate matches**. If you have many yellow and red
  values in your data, you should manually check the results before
  proceeding with analysis. All possible values:

  - **Subaddress**: A subset of a PointAddress that represents a house
    or building subaddress location, such as an apartment unit, floor,
    or individual building within a complex. The UnitName, UnitType,
    LevelName, LevelType, BldgName, and BldgType field values help to
    distinguish subaddresses which may be associated with the same
    PointAddress. Reference data consists of point features with
    associated house number, street name, and subaddress elements,
    along with administrative divisions and optional postal code; for
    example, 3836 Emerald Ave, Suite C, La Verne, CA, 91750.
  - **PointAddress**: A street address based on points that represent
    house and building locations. Typically, this is the most
    spatially accurate match level. Reference data contains address
    points with associated house numbers and street names, along with
    administrative divisions and optional postal code. The X/Y
    (`Longitude`/`Latitude`) and `geometry` output values for a
    PointAddress match represent the street entry location for the
    address; this is the location used for routing operations. The
    `DisplayX` and `DisplayY` values represent the rooftop, or actual,
    location of the address. Example: 380 New York St, Redlands, CA,
    92373.
  - **StreetAddress**: A street address that differs from PointAddress
    because the house number is interpolated from a range of numbers.
    Reference data contains street centerlines with house number
    ranges, along with administrative divisions and optional postal
    code information; for example, 647 Haight St, San Francisco, CA,
    94117.
  - **StreetInt**: A street address consisting of a street
    intersection along with city and optional state and postal code
    information. This is derived from StreetAddress reference data;
    for example, Redlands Blvd & New York St, Redlands, CA, 92373.
  - **StreetName**: Similar to a street address but without the house
    number. Reference data contains street centerlines with associated
    street names (no numbered address ranges), along with
    administrative divisions and optional postal code; for example,
    W Olive Ave, Redlands, CA, 92373.
  - **StreetAddressExt**: An interpolated street address match that is
    returned when the parameter matchOutOfRange=true and the input
    house number exceeds the house number range for the matched street
    segment.
  - **DistanceMarker**: A street address that represents the linear
    distance along a street, typically in kilometers or miles, from a
    designated origin location. Example: Carr 682 KM 4, Barceloneta,
    00617.
  - **PostalExt**: A postal code with an additional extension, such as
    the United States Postal Service ZIP+4. Reference data is postal
    code points with extensions; for example, 90210-3841.
  - **POI**: Points of interest. Reference data consists of
    administrative division place-names, businesses, landmarks, and
    geographic features; for example, Golden Gate Bridge.
  - **Locality**: A place-name representing a populated place. The
    Type output field provides more detailed information about the
    type of populated place. Possible Type values for Locality matches
    include Block, Sector, Neighborhood, District, City, MetroArea,
    County, State or Province, Territory, Country, and Zone. Example:
    Bogotá, COL.
  - **PostalLoc**: A combination of postal code and city name.
    Reference data is typically a union of postal boundaries and
    administrative (locality) boundaries; for example, 7132
    Frauenkirchen.
  - **Postal**: Postal code. Reference data is postal code points; for
    example, 90210 USA.

- **Score**: A number from 1--100 indicating the degree to which the
  input tokens in a geocoding request match the address components in
  a candidate record. A score of 100 represents a perfect match, while
  lower scores represent decreasing match accuracy.

- **Status**: Indicates whether a batch geocode request resulted in a
  match, a tie, or no match. Possible values include:

  - **M** - Match. The returned address matches the input address and
    is the highest scoring candidate.
  - **T** - Tied. The returned address matches the input address but
    has the same score as one or more additional candidates.
  - **U** - Unmatched. No addresses match the inputted address.

- **geometry**: The WKT (well-known text) representation of the
  latitude and longitude. This column may be useful if you're reading
  the CSV into R, Python, or ArcGIS.

- **Region**: The state that `Match_addr` is located in.

- **RegionAbbr**: The abbreviated state name. For example, CA for
  California.

- **Subregion**: The county that the input address is located in.

- **MetroArea**: The name of the metropolitan area that `Match_addr`
  is located in. This field may be blank if the input address is not
  located within a metro area.

- **City**: The city that `Match_addr` is located in.

- **Nbrhd**: The neighborhood that `Match_addr` is located in. Note
  these are ESRI-defined neighborhoods, which may or may not align
  with other sources' neighborhood definitions.
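Once you're satisfied with the match quality, you can convert the
geocoded CSV into an `sf` dataframe using the `Longitude` and
`Latitude` columns, just like any other CSV with lat/lons (a sketch;
the filename is hypothetical):

```{r geocoded-to-sf, eval = FALSE}
library(sf)

geocoded_sf <- read_csv("geocoded_results.csv") %>%
  # Drop rows the geocoder could not match before converting
  filter(!is.na(Longitude), !is.na(Latitude)) %>%
  st_as_sf(
    coords = c("Longitude", "Latitude"),
    crs = "EPSG:4326"
  )
```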
# Geospatial Modeling

Coming soon!

# Bibliography and references

------------------------------------------------------------------------

```{r session-info}
sessionInfo()
```
\ No newline at end of file
+{"title":"Introduction","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"editor_options":{"markdown":{"wrap":72}}},"headingText":"Introduction","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n```{r markdown-setup, include=FALSE}\nknitr::opts_chunk$set(fig.path = \"mapping/www/images/\")\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\nknitr::opts_chunk$set(warning = FALSE)\n\noptions(scipen = 999)\n```\n\n```{r setup, include=FALSE}\nlibrary(tidyverse)\nlibrary(knitr)\nlibrary(kableExtra)\nlibrary(here)\nlibrary(sf)\n```\n\n\nThis guide will teach you the concepts and code you will need for\nmapping and geospatial analysis in R. **This is a long guide, so if you\nneed something specific, we encourage you to scroll to the appropriate\nsection using the Table of Contents on the left.** If you just want copy\nand pasteable code to create different kinds of maps, head to the\n[`Map Gallery`](#map_gallery).\n\nNow let's start mapping!\n\n![](mapping/www/images/yay_maps.gif)\n\n## Geospatial Workflow\n\nThis picture below outlines what we think are the main steps in a\ngeospatial workflow. This guide will be split into sections describing\neach of the steps.\n\n![](mapping/www/images/geospatial_workflow.png)\n\n## Should this be a map?\n\nThe [Urban Institute Data Visualization Style\nGuide](http://urbaninstitute.github.io/graphics-styleguide/) offers some\nblunt but useful suggestions for maps:\n\n> Just because you've got geographic data, doesn't mean that you have to\n> make a map. Many times, there are more efficient storyforms that will\n> get your point across more clearly. If your data shows a very clear\n> geographic trend or if the absolute location of a place or event\n> matters, maps might be the best approach, but sometimes the reflexive\n> impulse to map the data can make you forget that showing the data in\n> another form might answer other---and sometimes more\n> important---questions.\n\nSo we would encourage you to think critically before making a map.\n\n## Why map with R?\n\nR can have a steeper learning curve than point-and-click tools - like\nQGIS or ArcGIS - for geospatial analysis and mapping. But creating maps\nin R has many advantages including:\n\n1) **Reproducibility**: By creating maps with R code, you can easily\n share the outputs and the code that generated the output with\n collaborators, allowing them to replicate your work and catch errors\n easily.\n\n2) **Iteration**: With point and click software like ArcGIS, making 50\n maps would be 50 times the work/time. But using R, we can easily\n make make many iterations of the same map with a few changes to the\n code.\n\n3) **Easy Updates**: Writing code provides a roadmap for others (and\n future you!) to quickly update parts of the map as needed. Say for\n example a collaborator wanted to change the legend colors of 50\n state maps. With R, this is possible in just a few seconds!\n\n4) **An Expansive ecosystem**: There are several R packages that make\n it very easy to get spatial data, create static and interactive\n maps, and perform spatial analyses. This feature rich package\n ecosystem which all play nice together is frankly unmatched by other\n programming languages and even point and click tools like QGIS and\n ArcGIS. 
Some of these R packages include:\n\n - `sf`: For managing and analyzing spatial dataframes\n - `tigris`: For downloading in Census geographies\n - `ggplot2`: For making publication ready static maps\n - `urbnmapr`: For automatically adding Urban styling to static\n maps\n - `mapview`: For making expxploratory interactive maps\n\n5) **Cost**: Most point-and-click tools for geospatial analysis are\n proprietary and expensive. R is free open-source software. The\n software and most of its packages can be used for free by anyone for\n almost any use case.\n\n## Helpful Learning Resources\n\nIn addition to this guide, you may want to look at these other helpful\nresources:\n\n- The Urban Institute [mapping training\n series](https://ui-research.github.io/urbn101-mapping/) (with video\n lectures and notes)\n- Chapters\n [5](https://walker-data.com/census-r/census-geographic-data-and-applications-in-r.html),\n [6](https://walker-data.com/census-r/mapping-census-data-with-r.html),\n and\n [7](https://walker-data.com/census-r/spatial-analysis-with-us-census-data.html)\n from Kyle Walker's Analyzing US Census Data\n [book](https://walker-data.com/census-r/index.html).\n- Andrew Heiss' fantastic mapping\n [guide](https://datavizm20.classes.andrewheiss.com/example/12-example/)\n- All of the vignettes for the [`sf`\n package](https://cran.r-project.org/web/packages/sf/sf.pdf)\n- [Geocomputation with\n R](https://geocompr.robinlovelace.net/index.html): A book by Robin\n Lovelace and others\n- UChicago's R Spatial Workshops:\n \n\n# Get Spatial Data {#get_spatial_data}\n\n------------------------------------------------------------------------\n\n## library(sf) {.tabset .tabset-pills}\n\n### The short version\n\n`library(sf)` stores geospatial data, which are\n**points** (a single longitude/latitude),\n**lines** (a pair of connected points), or\n**polygons** (a collection of points which\nmake a polygon) in a `geometry` column within R dataframes\n\n![](mapping/www/images/amtrak_points_lines_polygons.jpg)\n\nThis is what `sf` dataframe looks like in the console:\n\n```{r print-sf-dataframe}\ndc_parks <- st_read(\"mapping/data/dc_parks.geojson\", \n\t\t\t\t\t\t\t\t\t\tquiet = TRUE)\n\n# Print just the NAME and geometry column\ndc_parks %>%\n select(NAME) %>%\n head(2)\n```\n\n### The long version\n\nThe `sf` library is a key tool for reading in, managing, and working\nwith spatial data in R. `sf` stands for simple features (not San\nFrancisco you Bay Area folks) and denotes a way to describe the spatial\nattributes of real life objects. The R object you will be working with\nmost frequently for mapping is an `sf` dataframe. An `sf` dataframe is\nessentially a regular R dataframe, with a couple of extra features for\nuse in mapping. These extra features exclusive to `sf` dataframes\ninclude:\n\n- sticky `geometry` columns\n- attached coordinate reference systems\n- some other spatial metadata\n\nThe most important of the above list is the sticky `geometry` column,\nwhich is a magical column that contains all of the geographic\ninformation for each row of data. Say for example you had a `sf`\ndataframe of all DC census tracts. Then the `geometry` column would\ncontain all of the geographic points used to define DC census tract\npolygons. The stickiness of this column means that no matter what data\nmunging/filtering you do, you will not be able to drop or delete the\n`geometry` column. 
Below is a graphic to help you understand this:\n\n![](mapping/www/images/sf_sticky_geometry.png)\n\ncredits: @allisonhorst\n\nThis is what an `sf` dataframe looks like in the console:\n\n```{r print_sf}\n# Read in spatial data about DC parks from DC Open Data Portal\ndc_parks <- st_read(\"https://opendata.arcgis.com/api/v3/datasets/287eaa2ecbff4d699762bbc6795ffdca_9/downloads/data?format=geojson&spatialRefId=4326\",\n\t\t\t\t\t\t\t\t\t\tquiet = TRUE)\n\n# dc_parks <- st_read(\"mapping/data/dc_parks.geojson\")\n\n# Select just a few columns for readability\ndc_parks <- dc_parks %>%\n select(NAME, geometry)\n\n# Print to the console\ndc_parks\n```\n\nNote that there is some spatial metadata such as the `Geometry Type`,\n`Bounding Box`, and `CRS` which shows up as a header before the actual\ncontents of the dataframe.\n\nSince `sf` dataframes operate similarly to regular dataframes, we can\nuse all our familiar `tidyverse` functions for data wrangling, including\n`select`, `filter`, `rename`, `mutate`, `group_by` and `summarize`. The\n`sf` package also has many functions that provide easy ways to replicate\ncommon tasks done in other GIS software like spatial joins, clipping,\nand buffering. Almost all of the mapping and geospatial analysis methods\ndescribed in this guide rely on you having an `sf` dataframe. So let's\ntalk about how to get one!\n\n## Importing spatial data {.tabset .tabset-pills}\n\nGetting an `sf` dataframe is always the first step in the geospatial\nworkflow. Here's how to import spatial data for...\n\n### States and counties\n\nWe highly recommend using the `library(urbnmapr)` package, which was\ncreated by folks here at Urban to easily create state and county level\nmaps. The `get_urbn_map()` function in the package allows you to read in\nspatial data on states and counties, with options to include\nterritories. Importantly, it will also display AL and HI as insets on\nthe map in accordance with the Urban Institute Data Visualization Style\nGuide. For information on how to install `urbnmapr`, see the [GitHub\nrepository](https://github.com/UrbanInstitute/urbnmapr).\n\nBelow is an example of how you would use `urbnmapr` to get an `sf`\ndataframe of all the states or counties in the US.\n\n```{r urbnmapr-1, eval=FALSE}\nlibrary(urbnmapr)\n\n# Get state data\nstates <- get_urbn_map(\"states\", sf = TRUE)\n\n# Can also get county data\ncounties <- get_urbn_map(\"counties\", sf = TRUE)\n```\n\n### Other Census geographies\n\nUse the `library(tigris)` package, which allows you to easily download\nTIGER and other cartographic boundaries from the US Census Bureau. In\norder to automatically load in the boundaries as `sf` objects, run\n`r options(tigris_class = \"sf\")` once per R session.\n\n`library(tigris)` has all the standard census geographies, including\ncensus tracts, counties, CBSAs, ZCTAs, congressional districts, tribal\nareas, and more. It also includes other elements such as water, roads,\nand military bases.\n\nBy default, `libraray(tigris)` will download large very large and\ndetailed TIGER line boundary files. For thematic mapping, the smaller\ncartographic boundary files are a better choice, as they are clipped to\nthe shoreline, generalized, and therefore usually smaller in size\nwithout losing too much accuracy. To load cartographic boundaries, use\nthe `cb = TRUE` argument. 
If you are doing detailed geospatial analysis\nand need the most detailed shapefiles, then you should use the detailed\nTIGER line boundary files and set `cb = FALSE`.\n\nBelow is an example of how you would use `library(tigris)` to get a `sf`\ndataframe of all Census tracts in DC for 2019.\n\n```{r tigris-1, eval=FALSE}\nlibrary(tigris)\n\n# Only need to set once per script\noptions(tigris_class = \"sf\")\n\ndc_tracts <- tracts(\n state = \"DC\",\n cb = TRUE,\n year = 2019\n)\n```\n\nUnlike `library(urbnmapr)`, different functions are used to get\ngeographic data for different geographic levels. For instance, the\n`blocks()` function will load census block group data, and the\n`tracts()` function will load tract data. Other functions include\n`block_groups()`, `zctas()` , and `core_based_statistical_areas()`. For\nthe full list of supported geographies and functions, see the [package\nvignette](https://cran.r-project.org/web/packages/tigris/tigris.pdf).\n\nFor folks interested in pulling in Census demographic information along\nwith Census geographies, we recommend checking out the sister package to\n`library(tigris)`: `library(tidycensus)`. That package allows you to\ndownload in Census variables and Census geographic data simultaneously.\n\n### Countries\n\nWe recommend using the `library(rnaturalearth)` package, which is\nsimilar to `library(tigris)` but allows you to download and use\nboundaries beyond the US. Instead of setting class to `sf` one time per\nsession as we did with `library(tigris)`, you must set the\n`returnclass = \"sf\"` argument each time you use a function from the\npackage. Below is an example of downloading in an `sf` dataframe of all\nthe countries in the world.\n\n```{r natural-earth, eval = FALSE}\n\nlibrary(rnaturalearth)\n\nworld <- ne_countries(returnclass = \"sf\")\n\nggplot() +\n geom_sf(data = world, mapping = aes())\n```\n\n### Your own files\n\n#### Shapefiles/GeoJSONS\n\nShapefiles and GeoJSONs are 2 common spatial file formats you will found\nout in the wild. `library(sf)` has a function called `st_read` which\nallows you to easily read in these files as `sf` dataframes. The only\nrequired argument is `dsn` or data source name. This is the filepath of\nthe `.shp` file or the `.geojson` file on your local computer. For\ngeojsons, `dsn` can also be a URL.\n\nBelow is an example of reading in a shapefile of fire stations in DC\nwhich is stored in `mapping/data/shapefiles/`. Note that shapefiles are\nactually stored as 6+ different files inside a folder. You need to\nprovide the filepath to the file ending in `.shp`.\n\n```{r list f-ei}\nlibrary(sf)\n\n# Print out all files in the directory\nlist.files(\"mapping/data/shapefiles\")\n\n# Read in .shp file\ndc_firestations <- st_read(\n dsn = \"mapping/data/shapefiles/Fire_Stations.shp\",\n quiet = TRUE\n)\n```\n\nAnd now `dc_firestations` is an `sf` dataframe you can use for all your\nmapping needs! `st_read` supports reading in a wide variety of other\nspatial file formats, including geodatabases, KML files, and over 200\nothers. For an incomplete list, please see the this `sf`\n[vignette](https://r-spatial.github.io/sf/articles/sf2.html).\n\n#### CSVs or dataframes with lat/lons\n\nIf you have a CSV with geographic information stored in columns, you\nwill need to read in the CSV as a regular R dataframe and then convert\nto an `sf` dataframe. `library(sf)` contains the `st_as_sf()` function\nfor converting regular R dataframes into an `sf` dataframe. 
The two arguments you must specify for this function are:

- `coords`: A length 2 vector with the names of the columns
  corresponding to longitude and latitude (in that order!). For
  example, `c("lon", "lat")`.
- `crs`: The CRS (coordinate reference system) for your
  longitude/latitude coordinates. Remember you need to specify both the
  authority and the SRID code, for example `"EPSG:4326"`. For more
  information on finding and setting CRS codes, please see the
  [`CRS`](#crs) section.

Below is an example of reading in data from a CSV and converting it to
an `sf` dataframe.

```{r make-sf}
library(sf)

# Read in dataset of state capitals which is stored as a csv
state_capitals <- read_csv("mapping/data/state-capitals.csv")

state_capitals <- state_capitals %>%
  # Specify names of the lon/lat columns in the CSV to use to make geometry col
  st_as_sf(
    coords = c("longitude", "latitude"),
    crs = 4326
  )
```

One common mistake: before converting to an `sf` dataframe, you must
drop any rows that have `NA` values for latitude or longitude. If your
data contains `NA` values in those columns, the `st_as_sf()` function
will throw an error.
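Here is a minimal sketch of that cleaning step, assuming a hypothetical
raw dataframe `state_capitals_raw` with `longitude` and `latitude`
columns:

```{r drop-na-coords, eval = FALSE}
library(tidyverse)
library(sf)

state_capitals_clean <- state_capitals_raw %>%
  # Drop rows with missing coordinates before converting to an sf dataframe
  filter(!is.na(longitude), !is.na(latitude)) %>%
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326)
```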
## Appending spatial info to your data

Oftentimes, the data you are working with will just have state or county
identifiers - like FIPS codes or state abbreviations - but will not
contain any geographic information. In this case, you must do the extra
work of downloading the geographic data as an `sf` dataframe and then
joining your non-spatial data to the spatial data. Generally this
involves 3 steps:

1) Reading in your own data as a data frame
2) Reading in the geographic data as an `sf` dataframe
3) Using `left_join` to merge the geographic data with your own
   non-spatial data and create a new expanded `sf` dataframe

Let's say we had a dataframe on CHIP enrollment by state with state
abbreviations.

```{r readin-chip-data}
# read the state CHIP data
chip_by_state <- read_csv("mapping/data/chip-enrollment.csv") %>%
  # clean column names so there are no random spaces/uppercase letters
  janitor::clean_names()

# print to the console
chip_by_state %>% head()
```

In order to convert this to an `sf` dataframe, we need to read in the
spatial boundaries for each state and append it to our dataframe. Here
is how we do that with `get_urbn_map()` and `left_join()`.

```{r append-spatial-info, cache = FALSE}
library(urbnmapr)

# read in state geographic data from urbnmapr
states <- get_urbn_map(map = "states", sf = TRUE)

# left join state geographies to chip data
chip_with_geographies <- states %>%
  left_join(
    chip_by_state,
    # Specify the join columns, which are named slightly differently in the
    # states and chip dataframes respectively
    by = c("state_abbv" = "state_abbreviation")
  )

chip_with_geographies %>%
  select(state_fips, state_abbv, chip_enrollment)
```

```{r append-state-pops, include = FALSE, eval = TRUE, echo = FALSE}
# Read in data on state populations from 2010
state_pops <-
  read_csv("https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv",
    # Set this to disable printing column info to console
    col_types = cols()
  ) %>%
  filter(ages == "total", year == "2010") %>%
  select(state_abbv = `state/region`, population)

chip_with_geographies <- chip_with_geographies %>%
  # Specify left_join from tidylog to print summary messages
  tidylog::left_join(state_pops, by = "state_abbv") %>%
  # Calculate the chip enrollment percentage and append as a column
  mutate(chip_pct = chip_enrollment / population)
```

# Project

## Coordinate Reference Systems {#crs .tabset .tabset-pills}

### The short version

Just watch [this video](https://www.youtube.com/watch?v=vVX-PrBRtTY)
and know the following:

- All spatial data has a CRS, which specifies how to identify a
  location on earth.

- It's important that all spatial datasets you are working with be in
  the same CRS. You can find the CRS with `st_crs()` and change the
  CRS with `st_transform()` (see the sketch after this list).

- The Urban Institute Style Guide requires the use of the Atlas Equal
  Earth Projection (`"ESRI:102003"`) for national maps. For state and
  local maps, use [this](https://github.com/veltman/d3-stateplane)
  handy guide to find an appropriate State Plane projection.
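A minimal sketch of that check-and-align workflow, assuming two
hypothetical `sf` dataframes named `tracts_sf` and `points_sf`:

```{r crs-align-sketch, eval = FALSE}
library(sf)

# Inspect the CRS of each dataset
st_crs(tracts_sf)
st_crs(points_sf)

# Transform one dataset so that both share the same CRS
points_sf <- st_transform(points_sf, crs = st_crs(tracts_sf))
```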
### The long version

Coordinate reference systems (CRS) specify the 3d shape of the earth and
optionally how we project that 3d shape onto a 2d surface. They are an
important part of working with spatial data as you need to ensure that
all the data you are working with are in the same CRS in order for
spatial operations and maps to be accurate.

CRS can be specified either by name (ie Maryland State Plane) or
**S**patial **R**eference System **ID**entifier (SRID). The SRID is a
numeric identifier that uniquely identifies a coordinate reference
system. Generally when referring to an SRID, you need to refer to an
authority (ie the data source) and a unique ID. An example is
`EPSG:26985`, which refers to the Maryland State Plane projection from
the EPSG, or `ESRI:102003`, which refers to the Atlas Equal Area
projection from ESRI. Most CRS codes will be from the EPSG, and some
from ESRI and others. A good resource for finding/validating CRS codes
is [epsg.io](https://epsg.io).

Sidenote: EPSG stands for the now-defunct European Petroleum Survey
Group. And while oil companies have generally been terrible for the
earth, the one nice thing they did for the earth was to set up common
standards for coordinate reference systems.

You might be thinking: isn't the earth just a sphere? Why do we need
all this complicated stuff? The answer is that the earth is [kind
of](https://oceanservice.noaa.gov/facts/earth-round.html) a sphere, but
it's really more of a misshapen ellipsoid which is pudgier at the
equator than at the poles. To visualize how coordinate reference systems
work, imagine that the earth is a (lumpy) orange. Now peel the skin off
the orange and try to flatten it. There are many ways to do it, but all
will create
[distortions](https://twitter.com/neilrkaye/status/1050740679008296967)
of some kind. The CRS will give us the formula we've used to specify the
shape of the orange (usually a sphere or ellipsoid of some kind) and
optionally, specify how we flattened the orange into 2d.

Broadly, there are two kinds of coordinate reference systems:

1) [**Geographic coordinate
   systems**](https://www.ibm.com/support/knowledgecenter/en/SSGU8G_12.1.0/com.ibm.spatial.doc/ids_spat_407.html)

   - (sometimes called unprojected coordinate systems)
   - Specifies a 3d shape for the earth
   - Uses a spheroid/ellipsoid to approximate shape of the earth
   - Usually use decimal degree units (ie latitude/longitude) to
     identify locations on earth

![](mapping/www/images/gcs_image.png)

2) [**Projected coordinate
   systems**](https://mgimond.github.io/Spatial/chp09-0.html#projected-coordinate-systems)

   - Specifies a 3d shape for the earth + a 2d mapping

   - Is a geographic coordinate system + a *projection*

   ![](mapping/www/images/projecting_xkcd.png)

   credit: [xkcd](https://imgs.xkcd.com/comics/projecting.png)

   - **projection**: mathematical formula used to convert a 3d
     coordinate system to a 2d flat coordinate system

   - Many different kinds of projections, including Equal Area,
     Equidistant, Conformal, etc

   - All projections distort the true shape of the earth in some
     way, either in terms of shape, area, or angle. Required
     [xkcd comic](https://xkcd.com/977/)

   - Usually use linear units (ie feet, meters) and are therefore
     useful for distance-based spatial operations (ie creating
     buffers); see the sketch below
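To make that last point concrete, here is a minimal sketch (reusing the
`dc_firestations` data from above) of buffering in a projected CRS with
meter units:

```{r projected-buffer-sketch, eval = FALSE}
library(sf)

# In a projected CRS with linear units, buffer distances mean what you expect
dc_firestations %>%
  st_transform("EPSG:6487") %>% # Maryland State Plane, units = meters
  st_buffer(dist = 500) # a true 500-meter buffer
```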
## Finding the CRS

If you are lucky, your data will have embedded CRS data that will be
automatically detected when the file is read in. This is usually the
case for GeoJSONs (`.geojson`) and shapefiles (`.shp`). When you use
`st_read()` on these files, you should see the CRS displayed in the
metadata:

![](mapping/www/images/sf_crs_pic.png)

You can also use the `st_crs()` function to find the CRS. The CRS code
is located at the end in `ID[authority, SRID]`.

```{r st_crs}
st_crs(dc_firestations)
```

Sometimes, the CRS will be blank or `NA` as the dataset did not specify
the CRS. In that case you **MUST find and set the CRS for your data
before proceeding** with analysis. Below are some good rules of thumb
for finding out what the CRS for your data is:

- For GeoJSONs, the CRS should always be `EPSG:4326` (or WGS 84). The
  official GeoJSON specification states that this is the only valid
  CRS for GeoJSONs, but in the wild, this may not be true 100% of the
  time.
- For shapefiles, there should be a file that ends in `.prj` in the
  same directory as the `.shp` file. This file contains the projection
  information for that file and should be used automatically when
  reading in shapefiles.
- For CSVs with latitude/longitude columns, the CRS is usually
  `EPSG:4326` (or WGS 84).
- Look at the metadata and any accompanying documentation to see if
  the coordinate reference system for the data is specified.

If none of the above rules of thumb apply to you, check out the
`crsuggest` R [package](https://github.com/walkerke/crsuggest).

Once you've identified the appropriate CRS, you can set the CRS for your
data with `st_crs()`:

```{r set_crs, eval = FALSE}
# If you are certain that your data contains coordinates in the Atlas Equal
# Earth projection
st_crs(some_sf_dataframe) <- st_crs("ESRI:102003")
```

## Transforming the CRS

Often you will need to change the CRS for your `sf` dataframe so that
all datasets you are using have the same CRS, or to use a projected CRS
for performing more accurate spatial operations. You can do this with
`st_transform`:

```{r transform-crs}
# Transform CRS from WGS 84 to the Urban-required Atlas Equal Earth projection
state_capitals <- state_capitals %>% st_transform("ESRI:102003")
```

`st_transform()` also allows you to just use the CRS of another `sf`
dataframe when transforming.

```{r transform-crs-with-another-sf-object}
# Transform the CRS of chip_with_geographies to be the same as the CRS of
# state_capitals
chip_with_geographies <- chip_with_geographies %>%
  st_transform(crs = st_crs(state_capitals))
```

If you are working with local data, you should use an appropriate state
plane projection instead of the Atlas Equal Earth projection, which is
meant for national maps. `library(crsuggest)` can simplify the process
of picking an appropriate state plane CRS.

```{r crsuggest-ex, cache = TRUE}
library(crsuggest)

suggest_crs(dc_firestations) %>%
  # Use the value in the "crs_code" column to transform CRS's
  head(4)
```

# Map

In order to start mapping, you need an `sf` dataframe. If you don't have
one, see the [`Get Spatial Data`](#get_spatial_data) section above.

## The basics

### library(ggplot2)

Most mapping in R fits the same theoretical framework as plotting in R
using `library(ggplot2)`. To learn more about ggplot2, visit the Data
Viz
[page](https://urbaninstitute.github.io/r-at-urban/graphics-guide.html#Grammar_of_Graphics_and_Conventions)
or read the official ggplot2 [book](https://ggplot2-book.org/).

The key function for mapping is **the special `geom_sf()` function**
which works with `sf` dataframes. This function magically detects
whether you have point or polygon spatial data and displays the results
on a map.

### A simple map

To make a simple map, add `geom_sf()` to a `ggplot()` and set
`data = an_sf_dataframe`. Below is code for making a map of all 50
states using `library(urbnmapr)`:

```{r first-map, cache = TRUE}
library(urbnmapr)

states <- get_urbn_map("states", sf = TRUE)

ggplot() +
  geom_sf(
    data = states,
    mapping = aes()
  )
```
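To save any of these maps out to a file, `ggsave()` from
`library(ggplot2)` works the same way it does for charts. A minimal
sketch (the filename and dimensions here are just examples):

```{r save-map-sketch, eval = FALSE}
library(ggplot2)

state_map <- ggplot() +
  geom_sf(data = states, mapping = aes())

# Width and height are in inches by default
ggsave("state_map.png", plot = state_map, width = 8, height = 5)
```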
## Styling

### `library(urbnthemes)`

`library(urbnthemes)` automatically styles maps in accordance with the
[Urban Institute Data Visualization Style
Guide](http://urbaninstitute.github.io/graphics-styleguide/). By using
`library(urbnthemes)`, you can create publication-ready maps you can
immediately drop in to Urban research briefs or blog posts.

To install `urbnthemes`, visit the package's [GitHub
repository](https://github.com/UrbanInstitute/urbnthemes) and follow the
instructions. There are 2 ways to use the `urbnthemes` functions:

```{r urbnthemes}
library(urbnthemes)

# You can either run this once per script to automatically style all maps with
# the Urban theme
set_urbn_defaults(style = "map")

# Or you can add `+ theme_urbn_map()` to the end of every map you make
ggplot() +
  geom_sf(states, mapping = aes()) +
  theme_urbn_map()
```

### Layering

You can layer multiple points/lines/polygons on top of each other using
the `+` operator from `library(ggplot2)`. The shapes will appear from
bottom to top (ie the last mapped object will show up on top). It is
important that all layers are in the same CRS (coordinate reference
system).

```{r layers, cache = TRUE}
state_capitals <- state_capitals %>%
  # This will change CRS to ESRI:102003 and shift the AK and HI state capital
  # point locations to the appropriate locations on the inset maps.
  tigris::shift_geometry() %>%
  # For now filter out AK and HI as their state capitals will be slightly off.
  filter(!state %in% c("Alaska", "Hawaii"))

ggplot() +
  geom_sf(
    data = states,
    mapping = aes()
  ) +
  # Note we change the data argument
  geom_sf(
    data = state_capitals,
    mapping = aes(),
    # urbnthemes library has urbn color palettes built in.
    color = palette_urbn_main["yellow"],
    size = 2.0
  ) +
  theme_urbn_map()
```

### Fill and Outline Colors

The same commands used to change colors, opacity, lines, size, etc. in
charts can be used for maps too. To change the colors of the map, just
use the `fill =` and `color =` parameters in `geom_sf()`. `fill` will
change the fill color of polygons; `color` will change the color of
polygon outlines, lines, and points.

Generally, maps that show the magnitude of a variable use the blue
sequential ramp and maps that display positives and negatives use the
diverging color ramp. `library(urbnthemes)` contains built-in helper
variables (like `palette_urbn_main`) for accessing color palettes from
the Urban Data Viz Style Guide. If for example you want states to be
Urban's magenta color:

```{r urbnthemes-pink}
ggplot() +
  geom_sf(states,
    mapping = aes(),
    # Adjust polygon fill color
    fill = palette_urbn_main["magenta"],
    # Adjust polygon outline color
    color = "white"
  ) +
  theme_urbn_map()
```

### Adding text

You can also add text, like state abbreviations, directly to your map
using `geom_sf_text` and the helper function `get_urbn_labels()`.

```{r geom_sf_text}
library(urbnmapr)

ggplot() +
  geom_sf(states,
    mapping = aes(),
    color = "white"
  ) +
  theme_urbn_map() +
  # Generates dataframe of state abbv and appropriate locations to plot them
  geom_sf_text(
    data = get_urbn_labels(
      map = "states",
      sf = TRUE
    ),
    aes(label = state_abbv),
    size = 3
  )
```

There's also `geom_sf_label()` if you want labels with a border.

# Map Gallery {#map_gallery}

Below are copy-and-pasteable examples of maps you can make, after you
have an `sf` dataframe.

## Choropleth Maps

Choropleth maps display geographic areas with shades, colors, or
patterns in proportion to a variable or variables.
Choropleth maps can
represent massive geographies like the entire world and small
geographies like Census tracts. To make a choropleth map, you need to
set `geom_sf(aes(fill = some_variable_name))`. Below are examples.

### Continuous color scale

```{r choropleth_continuous}
# Map of CHIP enrollment percentage by state
chip_with_geographies_map <- chip_with_geographies %>%
  ggplot() +
  geom_sf(aes(
    # Color in states by the chip_pct variable
    fill = chip_pct
  ))


# Below add-ons to the map are optional, but make the map look prettier.
chip_with_geographies_map +
  # scale_fill_gradientn adds colors with more interpolation and reverses color scale
  scale_fill_gradientn(
    # Convert legend from decimal to percentages
    labels = scales::percent_format(),
    # Make legend title more readable
    name = "CHIP Enrollment %",
    # Manually add 0 to lower limit to include it in legend. NA=use maximum value in data
    limits = c(0, NA),
    # Set number of breaks on legend = 3
    n.breaks = 3
  )
```

### Discrete color scale

The quick and dirty way is with `scale_fill_steps()`, which creates
discretized bins for continuous variables:

```{r choropleth_discrete}
chip_with_geographies %>%
  ggplot() +
  geom_sf(aes(
    # Color in states by the chip_pct variable
    fill = chip_pct
  )) +
  scale_fill_steps(
    # Convert legend from decimal to percentages
    labels = scales::percent_format(),
    # Make legend title more readable
    name = "CHIP Enrollment %",
    # Show top and bottom limits on legend
    show.limits = TRUE,
    # Roughly set number of bins. Won't be exact as R uses algorithms under the
    # hood for pretty looking breaks.
    n.breaks = 4
  )
```

Often you will want to manually generate the bins yourself to give you
more fine-grained control over the exact legend text (ie `1% - 1.8%`,
`1.8% - 2.5%`, etc). Below is an example of discretizing the continuous
`chip_pct` variable yourself using `cut_interval()` and a helper
function to get nice looking interval labels:

```{r format_intervals}
# Helper function to clean up R generated intervals into nice looking interval labels
format_interval <- function(interval_text) {
  text <- interval_text %>%
    # Remove open and close brackets which are R generated math notation
    str_remove_all("\\(") %>%
    str_remove_all("\\)") %>%
    str_remove_all("\\[") %>%
    str_remove_all("\\]") %>%
    str_replace_all(",", " — ")

  # Convert decimal ranges to percent ranges
  text <- text %>%
    str_split(" — ") %>%
    map(~ as.numeric(.x) %>%
      scales::percent() %>%
      paste0(collapse = " — ")) %>%
    unlist() %>%
    # By default character vectors are plotted in alphabetical order. We want
    # factors in reverse alphabetical order to get correct colors in ggplot
    fct_rev()

  return(text)
}

chip_with_geographies <- chip_with_geographies %>%
  # cut_interval() divides chip_pct into n groups with equal range
  mutate(chip_pct_interval = cut_interval(chip_pct, n = 5)) %>%
  # Generate nice looking interval labels
  mutate(chip_pct_interval = format_interval(chip_pct_interval))
```

And now we can map the discretized `chip_pct_interval` variable using
`geom_sf()`:

```{r make_discrete_map}
chip_with_geographies %>%
  ggplot() +
  geom_sf(aes(
    # Color in states by the chip_pct variable
    fill = chip_pct_interval
  )) +
  # Default is to use the main Urban palette, which assumes unrelated groups.
  # We adjust colors manually to use the Urban cyan palette.
  scale_fill_manual(
    values = palette_urbn_cyan[c(8, 7, 5, 3, 1)],
    name = "CHIP Enrollment %"
  )
```

In addition to `cut_interval` there are [similar
functions](https://ggplot2.tidyverse.org/reference/cut_interval.html)
for creating intervals/bins with slightly different rules. When creating
bins, be careful as changing the number of bins can drastically change
how the map looks.

## Bubble Maps

This is just a layered map with one polygon layer and one point layer,
where the points are sized in accordance with a variable in your data.

```{r bubble_maps, cache = TRUE}
set_urbn_defaults(style = "map")

# Get sf dataframe of DC tracts
library(tigris)
dc_tracts <- tracts(
  state = "DC",
  year = 2019,
  progress_bar = FALSE
)

# Add bubbles for firestations
ggplot() +
  geom_sf(data = dc_tracts, fill = palette_urbn_main["gray"]) +
  geom_sf(
    data = dc_firestations,
    # Size bubbles by number of trucks at each station
    aes(size = TRUCK),
    color = palette_urbn_main["yellow"],
    # Adjust transparency for readability
    alpha = 0.8
  )
```

## Dot-density Maps

These maps scatter dots within a geographic area. Typically each dot
represents a unit (like 100 people, or 1000 houses). To create this kind
of map, you need to start with an `sf` dataframe that is of `geometry`
type `POLYGON` or `MULTIPOLYGON` and then sample points within the
polygon.

The below code generates a dot-density map representing people of
different races within Washington, DC tracts. The code may look a little
complicated, but the key workhorse function is `st_sample()` which
samples points within each polygon to use in the dot-density map:

```{r dot_density_maps, cache = TRUE}
library(tidycensus)

# Get counts by race of DC tracts
dc_pop <- get_acs(
  geography = "tract",
  state = "DC",
  year = 2019,
  variables = c(
    Hispanic = "DP05_0071",
    White = "DP05_0077",
    Black = "DP05_0078",
    Asian = "DP05_0080"
  ),
  geometry = TRUE,
  progress_bar = FALSE
)

# Get unique groups (ie races)
groups <- unique(dc_pop$variable)

# For each unique group (ie race), generate sampled points
dc_race_dots <- map_dfr(groups, ~ {
  dc_pop %>%
    # .x = the group used in the loop
    filter(variable == .x) %>%
    # Use the projected MD state plane for accuracy
    st_transform(crs = "EPSG:6487") %>%
    # Have every dot represent 100 people
    mutate(est100 = as.integer(estimate / 100)) %>%
    st_sample(size = .$est100, exact = TRUE) %>%
    st_sf() %>%
    # Add group (ie race) as a column so we can use it when plotting
    mutate(group = .x)
})


ggplot() +
  # Plot tracts, then dots on top of tracts
  geom_sf(
    data = dc_pop,
    # Make interior of tracts transparent and boundaries black
    fill = "transparent",
    color = "black"
  ) +
  geom_sf(
    data = dc_race_dots,
    # Color in dots by racial group
    aes(color = group),
    # Adjust transparency and size to be more readable
    alpha = 0.5,
    size = 1.1,
    stroke = FALSE
  )
```

## Geofacets

Geofaceting arranges sub-geography-specific plots into a grid that
resembles a larger geography (usually the US). This can be a useful
alternative to choropleth maps, which tend to overemphasize
geographically large but sparsely populated areas. To make geofacetted
charts, use the `facet_geo()` function from the `geofacet` library,
which can be thought of as equivalent to ggplot2's `facet_wrap()`.
For
this example, we'll use the built-in `state_ranks` data.

```{r geofacet-data}
library(geofacet)

head(state_ranks %>% as_tibble())
```

```{r geofacet-ex, cache = TRUE}
set_urbn_defaults(style = "print")

state_ranks %>%
  filter(variable %in% c("education", "employment")) %>%
  ggplot(aes(x = rank, y = variable)) +
  geom_col() +
  facet_geo(
    facets = "state",
    # Use custom urban geofacet grid which is built into urbnthemes
    # For now we need to rename a few columns as urbnthemes has to be
    # updated
    grid = urbnthemes::urbn_geofacet %>%
      rename(
        code = state_code,
        name = state_name
      )
  )
```

Interactive geofacets of the United States have been used in Urban
Features like [A Matter of
Time](https://apps.urban.org/features/long-prison-terms/trends.html)
which included geofaceted line charts showing trends in incarceration by
state. Static geofacets of the United States were included in [Barriers
to Accessing Homeownership Down Payment, Credit, and
Affordability](https://www.urban.org/sites/default/files/publication/94801/barriers-to-homeownership-down-payments-credit-access-and-affordability_3.pdf)
by the Housing Finance Policy Center.

### Tile grid map

You can select predefined grids, or create your own at
https://hafen.github.io/grid-designer/

```{r tile-grid-map}
# create a grid with all of the US states and territories
mygrid <- data.frame(
  code = c("ME", "AK", "WI", "VT", "NH", "IL", "ID", "WA", "MN", "MT", "ND", "MI", "NY", "MA", "IA", "IN", "CT", "RI", "NJ", "PA", "OH", "SD", "WY", "NV", "OR", "CA", "NE", "DE", "MD", "VA", "WV", "KY", "MO", "CO", "UT", "AZ", "KS", "AR", "DC", "SC", "NC", "TN", "NM", "LA", "AL", "GA", "MS", "OK", "HI", "FL", "TX"),
  row = c(1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8),
  col = c(12, 2, 7, 11, 12, 7, 3, 2, 6, 4, 5, 8, 10, 11, 6, 7, 11, 12, 10, 9, 8, 5, 4, 3, 2, 2, 5, 11, 10, 9, 8, 7, 6, 4, 3, 3, 5, 6, 10, 9, 8, 7, 4, 6, 8, 9, 7, 5, 2, 10, 5),
  stringsAsFactors = FALSE
)

## Combine data into geo_grid for tiling (drop the geometry column since
## tile grid maps don't need it):
geo_grid_data <- mygrid %>%
  left_join(sf::st_drop_geometry(chip_with_geographies),
    by = c("code" = "state_abbv")
  )

## plot tile grid
geo_grid_data %>%
  ggplot(aes(x = col, y = row, fill = chip_pct_interval)) +
  scale_fill_manual(
    values = palette_urbn_cyan[c(8, 7, 5, 3, 1)],
    name = "CHIP Enrollment %"
  ) +
  geom_tile(color = "white", linewidth = 1) +
  geom_text(aes(label = code), color = "white", size = 4) +
  scale_y_reverse() +
  coord_equal() +
  labs(fill = NULL)
```


## Cartograms

Cartograms are a modified form of a choropleth map with intentionally
distorted sizes that map to a variable in your data. Below we create a
cartogram with `library(cartogram)` where the state sizes are
proportional to the population.

```{r cartogram-example, cache = TRUE}
library(cartogram)

set_urbn_defaults(style = "map")

chip_with_geographies_weighted <- chip_with_geographies %>%
  # Note column name needs to be in quotes for this package
  cartogram_cont(weight = "population")

ggplot() +
  geom_sf(
    data = chip_with_geographies_weighted,
    # Color in states by chip percentages
    aes(fill = chip_pct)
  )
```

## Interactive Maps

Interactive maps can be a great tool for exploring and understanding
your data.
Luckily, there are a lot of new R packages that
make it really easy to create them. Interactive maps are powerful but
**we do not recommend them for official use in Urban publications** as
getting them in Urban styles and appropriate basemaps can be tricky
(reach out to
[anarayanan\@urban.org](mailto:anarayanan@urban.org){.email} if you
really want to include them).

### `library(mapview)`

`library(mapview)` is probably the most user-friendly of the interactive
mapping R libraries. All you have to do to create an interactive map is:

```{r show-mapview}
library(mapview)

chip_with_geographies_for_interactive_mapping <- chip_with_geographies %>%
  # Filter out AK and HI because urbnmapr shifts them, so they would appear
  # over Mexico. If you want AK and HI in the correct place in interactive
  # maps, make sure to use tigris::states()
  filter(!state_abbv %in% c("AK", "HI"))

mapview(chip_with_geographies_for_interactive_mapping)
```

When you click on an object, you get a popup table of all its
attributes. And when you hover over an object, you get a popup with an
object id.

Each of the above behaviors can be changed if desired. As you'll see in
the below section, the syntax for `library(mapview)` is significantly
different from `library(ggplot2)`, so be careful!

#### Coloring in points/polygons

In order to create a choropleth map where we color in the
points/polygons by a variable, we need to feed in a column name *in
quotes* to the `zcol` argument inside the `mapview()` function:

```{r mapview_zcol}
# Create interactive state map colored in by chip enrollment
mapview(chip_with_geographies_for_interactive_mapping, zcol = "chip_enrollment")
```

If you want more granular control over the color palette for the legend,
you can also feed in a vector of color hex codes to `col.regions` along
with a column name to `zcol`. This will create a continuous color range
along the provided colors. Be careful though as the color interpolation
is not perfect.

```{r mapview-colors-granular}
# library(RColorBrewer)
mapview(chip_with_geographies_for_interactive_mapping,
  col.regions = c(
    palette_urbn_green[6],
    "white",
    palette_urbn_cyan[6]
  ),
  zcol = "chip_enrollment"
)
```

If you want to color in all points/polygons as the same color, just feed
in a single color hex code to the `col.regions` argument:

```{r mapview-colors}
mapview(chip_with_geographies_for_interactive_mapping,
  col.regions = palette_urbn_green[5]
)
```

#### Adding layers

You can add multiple `sf` objects on the same map by using the `+`
operator. This is very useful when comparing 2 or more spatial datasets.

```{r mapview-layers}
mapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) +
  mapview(state_capitals, col.regions = palette_urbn_cyan[5])
```

You can even create slider maps by using the `|` operator!

```{r mapview-sliders}
mapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) |
  mapview(state_capitals, col.regions = palette_urbn_cyan[5])
```

### More details

To learn more about more advanced options with `mapview` maps, check out
the
[documentation](https://r-spatial.github.io/mapview/articles/articles/mapview_02-advanced.html)
page and the [reference
manual](https://cran.r-project.org/web/packages/mapview/mapview.pdf).

There are also other interactive map making packages in R like `leaflet`
(which `mapview` is a more user-friendly wrapper around), `tmap`, and
`mapdeck`.
To learn about these other packages, [this book
chapter](https://geocompr.robinlovelace.net/adv-map.html#interactive-maps)
is a good starting point.

# Spatial Operations

## Cropping

Cropping (or clipping) is geographically filtering an `sf` dataframe to
just the area we are interested in. Say we wanted to look at the roads
around Fire Station 24 in DC.

```{r roads_cropping_before, cache = TRUE}
library(tigris)
library(units)

dc_firestations <- dc_firestations %>%
  st_transform("EPSG:6487")


# Draw 500 meter circle around one fire station
fire_station_24_buffered <- dc_firestations %>%
  filter(NAME == "Engine 24 Station") %>%
  st_buffer(set_units(500, "meter"))

# Get listing of all roads in DC
dc_roads <- roads(
  state = "DC",
  county = "District of Columbia",
  class = "sf",
  progress_bar = FALSE
) %>%
  st_transform("EPSG:6487")

# View roads on top of fire station buffer
ggplot() +
  # Order matters! We need to plot the buffered fire station first, and then
  # roads on top to see the overlap
  geom_sf(
    data = fire_station_24_buffered,
    fill = palette_urbn_cyan[1],
    color = palette_urbn_cyan[7]
  ) +
  geom_sf(
    data = dc_roads,
    color = palette_urbn_gray[7]
  ) +
  theme_urbn_map()
```

We can clip the larger roads dataframe to just roads that overlap with
the circle around the fire station with `st_intersection()`.

```{r roads_cropping_after}
# Use st_intersection() to crop the roads data to just roads within the
# fire station radius
dc_roads_around_fire_station_24_buffered <- fire_station_24_buffered %>%
  st_intersection(dc_roads)

ggplot() +
  geom_sf(
    data = fire_station_24_buffered,
    fill = palette_urbn_cyan[1],
    color = palette_urbn_cyan[7]
  ) +
  geom_sf(
    data = dc_roads_around_fire_station_24_buffered,
    color = palette_urbn_gray[7]
  ) +
  theme_urbn_map()
```

**More Coming Soon!**

## Calculating Distance
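While this section is still being written, here is a minimal sketch of
the workhorse function, `st_distance()`, reusing the projected
`dc_firestations` data from above:

```{r distance-sketch, eval = FALSE}
library(sf)

# Pairwise distances between the first two fire stations and all stations.
# Because dc_firestations is in EPSG:6487, the result is a units matrix in
# meters.
station_distances <- st_distance(dc_firestations[1:2, ], dc_firestations)

station_distances[, 1:3]
```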
## Spatial Joins

### Point to Polygon

### Polygon to Polygon

## Aggregating

## Drive/Transit times

## Geocoding

Geocoding is the process of turning text (usually addresses) into
geographic coordinates (usually latitudes/longitudes) for use in
mapping. For Urban researchers, we highly recommend using the [Urban
geocoder](https://tech-tools.urban.org/geocoding/) as it is fast,
accurate, designed to work with sensitive/confidential data and, most
importantly, free to use for Urban researchers! To learn about how we
set up and chose the geocoder for the Urban Institute, you can read our
[Data\@Urban
blog](https://medium.com/@urban_institute/choosing-a-geocoder-for-the-urban-institute-86192f656c5f).

### Cleaning Addresses

The single most important factor in getting accurate geocoded data is
having cleaned, well-structured address data. This can prove difficult
as address data out in the wild is often messy and unstandardized. While
the rules for cleaning addresses are very data specific, below are some
examples of clean addresses you should aim for in your data cleaning
process:

```{r cleaned-addr, cache=TRUE,eval=TRUE,results=TRUE, echo=FALSE}
library(gt)
cleaned_address_table <- tribble(
  ~"f_address", ~"Type of address",
  "123 Troy Drive, Pillowtown, CO, 92432", "residential address",
  "789 Abed Avenue, Apt 666, Blankesburg, CO, 92489", "residential apartment address",
  "Shirley Boulevard and Britta Drive, Blanketsburg, CO, 92489", "street intersection",
  "Pillowtown, CO", "city",
  "92489, CO", "Zip Code"
)

gt(cleaned_address_table) %>%
  # tab_header(title = md("Clean Address Examples")) %>%
  opt_row_striping(row_striping = TRUE) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold")
    ),
    locations = cells_column_labels(
      columns = vars(f_address, `Type of address`)
    )
  ) %>%
  opt_align_table_header(align = c("left")) %>%
  tab_options(
    container.width = "100%",
    container.height = "400px",
    # column_labels.background.color = palette_urbn_cyan[1],
    table.border.top.width = 0,
    table.border.bottom.width = 0,
    column_labels.border.bottom.width = 0
  )
```

All that being said, our geocoder is pretty tolerant of different
address formats, typos/spelling errors and missing states, zip codes,
etc. So don't spend too much time cleaning every address in the data.
Also note that while our geocoder is able to geocode cities and zip
codes, it will return the lat/lon of the center of the city/zip code,
which may not be what you want.

### Instructions

To use the [Urban geocoder](https://tech-tools.urban.org/geocoding/),
you will need to:

1) Generate a CSV with a column named `f_address` which contains the
   addresses in single line format (ie
   `123 Abed Avenue, Blanketsburg, CO, 94328`). This means that if you
   have the addresses split across multiple columns (ie `Address`,
   `City`, `State`, `Zip` columns), you will need to concatenate them
   into one column; see the sketch after this list. Also see our
   Cleaning Addresses section above.

2) Go to the Urban geocoder and answer the initial questions. This will
   tell you whether your data is non-confidential or confidential data,
   and allow you to upload your CSV for geocoding.

3) Wait for an email telling you your results are ready. If your data
   is non-confidential, this email will contain a link to your geocoded
   results. This link expires in 24 hours, so make sure to download
   your data before then. If your data is confidential, the email will
   contain a link to the location on the Y Drive where your
   confidential geocoded data is stored. You can specify this output
   folder when submitting the CSV in step 1.
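For step 1, here is a minimal sketch of concatenating hypothetical
`address`, `city`, `state`, and `zip` columns into a single `f_address`
column with `tidyr::unite()`:

```{r build-f-address, eval = FALSE}
library(tidyverse)

addresses <- addresses %>%
  # Collapse the separate address columns into the single-line f_address format
  unite(f_address, address, city, state, zip, sep = ", ", remove = FALSE)

write_csv(addresses, "addresses-for-geocoding.csv")
```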
### Geocoder outputs

The geocoded file will be your original data, plus a few more columns
(including latitude and longitude). The table below describes each of
the new columns that have been appended to your original data. [It's
very important that you take a look at the `Addr_type`
column]{style="background-color: #FFFF00; font-weight: bold"} in the
CSV before doing further analysis to check the accuracy of the geocoding
process.
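Before the column-by-column details, here is a minimal sketch (assuming
your results file is named `geocoded_results.csv`; the column names are
the geocoder outputs described in the table below) of filtering to
high-accuracy matches and converting the output back into an `sf`
dataframe:

```{r read-geocoded-results, eval = FALSE}
library(tidyverse)
library(sf)

geocoded <- read_csv("geocoded_results.csv") %>%
  # Keep only the high-accuracy match levels (see Addr_type below)
  filter(Addr_type %in% c("Subaddress", "PointAddress", "StreetAddress", "StreetInt")) %>%
  # Longitude/Latitude are returned in WGS 84 (EPSG:4326)
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326)
```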
\n\n+---------------+---------------------------------------------------+\n| Column | Description |\n+:==============+:==================================================+\n| Match_addr | The actual address that the inputted address was |\n| | matched to. This is the address that the geocoder |\n| | used to get Latitudes / Longitudes. If there are |\n| | potentially many typos or non standard address |\n| | formats in your data file, you will want to take |\n| | a close look at this column to confirm that the |\n| | matched address correctly handled typos and badly |\n| | formatted addresses. |\n+---------------+---------------------------------------------------+\n| Longitude | The WGS 84 datum Longitude (EPSG code 4326) |\n+---------------+---------------------------------------------------+\n| Latitude | The WGS 84 datum Latitude (EPSG code 4326) |\n+---------------+---------------------------------------------------+\n| Addr_type | The match level for a geocode request. This |\n| | should be used as an indicator of the precision |\n| | of geocode results. Generally, Subaddress, |\n| | PointAddress, StreetAddress, and StreetInt |\n| | represent accurate matches. The list below |\n| | contains all possible values for this field. |\n| | **Green values represent High accuracy matches, |\n| | yellow represents Medium accuracy matches and red |\n| | represents Low accuracy/inaccurate matches**. If |\n| | you have many yellow and red values in your data, |\n| | you should manually check the results before |\n| | proceeding with analysis. All possible values:\\ |\n| | \\ |\n| | **Subaddress:** A subset of a PointAddress that |\n| | represents a house or building subaddress |\n| | location, such as an apartment unit, floor, or |\n| | individual building within a complex. The |\n| | UnitName, UnitType, LevelName, LevelType, |\n| | BldgName, and BldgType field values help to |\n| | distinguish subaddresses which may be associated |\n| | with the same PointAddress. Reference data |\n| | consists of point features with associated house |\n| | number, street name, and subaddress elements, |\n| | along with administrative divisions and optional |\n| | postal code; for example, 3836 Emerald Ave, Suite |\n| | C, La Verne, CA, 91750.\\ |\n| | \\ |\n| | **PointAddress:** A street address based on |\n| | points that represent house and building |\n| | locations. Typically, this is the most spatially |\n| | accurate match level. Reference data contains |\n| | address points with associated house numbers and |\n| | street names, along with administrative divisions |\n| | and optional postal code. The X / Y |\n| | (`Longitude`/`Latitude`) and `geometry` output |\n| | values for a PointAddress match represent the |\n| | street entry location for the address; this is |\n| | the location used for routing operations. The |\n| | `DisplayX` and `DisplayY` values represent the |\n| | rooftop, or actual, location of the address. |\n| | Example: 380 New York St, Redlands, CA, 92373.\\ |\n| | \\ |\n| | **StreetAddress** --- A street address that |\n| | differs from PointAddress because the house |\n| | number is interpolated from a range of numbers. 
|\n| | Reference data contains street center lines with |\n| | house number ranges, along with administrative |\n| | divisions and optional postal code information, |\n| | for example, 647 Haight St, San Francisco, CA, |\n| | 94117.\\ |\n| | \\ |\n| | **StreetInt:** A street address consisting of a |\n| | street intersection along with city and optional |\n| | state and postal code information. This is |\n| | derived from StreetAddress reference data, for |\n| | example, Redlands Blvd & New York St, Redlands, |\n| | CA, 92373.\\ |\n| | \\ |\n| | **StreetName:** Similar to a street address but |\n| | without the house number. Reference data contains |\n| | street centerlines with associated street names |\n| | (no numbered address ranges), along with |\n| | administrative divisions and optional postal |\n| | code, for example, W Olive Ave, Redlands, CA, |\n| | 92373.\\ |\n| | \\ |\n| | **StreetAddressExt:** An interpolated street |\n| | address match that is returned when parameter |\n| | matchOutOfRange=true and the input house number |\n| | exceeds the house number range for the matched |\n| | street segment.\\ |\n| | \\ |\n| | **DistanceMarker:** A street address that |\n| | represents the linear distance along a street, |\n| | typically in kilometers or miles, from a |\n| | designated origin location. Example: Carr 682 KM |\n| | 4, Barceloneta, 00617.\\ |\n| | \\ |\n| | **PostalExt:** A postal code with an additional |\n| | extension, such as the United States Postal |\n| | Service ZIP+4. Reference data is postal code |\n| | points with extensions, for example, 90210-3841.\\ |\n| | \\ |\n| | **POI:** ---Points of interest. Reference data |\n| | consists of administrative division place-names, |\n| | businesses, landmarks, and geographic features, |\n| | for example, Golden Gate Bridge.\\ |\n| | \\ |\n| | **Locality:** A place-name representing a |\n| | populated place. The Type output field provides |\n| | more detailed information about the type of |\n| | populated place. Possible Type values for |\n| | Locality matches include Block, Sector, |\n| | Neighborhood, District, City, MetroArea, County, |\n| | State or Province, Territory, Country, and Zone. |\n| | Example: Bogotá, COL,\\ |\n| | \\ |\n| | **PostalLoc:** A combination of postal code and |\n| | city name. Reference data is typically a union of |\n| | postal boundaries and administrative (locality) |\n| | boundaries, for example, 7132 Frauenkirchen.\\ |\n| | \\ |\n| | **Postal:** Postal code. Reference data is postal |\n| | code points, for example, 90210 USA. |\n+---------------+---------------------------------------------------+\n| Score | A number from 1--100 indicating the degree to |\n| | which the input tokens in a geocoding request |\n| | match the address components in a candidate |\n| | record. A score of 100 represents a perfect |\n| | match, while lower scores represent decreasing |\n| | match accuracy. |\n+---------------+---------------------------------------------------+\n| Status | Indicates whether a batch geocode request results |\n| | in a match, tie, or unmatched. Possible values |\n| | include\\ |\n| | \\ |\n| | M - Match. The returned address matches the input |\n| | address and is the highest scoring candidate.\\ |\n| | \\ |\n| | T - Tied. The returned address matches the input |\n| | address but has the same score as one or more |\n| | additional candidates.\\ |\n| | \\ |\n| | U - Unmatched. No addresses match the inputted |\n| | address. 
|
+---------------+---------------------------------------------------+
| geometry | The WKT (Well-known text) representation of the |
| | latitudes and longitudes. This column may be |
| | useful if you're reading the CSV into R, Python, |
| | or ArcGIS |
+---------------+---------------------------------------------------+
| Region | The state that `Match_addr` is located in |
+---------------+---------------------------------------------------+
| RegionAbbr | Abbreviated State Name. For example, CA for |
| | California |
+---------------+---------------------------------------------------+
| Subregion | The county that the input address is located in |
+---------------+---------------------------------------------------+
| MetroArea | The name of the Metropolitan area that |
| | `Match_addr` is located in. This field may be |
| | blank if the input address is not located within |
| | a metro area. |
+---------------+---------------------------------------------------+
| City | The city that `Match_addr` is located in |
+---------------+---------------------------------------------------+
| Nbrhd | The Neighborhood that `Match_addr` is located in. |
| | Note these are ESRI defined neighborhoods which |
| | may or may not align with other sources' |
| | neighborhood definitions |
+---------------+---------------------------------------------------+

# Geospatial Modeling

Coming soon!

# Bibliography and references

------------------------------------------------------------------------

```{r session-info}
sessionInfo()
```
If your data shows a very clear\n> geographic trend or if the absolute location of a place or event\n> matters, maps might be the best approach, but sometimes the reflexive\n> impulse to map the data can make you forget that showing the data in\n> another form might answer other---and sometimes more\n> important---questions.\n\nSo we would encourage you to think critically before making a map.\n\n## Why map with R?\n\nR can have a steeper learning curve than point-and-click tools - like\nQGIS or ArcGIS - for geospatial analysis and mapping. But creating maps\nin R has many advantages including:\n\n1) **Reproducibility**: By creating maps with R code, you can easily\n share the outputs and the code that generated the output with\n collaborators, allowing them to replicate your work and catch errors\n easily.\n\n2) **Iteration**: With point and click software like ArcGIS, making 50\n maps would be 50 times the work/time. But using R, we can easily\n make make many iterations of the same map with a few changes to the\n code.\n\n3) **Easy Updates**: Writing code provides a roadmap for others (and\n future you!) to quickly update parts of the map as needed. Say for\n example a collaborator wanted to change the legend colors of 50\n state maps. With R, this is possible in just a few seconds!\n\n4) **An Expansive ecosystem**: There are several R packages that make\n it very easy to get spatial data, create static and interactive\n maps, and perform spatial analyses. This feature rich package\n ecosystem which all play nice together is frankly unmatched by other\n programming languages and even point and click tools like QGIS and\n ArcGIS. Some of these R packages include:\n\n - `sf`: For managing and analyzing spatial dataframes\n - `tigris`: For downloading in Census geographies\n - `ggplot2`: For making publication ready static maps\n - `urbnmapr`: For automatically adding Urban styling to static\n maps\n - `mapview`: For making expxploratory interactive maps\n\n5) **Cost**: Most point-and-click tools for geospatial analysis are\n proprietary and expensive. R is free open-source software. 
The\n software and most of its packages can be used for free by anyone for\n almost any use case.\n\n## Helpful Learning Resources\n\nIn addition to this guide, you may want to look at these other helpful\nresources:\n\n- The Urban Institute [mapping training\n series](https://ui-research.github.io/urbn101-mapping/) (with video\n lectures and notes)\n- Chapters\n [5](https://walker-data.com/census-r/census-geographic-data-and-applications-in-r.html),\n [6](https://walker-data.com/census-r/mapping-census-data-with-r.html),\n and\n [7](https://walker-data.com/census-r/spatial-analysis-with-us-census-data.html)\n from Kyle Walker's Analyzing US Census Data\n [book](https://walker-data.com/census-r/index.html).\n- Andrew Heiss' fantastic mapping\n [guide](https://datavizm20.classes.andrewheiss.com/example/12-example/)\n- All of the vignettes for the [`sf`\n package](https://cran.r-project.org/web/packages/sf/sf.pdf)\n- [Geocomputation with\n R](https://geocompr.robinlovelace.net/index.html): A book by Robin\n Lovelace and others\n- UChicago's R Spatial Workshops:\n \n\n# Get Spatial Data {#get_spatial_data}\n\n------------------------------------------------------------------------\n\n## library(sf) {.tabset .tabset-pills}\n\n### The short version\n\n`library(sf)` stores geospatial data, which are\n**points** (a single longitude/latitude),\n**lines** (a pair of connected points), or\n**polygons** (a collection of points which\nmake a polygon) in a `geometry` column within R dataframes\n\n![](mapping/www/images/amtrak_points_lines_polygons.jpg)\n\nThis is what `sf` dataframe looks like in the console:\n\n```{r print-sf-dataframe}\ndc_parks <- st_read(\"mapping/data/dc_parks.geojson\", \n\t\t\t\t\t\t\t\t\t\tquiet = TRUE)\n\n# Print just the NAME and geometry column\ndc_parks %>%\n select(NAME) %>%\n head(2)\n```\n\n### The long version\n\nThe `sf` library is a key tool for reading in, managing, and working\nwith spatial data in R. `sf` stands for simple features (not San\nFrancisco you Bay Area folks) and denotes a way to describe the spatial\nattributes of real life objects. The R object you will be working with\nmost frequently for mapping is an `sf` dataframe. An `sf` dataframe is\nessentially a regular R dataframe, with a couple of extra features for\nuse in mapping. These extra features exclusive to `sf` dataframes\ninclude:\n\n- sticky `geometry` columns\n- attached coordinate reference systems\n- some other spatial metadata\n\nThe most important of the above list is the sticky `geometry` column,\nwhich is a magical column that contains all of the geographic\ninformation for each row of data. Say for example you had a `sf`\ndataframe of all DC census tracts. Then the `geometry` column would\ncontain all of the geographic points used to define DC census tract\npolygons. The stickiness of this column means that no matter what data\nmunging/filtering you do, you will not be able to drop or delete the\n`geometry` column. 
Below is a graphic to help you understand this:\n\n![](mapping/www/images/sf_sticky_geometry.png)\n\ncredits: @allisonhorst\n\nThis is what an `sf` dataframe looks like in the console:\n\n```{r print_sf}\n# Read in spatial data about DC parks from DC Open Data Portal\ndc_parks <- st_read(\"https://opendata.arcgis.com/api/v3/datasets/287eaa2ecbff4d699762bbc6795ffdca_9/downloads/data?format=geojson&spatialRefId=4326\",\n\t\t\t\t\t\t\t\t\t\tquiet = TRUE)\n\n# dc_parks <- st_read(\"mapping/data/dc_parks.geojson\")\n\n# Select just a few columns for readability\ndc_parks <- dc_parks %>%\n select(NAME, geometry)\n\n# Print to the console\ndc_parks\n```\n\nNote that there is some spatial metadata such as the `Geometry Type`,\n`Bounding Box`, and `CRS` which shows up as a header before the actual\ncontents of the dataframe.\n\nSince `sf` dataframes operate similarly to regular dataframes, we can\nuse all our familiar `tidyverse` functions for data wrangling, including\n`select`, `filter`, `rename`, `mutate`, `group_by` and `summarize`. The\n`sf` package also has many functions that provide easy ways to replicate\ncommon tasks done in other GIS software like spatial joins, clipping,\nand buffering. Almost all of the mapping and geospatial analysis methods\ndescribed in this guide rely on you having an `sf` dataframe. So let's\ntalk about how to get one!\n\n## Importing spatial data {.tabset .tabset-pills}\n\nGetting an `sf` dataframe is always the first step in the geospatial\nworkflow. Here's how to import spatial data for...\n\n### States and counties\n\nWe highly recommend using the `library(urbnmapr)` package, which was\ncreated by folks here at Urban to easily create state and county level\nmaps. The `get_urbn_map()` function in the package allows you to read in\nspatial data on states and counties, with options to include\nterritories. Importantly, it will also display AL and HI as insets on\nthe map in accordance with the Urban Institute Data Visualization Style\nGuide. For information on how to install `urbnmapr`, see the [GitHub\nrepository](https://github.com/UrbanInstitute/urbnmapr).\n\nBelow is an example of how you would use `urbnmapr` to get an `sf`\ndataframe of all the states or counties in the US.\n\n```{r urbnmapr-1, eval=FALSE}\nlibrary(urbnmapr)\n\n# Get state data\nstates <- get_urbn_map(\"states\", sf = TRUE)\n\n# Can also get county data\ncounties <- get_urbn_map(\"counties\", sf = TRUE)\n```\n\n### Other Census geographies\n\nUse the `library(tigris)` package, which allows you to easily download\nTIGER and other cartographic boundaries from the US Census Bureau. In\norder to automatically load in the boundaries as `sf` objects, run\n`r options(tigris_class = \"sf\")` once per R session.\n\n`library(tigris)` has all the standard census geographies, including\ncensus tracts, counties, CBSAs, ZCTAs, congressional districts, tribal\nareas, and more. It also includes other elements such as water, roads,\nand military bases.\n\nBy default, `libraray(tigris)` will download large very large and\ndetailed TIGER line boundary files. For thematic mapping, the smaller\ncartographic boundary files are a better choice, as they are clipped to\nthe shoreline, generalized, and therefore usually smaller in size\nwithout losing too much accuracy. To load cartographic boundaries, use\nthe `cb = TRUE` argument. 
If you are doing detailed geospatial analysis\nand need the most detailed shapefiles, then you should use the detailed\nTIGER line boundary files and set `cb = FALSE`.\n\nBelow is an example of how you would use `library(tigris)` to get a `sf`\ndataframe of all Census tracts in DC for 2019.\n\n```{r tigris-1, eval=FALSE}\nlibrary(tigris)\n\n# Only need to set once per script\noptions(tigris_class = \"sf\")\n\ndc_tracts <- tracts(\n state = \"DC\",\n cb = TRUE,\n year = 2019\n)\n```\n\nUnlike `library(urbnmapr)`, different functions are used to get\ngeographic data for different geographic levels. For instance, the\n`blocks()` function will load census block group data, and the\n`tracts()` function will load tract data. Other functions include\n`block_groups()`, `zctas()` , and `core_based_statistical_areas()`. For\nthe full list of supported geographies and functions, see the [package\nvignette](https://cran.r-project.org/web/packages/tigris/tigris.pdf).\n\nFor folks interested in pulling in Census demographic information along\nwith Census geographies, we recommend checking out the sister package to\n`library(tigris)`: `library(tidycensus)`. That package allows you to\ndownload in Census variables and Census geographic data simultaneously.\n\n### Countries\n\nWe recommend using the `library(rnaturalearth)` package, which is\nsimilar to `library(tigris)` but allows you to download and use\nboundaries beyond the US. Instead of setting class to `sf` one time per\nsession as we did with `library(tigris)`, you must set the\n`returnclass = \"sf\"` argument each time you use a function from the\npackage. Below is an example of downloading in an `sf` dataframe of all\nthe countries in the world.\n\n```{r natural-earth, eval = FALSE}\n\nlibrary(rnaturalearth)\n\nworld <- ne_countries(returnclass = \"sf\")\n\nggplot() +\n geom_sf(data = world, mapping = aes())\n```\n\n### Your own files\n\n#### Shapefiles/GeoJSONS\n\nShapefiles and GeoJSONs are 2 common spatial file formats you will found\nout in the wild. `library(sf)` has a function called `st_read` which\nallows you to easily read in these files as `sf` dataframes. The only\nrequired argument is `dsn` or data source name. This is the filepath of\nthe `.shp` file or the `.geojson` file on your local computer. For\ngeojsons, `dsn` can also be a URL.\n\nBelow is an example of reading in a shapefile of fire stations in DC\nwhich is stored in `mapping/data/shapefiles/`. Note that shapefiles are\nactually stored as 6+ different files inside a folder. You need to\nprovide the filepath to the file ending in `.shp`.\n\n```{r list f-ei}\nlibrary(sf)\n\n# Print out all files in the directory\nlist.files(\"mapping/data/shapefiles\")\n\n# Read in .shp file\ndc_firestations <- st_read(\n dsn = \"mapping/data/shapefiles/Fire_Stations.shp\",\n quiet = TRUE\n)\n```\n\nAnd now `dc_firestations` is an `sf` dataframe you can use for all your\nmapping needs! `st_read` supports reading in a wide variety of other\nspatial file formats, including geodatabases, KML files, and over 200\nothers. For an incomplete list, please see the this `sf`\n[vignette](https://r-spatial.github.io/sf/articles/sf2.html).\n\n#### CSVs or dataframes with lat/lons\n\nIf you have a CSV with geographic information stored in columns, you\nwill need to read in the CSV as a regular R dataframe and then convert\nto an `sf` dataframe. `library(sf)` contains the `st_as_sf()` function\nfor converting regular R dataframes into an `sf` dataframe. 
\n\n#### CSVs or dataframes with lat/lons\n\nIf you have a CSV with geographic information stored in columns, you\nwill need to read in the CSV as a regular R dataframe and then convert\nto an `sf` dataframe. `library(sf)` contains the `st_as_sf()` function\nfor converting regular R dataframes into an `sf` dataframe. The two\narguments you must specify for this function are:\n\n- `coords`: A length 2 vector with the names of the columns\n corresponding to longitude and latitude (in that order!). For\n example, `c(\"lon\", \"lat\")`.\n- `crs`: The CRS (coordinate reference system) for your\n longitude/latitude coordinates. Remember you need to specify both\n the authority and the SRID code, for example `\"EPSG:4326\"`. For more\n information on finding and setting CRS codes, please see the\n [`CRS`](#crs) section.\n\nBelow is an example of reading in data from a CSV and converting it to\nan `sf` dataframe.\n\n```{r make-sf}\nlibrary(sf)\n\n# Read in dataset of state capitals which is stored as a csv\nstate_capitals <- read_csv(\"mapping/data/state-capitals.csv\")\n\nstate_capitals <- state_capitals %>%\n # Specify names of the lon/lat columns in the CSV to use to make geometry col\n st_as_sf(\n coords = c(\"longitude\", \"latitude\"),\n crs = 4326\n )\n```\n\nOne common pitfall: before converting to an `sf` dataframe, you\nmust drop any rows that have `NA` values for latitude or longitude. If\nyour data contains `NA` values in those columns, the `st_as_sf()` function will\nthrow an error.
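\n\nHere is a minimal sketch of that pitfall, using a made-up dataframe with some missing coordinates:\n\n```{r drop-na-coords, eval = FALSE}\nlibrary(sf)\nlibrary(tidyverse)\n\n# Made-up data where some lon/lat values are missing\nlocations <- tibble(\n name = c(\"a\", \"b\", \"c\"),\n lon = c(-77.01, NA, -76.99),\n lat = c(38.90, 38.91, NA)\n)\n\nlocations_sf <- locations %>%\n # Drop rows with missing coordinates before converting\n drop_na(lon, lat) %>%\n st_as_sf(\n coords = c(\"lon\", \"lat\"),\n crs = 4326\n )\n```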
\n\n## Appending spatial info to your data\n\nOftentimes, the data you are working with will just have state or county\nidentifiers - like FIPS codes or state abbreviations - but will not\ncontain any geographic information. In this case, you must do the extra\nwork of downloading the geographic data as an `sf` dataframe and then\njoining your non-spatial data to the spatial data. Generally this\ninvolves 3 steps:\n\n1) Reading in your own data as a data frame\n2) Reading in the geographic data as an `sf` dataframe\n3) Using `left_join` to merge the geographic data with your own\n non-spatial data and create a new expanded `sf` dataframe\n\nLet's say we had a dataframe on CHIP enrollment by state with state\nabbreviations.\n\n```{r readin-chip-data}\n\n# read the state CHIP data\nchip_by_state <- read_csv(\"mapping/data/chip-enrollment.csv\") %>%\n # clean column names so there are no random spaces/uppercase letters\n janitor::clean_names()\n\n# print to the console\nchip_by_state %>% head()\n```\n\nIn order to convert this to an `sf` dataframe, we need to read in the\nspatial boundaries for each state and append them to our dataframe. Here\nis how we do that with `get_urbn_map()` and `left_join()`.\n\n```{r append-spatial-info, cache = FALSE}\nlibrary(urbnmapr)\n\n# read in state geographic data from urbnmapr\nstates <- get_urbn_map(map = \"states\", sf = TRUE)\n\n# left join state geographies to chip data\nchip_with_geographies <- states %>%\n left_join(\n chip_by_state,\n # Specify the join columns, which are named slightly differently in states\n # and chip respectively\n by = c(\"state_abbv\" = \"state_abbreviation\")\n )\n\nchip_with_geographies %>%\n select(state_fips, state_abbv, chip_enrollment)\n```\n\n```{r append-state-pops, include = FALSE, eval = TRUE, echo = FALSE}\n# TODO: DELETE THIS\n\n# Read in data on state populations from 2010\nstate_pops <-\n read_csv(\"https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv\",\n # Set this to disable printing column info to console\n col_types = cols()\n ) %>%\n filter(ages == \"total\", year == \"2010\") %>%\n select(state_abbv = `state/region`, population)\n\nchip_with_geographies <- chip_with_geographies %>%\n # Specify left_join from tidylog to print summary messages\n tidylog::left_join(state_pops, by = \"state_abbv\") %>%\n # Calculate the chip enrollment percentage and append as a column\n mutate(chip_pct = chip_enrollment / population)\n```\n\n# Project\n\n## Coordinate Reference Systems {#crs .tabset .tabset-pills}\n\n### The short version\n\nJust watch [this video](https://www.youtube.com/watch?v=vVX-PrBRtTY)\nand know the following:\n\n- All spatial data has a CRS, which specifies how to identify a\n location on earth.\n\n- It's important that all spatial datasets you are working with be in\n the same CRS. You can find the CRS with `st_crs()` and change the\n CRS with `st_transform()`.\n\n- The Urban Institute Style Guide requires the use of the Atlas Equal\n Earth Projection (`\"ESRI:102003\"`) for national maps. For state and\n local maps, use [this](https://github.com/veltman/d3-stateplane)\n handy guide to find an appropriate State Plane projection.\n\n### The long version\n\nCoordinate reference systems (CRS) specify the 3d shape of the earth and\noptionally how we project that 3d shape onto a 2d surface. They are an\nimportant part of working with spatial data as you need to ensure that\nall the data you are working with are in the same CRS in order for\nspatial operations and maps to be accurate.\n\nCRS can be specified either by name (ie Maryland State Plane) or\n**S**patial **R**eference System **ID**entifier (SRID). The SRID is a\nnumeric identifier that uniquely identifies a coordinate reference\nsystem. Generally when referring to an SRID, you need to refer to an\nauthority (ie the data source) and a unique ID. An example is\n`EPSG:26985`, which refers to the Maryland State Plane projection from\nthe EPSG, or `ESRI:102003`, which refers to the Atlas Equal Earth\nprojection from ESRI. Most CRS codes will be from the EPSG, and some\nfrom ESRI and other authorities. A good resource for finding/validating\nCRS codes is [epsg.io](https://epsg.io).\n\nSidenote - EPSG stands for the now defunct European Petroleum Survey\nGroup. And while oil companies have generally been terrible for the\nearth, one nice thing they did for it was to set up common\nstandards for coordinate reference systems.\n\nYou might be thinking: well, isn't the earth just a sphere? Why do we need\nall this complicated stuff? 
The answer is that the earth is [kind\nof](https://oceanservice.noaa.gov/facts/earth-round.html) a sphere, but\nit's really more of a misshapen ellipsoid which is pudgier at the\nequator than at the poles. To visualize how coordinate reference systems\nwork, imagine that the earth is a (lumpy) orange. Now peel the skin off\nan orange and try to flatten it. There are many ways to do it, but all\nwill create\n[distortions](https://twitter.com/neilrkaye/status/1050740679008296967)\nof some kind. The CRS will give us the formula we've used to specify the\nshape of the orange (usually a sphere or ellipsoid of some kind) and,\noptionally, specify how we flattened the orange into 2d.\n\nBroadly, there are two kinds of Coordinate Reference Systems:\n\n1) [**Geographic coordinate\n systems**](https://www.ibm.com/support/knowledgecenter/en/SSGU8G_12.1.0/com.ibm.spatial.doc/ids_spat_407.html)\n\n - (sometimes called unprojected coordinate systems)\n - Specifies a 3d shape for the earth\n - Uses a spheroid/ellipsoid to approximate the shape of the earth\n - Usually uses decimal degree units (ie latitude/longitude) to\n identify locations on earth\n\n![](mapping/www/images/gcs_image.png)\n\n2) [**Projected coordinate\n systems**](https://mgimond.github.io/Spatial/chp09-0.html#projected-coordinate-systems)\n\n - Specifies a 3d shape for the earth + a 2d mapping\n\n - Is a geographic coordinate system + a *projection*\n\n ![](mapping/www/images/projecting_xkcd.png)\n\n credit: [xkcd](https://imgs.xkcd.com/comics/projecting.png)\n\n - **projection**: mathematical formula used to convert a 3d\n coordinate system to a 2d flat coordinate system\n\n - Many different kinds of projections, including Equal Area,\n Equidistant, Conformal, etc\n\n - All projections distort the true shape of the earth in some\n way, either in terms of shape, area, or angle. Required reading:\n this [xkcd comic](https://xkcd.com/977/)\n\n - Usually uses linear units (ie feet, meters) and is therefore\n useful for distance-based spatial operations (ie creating\n buffers)\n\n## Finding the CRS\n\nIf you are lucky, your data will have embedded CRS data that will be\nautomatically detected when the file is read in. This is usually the\ncase for GeoJSONs (`.geojson`) and shapefiles (`.shp`). When you use\n`st_read()` on these files, you should see the CRS displayed in the\nmetadata:\n\n![](mapping/www/images/sf_crs_pic.png)\n\nYou can also use the `st_crs()` function to find the CRS. The CRS code is\nlocated at the end in `ID[authority, SRID]`.\n\n```{r st_crs}\nst_crs(dc_firestations)\n```\n\nSometimes, the CRS will be blank or `NA` because the dataset did not specify\nthe CRS. In that case you **MUST find and set the CRS for your data\nbefore proceeding** with analysis. Below are some good rules of thumb\nfor finding out what the CRS for your data is:\n\n- For geojsons, the CRS should always be `EPSG:4326` (or WGS 84). The\n official geojson specification states that this is the only valid\n CRS for geojsons, but in the wild, this may not be true 100% of the\n time.\n- For shapefiles, there should be a file that ends in `.prj` in the\n same directory as the `.shp` file. 
This file contains the projection\n information for that file and should be used automatically when\n reading in shapefiles.\n- For CSVs with latitude/longitude columns, the CRS is usually\n `EPSG:4326` (or WGS 84).\n- Look at the metadata and any accompanying documentation to see if\n the coordinate reference system for the data is specified\n\nIf none of the above rules of thumb apply to you, check out the\n`crsuggest` R [package](https://github.com/walkerke/crsuggest).\n\nOnce you've identified the appropriate CRS, you can set the CRS for your\ndata with `st_crs()`:\n\n```{r set_crs, eval = FALSE}\n\n# If you are certain that your data contains coordinates in the ESRI Atlas Equal Earth projection\nst_crs(some_sf_dataframe) <- st_crs(\"ESRI:102003\")\n```\n\n## Transforming the CRS\n\nOften you will need to change the CRS for your `sf` dataframe so that\nall datasets you are using have the same CRS, or to use a projected CRS\nfor performing more accurate spatial operations. You can do this with\n`st_transform`:\n\n```{r transform-crs}\n# Transforming CRS from WGS 84 to the Urban-required Equal Earth Projection\nstate_capitals <- state_capitals %>% st_transform(\"ESRI:102003\")\n```\n\n`st_transform()` also allows you to just use the CRS of another `sf`\ndataframe when transforming.\n\n```{r transform-crs-with-another-sf-object}\n# transform CRS of chip_with_geographies to be the same as CRS of state_capitals\nchip_with_geographies <- chip_with_geographies %>%\n st_transform(crs = st_crs(state_capitals))\n```\n\nIf you are working with local data, you should use an appropriate state\nplane projection instead of the Atlas Equal Earth projection, which is\nmeant for national maps. `library(crsuggest)` can simplify the process\nof picking an appropriate state plane CRS.\n\n```{r crsuggest-ex, cache = TRUE}\nlibrary(crsuggest)\n\nsuggest_crs(dc_firestations) %>%\n # Use the value in the \"crs_code\" column to transform CRSs\n head(4)\n```\n\n# Map\n\nIn order to start mapping, you need an `sf` dataframe. If you don't have\none, see the [`Get Spatial Data`](#get_spatial_data) section above.\n\n## The basics\n\n### library(ggplot2)\n\nMost mapping in R fits the same theoretical framework as plotting in R\nusing `library(ggplot2)`. To learn more about ggplot2, visit the Data\nViz\n[page](https://urbaninstitute.github.io/r-at-urban/graphics-guide.html#Grammar_of_Graphics_and_Conventions)\nor read the official ggplot2 [book](https://ggplot2-book.org/).\n\nThe key function for mapping is **the special `geom_sf()` function**\nwhich works with `sf` dataframes. This function magically detects\nwhether you have point or polygon spatial data and displays the results\non a map.\n\n### A simple map\n\nTo make a simple map, add `geom_sf()` to a `ggplot()` and set\n`data = an_sf_dataframe`. Below is code for making a map of all 50\nstates using `library(urbnmapr)`:\n\n```{r first-map, cache = TRUE}\nlibrary(urbnmapr)\n\nstates <- get_urbn_map(\"states\", sf = TRUE)\n\nggplot() +\n geom_sf(\n data = states,\n mapping = aes()\n )\n```\n\n## Styling\n\n### `library(urbnthemes)`\n\n`library(urbnthemes)` automatically styles maps in accordance with the\n[Urban Institute Data Visualization Style\nGuide](http://urbaninstitute.github.io/graphics-styleguide/). 
By using\n`library(urbnthemes)`, you can create publication-ready maps you can\nimmediately drop into Urban research briefs or blog posts.\n\nTo install `urbnthemes`, visit the package's [GitHub\nrepository](https://github.com/UrbanInstitute/urbnthemes) and follow the\ninstructions. There are 2 ways to use the `urbnthemes` functions:\n\n```{r urbnthemes}\n\nlibrary(urbnthemes)\n\n# You can either run this once per script to automatically style all maps with\n# the Urban theme\nset_urbn_defaults(style = \"map\")\n\n# Or you can add `+ theme_urbn_map()` to the end of every map you make\nggplot() +\n geom_sf(states, mapping = aes()) +\n theme_urbn_map()\n```\n\n### Layering\n\nYou can layer multiple points/lines/polygons on top of each other using\nthe `+` operator from `library(ggplot2)`. The shapes will appear from\nbottom to top (ie the last mapped object will show up on top). It is\nimportant that all layers are in the same CRS (coordinate reference\nsystem).\n\n```{r layers, cache = TRUE}\n\nstate_capitals <- state_capitals %>%\n # This will change CRS to ESRI:102003 and shift the AK and HI state capitals\n # point locations to the appropriate locations on the inset maps.\n tigris::shift_geometry() %>%\n # For now filter out AK and HI as their state capitals will be slightly off.\n filter(!state %in% c(\"Alaska\", \"Hawaii\"))\n\nggplot() +\n geom_sf(\n data = states,\n mapping = aes()\n ) +\n # Note we change the data argument\n geom_sf(\n data = state_capitals,\n mapping = aes(),\n # urbnthemes library has urbn color palettes built in.\n color = palette_urbn_main[\"yellow\"],\n size = 2.0\n ) +\n theme_urbn_map()\n```\n\n### Fill and Outline Colors\n\nThe same commands used to change colors, opacity, lines, size, etc. in\ncharts can be used for maps too. To change the colors of the map, just\nuse the `fill =` and `color =` parameters in `geom_sf()`. `fill` will\nchange the fill color of polygons; `color` will change the color of\npolygon outlines, lines, and points.\n\nGenerally, maps that show the magnitude of a variable use the blue\nsequential ramp and maps that display positives and negatives use the\ndiverging color ramp. `library(urbnthemes)` contains built-in helper\nvariables (like `palette_urbn_main`) for accessing color palettes from\nthe Urban Data Viz Style Guide. If, for example, you want states to be\nUrban's magenta color:\n\n```{r urbnthemes-pink}\n\nggplot() +\n geom_sf(states,\n mapping = aes(),\n # Adjust polygon fill color\n fill = palette_urbn_main[\"magenta\"],\n # Adjust polygon outline color\n color = \"white\"\n ) +\n theme_urbn_map()\n```\n\n### Adding text\n\nYou can also add text, like state abbreviations, directly to your map\nusing `geom_sf_text` and the helper function `get_urbn_labels()`.\n\n```{r geom_sf_text}\nlibrary(urbnmapr)\n\nggplot() +\n geom_sf(states,\n mapping = aes(),\n color = \"white\"\n ) +\n theme_urbn_map() +\n # Generates dataframe of state abbv and appropriate location to plot them\n geom_sf_text(\n data = get_urbn_labels(\n map = \"states\",\n sf = TRUE\n ),\n aes(label = state_abbv),\n size = 3\n )\n```\n\nThere's also `geom_sf_label()` if you want labels with a border.\n\n# Map Gallery {#map_gallery}\n\nBelow are copy-and-pasteable examples of maps you can make, after you\nhave an `sf` dataframe.\n\n## Choropleth Maps\n\nChoropleth maps display geographic areas with shades, colors, or\npatterns in proportion to a variable or variables. 
Choropleth maps can\nrepresent massive geographies like the entire world and small\ngeographies like Census Tracts. To make a choropleth map, you need to\nset `geom_sf(aes(fill = some_variable_name))`. Below are examples.\n\n### Continuous color scale\n\n```{r choropleth_continuous}\n# Map of CHIP enrollment percentage by state\nchip_with_geographies_map <- chip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct variable\n fill = chip_pct\n ))\n\n\n# Below add-ons to the map are optional, but make the map look prettier.\nchip_with_geographies_map +\n # scale_fill_gradientn adds colors with more interpolation and reverses color scale\n scale_fill_gradientn(\n # Convert legend from decimal to percentages\n labels = scales::percent_format(),\n # Make legend title more readable\n name = \"CHIP Enrollment %\",\n # Manually add 0 to lower limit to include it in legend. NA = use maximum value in data\n limits = c(0, NA),\n # Set number of breaks on legend = 3\n n.breaks = 3\n )\n```\n\n### Discrete color scale\n\nThe quick and dirty way is with `scale_fill_steps()`, which creates\ndiscretized bins for continuous variables:\n\n```{r choropleth_discrete}\nchip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct variable\n fill = chip_pct\n )) +\n scale_fill_steps(\n # Convert legend from decimal to percentages\n labels = scales::percent_format(),\n # Make legend title more readable\n name = \"CHIP Enrollment %\",\n # Show top and bottom limits on legend\n show.limits = TRUE,\n # Roughly set number of bins. Won't be exact as R uses algorithms under the\n # hood for pretty looking breaks.\n n.breaks = 4\n )\n```\n\nOften you will want to manually generate the bins yourself to give you\nmore fine-grained control over the exact legend text (ie `1% - 1.8%`,\n`1.8% - 2.5%`, etc). Below is an example of discretizing the continuous\n`chip_pct` variable yourself using `cut_interval()` and a helper\nfunction to get nice looking interval labels:\n\n```{r format_intervals}\n\n# Helper function to clean up R generated intervals into nice looking interval labels\nformat_interval <- function(interval_text) {\n text <- interval_text %>%\n # Remove open and close brackets which is R generated math notation\n str_remove_all(\"\\\\(\") %>%\n str_remove_all(\"\\\\)\") %>%\n str_remove_all(\"\\\\[\") %>%\n str_remove_all(\"\\\\]\") %>%\n str_replace_all(\",\", \" — \")\n\n # Convert decimal ranges to percent ranges\n text <- text %>%\n str_split(\" — \") %>%\n map(~ as.numeric(.x) %>%\n scales::percent() %>%\n paste0(collapse = \" — \")) %>%\n unlist() %>%\n # By default character vectors are plotted in alphabetical order. We want\n # factors in reverse alphabetical order to get correct colors in ggplot\n fct_rev()\n\n return(text)\n}\n\nchip_with_geographies <- chip_with_geographies %>%\n # cut_interval into n groups with equal range. Set boundary so 0 is included in the bins\n mutate(chip_pct_interval = cut_interval(chip_pct, n = 5)) %>%\n # Generate nice looking interval labels\n mutate(chip_pct_interval = format_interval(chip_pct_interval))\n```\n\nAnd now we can map the discretized `chip_pct_interval` variable using\n`geom_sf()`:\n\n```{r make_discrete_map}\nchip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct_interval variable\n fill = chip_pct_interval\n )) +\n # Default is to use main urban palette, which assumes unrelated groups. 
We\n # adjust colors manually to be on Urban cyan palette\n scale_fill_manual(\n values = palette_urbn_cyan[c(8, 7, 5, 3, 1)],\n name = \"CHIP Enrollment %\"\n )\n```\n\nIn addition to `cut_interval` there are [similar\nfunctions](https://ggplot2.tidyverse.org/reference/cut_interval.html)\nfor creating intervals/bins with slightly different rules. When creating\nbins, be careful as changing the number of bins can drastically change\nhow the map looks.\n\n## Bubble Maps\n\nThis is just a layered map with one polygon layer and one point layer,\nwhere the points are sized in accordance with a variable in your data.\n\n```{r bubble_maps, cache = TRUE}\nset_urbn_defaults(style = \"map\")\n\n# Get sf dataframe of DC tracts\nlibrary(tigris)\ndc_tracts <- tracts(\n state = \"DC\",\n year = 2019,\n progress_bar = FALSE\n)\n\n# Add bubbles for firestations\nggplot() +\n geom_sf(data = dc_tracts, fill = palette_urbn_main[\"gray\"]) +\n geom_sf(\n data = dc_firestations,\n # Size bubbles by number of trucks at each station\n aes(size = TRUCK),\n color = palette_urbn_main[\"yellow\"],\n # Adjust transparency for readability\n alpha = 0.8\n )\n```\n\n## Dot-density Maps\n\nThese maps scatter dots within a geographic area. Typically each dot\nrepresents a unit (like 100 people, or 1000 houses). To create this kind\nof map, you need to start with an `sf` dataframe that is of `geometry`\ntype `POLYGON` or `MULTIPOLYGON` and then sample points within each\npolygon.\n\nThe below code generates a dot-density map representing people of\ndifferent races within Washington, DC tracts. The code may look a little\ncomplicated, but the key workhorse function is `st_sample()`, which\nsamples points within each polygon to use in the dot-density map:\n\n```{r dot_density_maps, cache = TRUE}\nlibrary(tidycensus)\n\n# Get counts by race of DC tracts\ndc_pop <- get_acs(\n geography = \"tract\",\n state = \"DC\",\n year = 2019,\n variables = c(\n Hispanic = \"DP05_0071\",\n White = \"DP05_0077\",\n Black = \"DP05_0078\",\n Asian = \"DP05_0080\"\n ),\n geometry = TRUE,\n progress_bar = FALSE\n)\n\n# Get unique groups (ie races)\ngroups <- unique(dc_pop$variable)\n\n# For each unique group (ie race), generate sampled points\ndc_race_dots <- map_dfr(groups, ~ {\n dc_pop %>%\n # .x = the group used in the loop\n filter(variable == .x) %>%\n # Use the projected MD state plane for accuracy\n st_transform(crs = \"EPSG:6487\") %>%\n # Have every dot represent 100 people\n mutate(est100 = as.integer(estimate / 100)) %>%\n st_sample(size = .$est100, exact = TRUE) %>%\n st_sf() %>%\n # Add group (ie race) as a column so we can use it when plotting\n mutate(group = .x)\n})\n\n\nggplot() +\n # Plot tracts, then dots on top of tracts\n geom_sf(\n data = dc_pop,\n # Make interior of tracts transparent and boundaries black\n fill = \"transparent\",\n color = \"black\"\n ) +\n geom_sf(\n data = dc_race_dots,\n # Color in dots by racial group\n aes(color = group),\n # Adjust transparency and size to be more readable\n alpha = 0.5,\n size = 1.1,\n stroke = FALSE\n )\n```\n\n## Geofacets\n\nGeofaceting arranges sub-geography-specific plots into a grid that\nresembles a larger geography (usually the US). This can be a useful\nalternative to choropleth maps, which tend to overemphasize large,\nsparsely populated areas. To make geofacetted\ncharts, use the `facet_geo()` function from the `geofacet` library,\nwhich can be thought of as equivalent to ggplot2's `facet_wrap()`. 
For\nthis example, we'll use the built-in `state_ranks` data.\n\n```{r geofacet-data}\nlibrary(geofacet)\n\nhead(state_ranks %>% as_tibble())\n```\n\n```{r geofacet-ex, cache = TRUE}\nset_urbn_defaults(style = \"print\")\n\nstate_ranks %>%\n filter(variable %in% c(\"education\", \"employment\")) %>%\n ggplot(aes(x = rank, y = variable)) +\n geom_col() +\n facet_geo(\n facets = \"state\",\n # Use custom urban geofacet grid which is built into urbnthemes\n # For now we need to rename a few columns as urbnthemes has to be\n # updated\n grid = urbnthemes::urbn_geofacet %>%\n rename(\n code = state_code,\n name = state_name\n )\n )\n```\n\nInteractive geofacets of the United States have been used in Urban\nFeatures like [A Matter of\nTime](https://apps.urban.org/features/long-prison-terms/trends.html),\nwhich included geofaceted line charts showing trends in incarceration by\nstate. Static geofacets of the United States were included in [Barriers\nto Accessing Homeownership Down Payment, Credit, and\nAffordability](https://www.urban.org/sites/default/files/publication/94801/barriers-to-homeownership-down-payments-credit-access-and-affordability_3.pdf)\nby the Housing Finance Policy Center.\n\n### Tile grid map\n\nYou can select predefined grids, or create your own at https://hafen.github.io/grid-designer/ \n\n```{r tile-grid-map}\n# create a grid with all 50 US states plus DC\nmygrid <- data.frame(\n code = c(\"ME\", \"AK\", \"WI\", \"VT\", \"NH\", \"IL\", \"ID\", \"WA\", \"MN\", \"MT\", \"ND\", \"MI\", \"NY\", \"MA\", \"IA\", \"IN\", \"CT\", \"RI\", \"NJ\", \"PA\", \"OH\", \"SD\", \"WY\", \"NV\", \"OR\", \"CA\", \"NE\", \"DE\", \"MD\", \"VA\", \"WV\", \"KY\", \"MO\", \"CO\", \"UT\", \"AZ\", \"KS\", \"AR\", \"DC\", \"SC\", \"NC\", \"TN\", \"NM\", \"LA\", \"AL\", \"GA\", \"MS\", \"OK\", \"HI\", \"FL\", \"TX\"),\n row = c(1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8),\n col = c(12, 2, 7, 11, 12, 7, 3, 2, 6, 4, 5, 8, 10, 11, 6, 7, 11, 12, 10, 9, 8, 5, 4, 3, 2, 2, 5, 11, 10, 9, 8, 7, 6, 4, 3, 3, 5, 6, 10, 9, 8, 7, 4, 6, 8, 9, 7, 5, 2, 10, 5),\n stringsAsFactors = FALSE\n)\n\n## Combine data into geo_grid for tiling. We drop the geometry column since\n## the tile grid supplies its own coordinates.\ngeo_grid_data <- mygrid %>%\n left_join(st_drop_geometry(chip_with_geographies), by = c(\"code\" = \"state_abbv\"))\n\n## plot tile grid\ngeo_grid_data %>%\n ggplot(aes(x = col, y = row, fill = chip_pct_interval)) +\n scale_fill_manual(\n values = palette_urbn_cyan[c(8, 7, 5, 3, 1)],\n name = \"CHIP Enrollment %\"\n ) +\n geom_tile(color = \"white\", linewidth = 1) +\n geom_text(aes(label = code), color = \"white\", size = 4) +\n scale_y_reverse() +\n coord_equal() +\n labs(fill = NULL)\n```\n\n\n## Cartograms\n\nCartograms are a modified form of a choropleth map with intentionally\ndistorted sizes that map to a variable in your data. Below we create a\ncartogram with `library(cartogram)` where the state sizes are\nproportional to the population.\n\n```{r cartogram-example, cache = TRUE}\nlibrary(cartogram)\n\nset_urbn_defaults(style = \"map\")\n\nchip_with_geographies_weighted <- chip_with_geographies %>%\n # Note column name needs to be in quotes for this package\n cartogram_cont(weight = \"population\")\n\nggplot() +\n geom_sf(\n data = chip_with_geographies_weighted,\n # Color in states by chip percentages\n aes(fill = chip_pct)\n )\n```\n\n## Interactive Maps\n\nInteractive maps can be a great way to explore and\nunderstand your data. 
And luckily there are a lot of new R packages that\nmake it really easy to create them. Interactive maps are powerful but\n**we do not recommend them for official use in Urban publications** as\ngetting them in Urban styles and appropriate basemaps can be tricky\n(reach out to\n[anarayanan\\@urban.org](mailto:anarayanan@urban.org){.email} if you\nreally want to include them).\n\n### `library(mapview)`\n\n`library(mapview)` is probably the most user-friendly of the interactive\nmapping R libraries. All you have to do to create an interactive map is:\n\n```{r show-mapview}\nlibrary(mapview)\n\n\nchip_with_geographies_for_interactive_mapping <- chip_with_geographies %>%\n # Filter out AK and HI because they would appear in Mexico. If you want AK\n # and HI in the correct place in interactive maps, make sure to use tigris::states()\n filter(!state_abbv %in% c(\"AK\", \"HI\"))\n\nmapview(chip_with_geographies_for_interactive_mapping)\n```\n\nWhen you click on an object, you get a popup table of all its\nattributes. And when you hover over an object, you get a popup with an\nobject id.\n\nEach of the above behaviors can be changed if desired. As you'll see in\nthe below section, the syntax for `library(mapview)` is significantly\ndifferent from `library(ggplot2)`, so be careful!\n\n#### Coloring in points/polygons\n\nIn order to create a choropleth map where we color in the\npoints/polygons by a variable, we need to feed in a column name *in\nquotes* to the `zcol` argument inside the `mapview()` function:\n\n```{r mapview_zcol}\n# Create interactive state map colored in by chip enrollment\nmapview(chip_with_geographies_for_interactive_mapping, zcol = \"chip_enrollment\")\n```\n\nIf you want more granular control over the color palette for the legend,\nyou can also feed in a vector of color hex codes to `col.regions` along with\na column name to `zcol`. This will create a continuous color range along\nthe provided colors. Be careful though, as the color interpolation is not\nperfect.\n\n```{r mapview-colors-granular}\n# library(RColorBrewer)\nmapview(chip_with_geographies_for_interactive_mapping,\n col.regions = c(\n palette_urbn_green[6],\n \"white\",\n palette_urbn_cyan[6]\n ),\n zcol = \"chip_enrollment\"\n)\n```\n\nIf you want to color in all points/polygons as the same color, just feed\nin a single color hex code to the `col.regions` argument:\n\n```{r mapview-colors}\nmapview(chip_with_geographies_for_interactive_mapping,\n col.regions = palette_urbn_green[5]\n)\n```\n\n#### Adding layers\n\nYou can add multiple `sf` objects on the same map by using the `+`\noperator. This is very useful when comparing 2 or more spatial datasets.\n\n```{r mapview-layers}\nmapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) +\n mapview(state_capitals, col.regions = palette_urbn_cyan[5])\n```\n\nYou can even create slider maps by using the `|` operator!\n\n```{r mapview-sliders}\nmapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) |\n mapview(state_capitals, col.regions = palette_urbn_cyan[5])\n```\n\n### More details\n\nTo learn more about more advanced options with `mapview` maps, check out\nthe\n[documentation](https://r-spatial.github.io/mapview/articles/articles/mapview_02-advanced.html)\npage and the [reference\nmanual](https://cran.r-project.org/web/packages/mapview/mapview.pdf).\n\nThere are also other interactive map-making packages in R like `leaflet`\n(which `mapview` is a more user-friendly wrapper of), `tmap`, and\n`mapdeck`.
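\n\nIf you want a taste of the lower-level `leaflet` syntax, here is a minimal sketch (it reuses the `chip_with_geographies_for_interactive_mapping` dataframe from above; note that leaflet expects unprojected WGS 84 coordinates):\n\n```{r leaflet-sketch, eval = FALSE}\nlibrary(leaflet)\n\nchip_with_geographies_for_interactive_mapping %>%\n # leaflet requires WGS 84 (EPSG:4326) coordinates\n st_transform(\"EPSG:4326\") %>%\n leaflet() %>%\n # Add a default basemap, then the state polygons\n addTiles() %>%\n addPolygons(weight = 1)\n```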
To learn about these other packages, [this book\nchapter](https://geocompr.robinlovelace.net/adv-map.html#interactive-maps)\nis a good starting point.\n\n# Spatial Operations\n\n## Cropping\n\nCropping (or clipping) is geographically filtering an `sf` dataframe to\njust the area we are interested in. Say we wanted to look at the roads\naround Fire Station 24 in DC.\n\n```{r roads_cropping_before, cache = TRUE}\nlibrary(tigris)\nlibrary(units)\n\ndc_firestations <- dc_firestations %>%\n st_transform(\"EPSG:6487\")\n\n\n# Draw a 500-meter circle around one fire station\nfire_station_24_buffered <- dc_firestations %>%\n filter(NAME == \"Engine 24 Station\") %>%\n st_buffer(set_units(500, \"meter\"))\n\n# Get listing of all roads in DC\ndc_roads <- roads(\n state = \"DC\",\n county = \"District of Columbia\",\n class = \"sf\",\n progress_bar = FALSE\n) %>%\n st_transform(\"EPSG:6487\")\n\n# View roads on top of the fire station buffer\nggplot() +\n # Order matters! We need to plot the buffered fire station first, and then\n # the roads on top so the overlapping roads are visible\n geom_sf(\n data = fire_station_24_buffered,\n fill = palette_urbn_cyan[1],\n color = palette_urbn_cyan[7]\n ) +\n geom_sf(\n data = dc_roads,\n color = palette_urbn_gray[7]\n ) +\n theme_urbn_map()\n```\n\nWe can clip the larger roads dataframe to just the roads that overlap with\nthe circle around the fire station with `st_intersection()`.\n\n```{r roads_cropping_after}\n\n# Use st_intersection() to crop the roads data to just roads within the\n# fire station radius\ndc_roads_around_fire_station_24_buffered <- fire_station_24_buffered %>%\n st_intersection(dc_roads)\n\nggplot() +\n geom_sf(\n data = fire_station_24_buffered,\n fill = palette_urbn_cyan[1],\n color = palette_urbn_cyan[7]\n ) +\n geom_sf(\n data = dc_roads_around_fire_station_24_buffered,\n color = palette_urbn_gray[7]\n ) +\n theme_urbn_map()\n```\n\n**More Coming Soon!**\n\n## Calculating Distance\n\n## Spatial Joins\n\n### Point to Polygon\n\n### Polygon to Polygon\n\n## Aggregating\n\n## Drive/Transit times\n\n## Geocoding\n\nGeocoding is the process of turning text (usually addresses) into\ngeographic coordinates (usually latitudes/longitudes) for use in\nmapping. For Urban researchers, we highly recommend using the [Urban\ngeocoder](https://tech-tools.urban.org/geocoding/) as it is fast,\naccurate, designed to work with sensitive/confidential data, and, most\nimportantly, free to use for Urban researchers! To learn about how we set\nup and chose the geocoder for the Urban Institute, you can read our\n[Data\\@Urban\nblog](https://medium.com/@urban_institute/choosing-a-geocoder-for-the-urban-institute-86192f656c5f).\n\n### Cleaning Addresses\n\nThe single most important factor in getting accurate geocoded data is\nhaving cleaned, well-structured address data. This can prove difficult\nas address data out in the wild is often messy and unstandardized. 
While\nthe rules for cleaning addresses are very data-specific, below are some\nexamples of clean addresses you should aim for in your data cleaning\nprocess:\n\n```{r cleaned-addr, cache=TRUE,eval=TRUE,results=TRUE, echo=FALSE}\nlibrary(gt)\ncleaned_address_table <- tribble(\n ~\"f_address\", ~\"Type of address\",\n \"123 Troy Drive, Pillowtown, CO, 92432\", \"residential address\",\n \"789 Abed Avenue, Apt 666, Blanketsburg, CO, 92489\", \"residential apartment address\",\n \"Shirley Boulevard and Britta Drive, Blanketsburg, CO, 92489\", \"street intersection\",\n \"Pillowtown, CO\", \"city\",\n \"92489, CO\", \"Zip Code\",\n)\n\ngt(cleaned_address_table) %>%\n # tab_header(title = md(\"Clean Address Examples\")) %>%\n opt_row_striping(row_striping = TRUE) %>%\n tab_style(\n style = list(\n cell_text(weight = \"bold\")\n ),\n locations = cells_column_labels(\n columns = c(f_address, `Type of address`)\n )\n ) %>%\n opt_align_table_header(align = c(\"left\")) %>%\n tab_options(\n container.width = \"100%\",\n container.height = \"400px\",\n # column_labels.background.color = palette_urbn_cyan[1],\n table.border.top.width = 0,\n table.border.bottom.width = 0,\n column_labels.border.bottom.width = 0,\n )\n```\n\nAll that being said, our geocoder is pretty tolerant of different\naddress formats, typos/spelling errors, and missing states, zip codes,\netc. So don't spend too much time cleaning every address in the data.\nAlso note that while our geocoder is able to geocode cities and zip\ncodes, it will return the lat/lon of the center of the city/zip code,\nwhich may not be what you want.\n\n### Instructions\n\nTo use the [Urban geocoder](https://tech-tools.urban.org/geocoding/),\nyou will need to:\n\n1) Generate a CSV with a column named `f_address` which contains the\n addresses in single-line format (ie\n `123 Abed Avenue, Blanketsburg, CO, 94328`). This means that if you\n have the addresses split across multiple columns (ie `Address`,\n `City`, `State`, `Zip` columns), you will need to concatenate them\n into one column, as shown in the sketch after this list. Also see\n our Cleaning Addresses section above.\n\n2) Go to the Urban geocoder and answer the initial questions. This will\n tell you whether your data is non-confidential or confidential data,\n and allow you to upload your CSV for geocoding.\n\n3) Wait for an email telling you your results are ready. If your data\n is non-confidential, this email will contain a link to your geocoded\n results. This link expires in 24 hours, so make sure to download\n your data before then. If your data is confidential, the email will\n contain a link to the location on the Y Drive where your\n confidential geocoded data is stored. You can specify this output\n folder when submitting the CSV in step 1.
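\n\nFor step 1, here is a small sketch of building `f_address` from split columns with `tidyr::unite()` (the dataframe below is made up):\n\n```{r make-f-address, eval = FALSE}\nlibrary(tidyverse)\n\n# Made-up data with address parts split across columns\naddresses <- tibble(\n address = \"123 Troy Drive\",\n city = \"Pillowtown\",\n state = \"CO\",\n zip = \"92432\"\n)\n\naddresses <- addresses %>%\n # Concatenate the parts into the single-line f_address column\n unite(f_address, address, city, state, zip, sep = \", \")\n```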
\n\n### Geocoder outputs\n\nThe geocoded file will be your original data, plus a few more columns\n(including latitude and longitude). The table below describes each of\nthe new columns that have been appended to your original data. [It's\nvery important that you take a look at the Addr_type\ncolumn]{style=\"background-color: #FFFF00; font-weight: bold\"} in the\nCSV before doing further analysis to check the accuracy of the geocoding\nprocess.
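\n\nOne quick way to do that check is to count the match levels (the file name below is hypothetical):\n\n```{r check-addr-type, eval = FALSE}\nlibrary(tidyverse)\n\n# Read in the geocoded results and tabulate the Addr_type column\ngeocoded <- read_csv(\"geocoded_results.csv\")\n\ngeocoded %>%\n count(Addr_type, sort = TRUE)\n```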
\n\n+---------------+---------------------------------------------------+\n| Column | Description |\n+:==============+:==================================================+\n| Match_addr | The actual address that the inputted address was |\n| | matched to. This is the address that the geocoder |\n| | used to get Latitudes / Longitudes. If there are |\n| | potentially many typos or non standard address |\n| | formats in your data file, you will want to take |\n| | a close look at this column to confirm that the |\n| | matched address correctly handled typos and badly |\n| | formatted addresses. |\n+---------------+---------------------------------------------------+\n| Longitude | The WGS 84 datum Longitude (EPSG code 4326) |\n+---------------+---------------------------------------------------+\n| Latitude | The WGS 84 datum Latitude (EPSG code 4326) |\n+---------------+---------------------------------------------------+\n| Addr_type | The match level for a geocode request. This |\n| | should be used as an indicator of the precision |\n| | of geocode results. Generally, Subaddress, |\n| | PointAddress, StreetAddress, and StreetInt |\n| | represent accurate matches. The list below |\n| | contains all possible values for this field. |\n| | **Green values represent High accuracy matches, |\n| | yellow represents Medium accuracy matches and red |\n| | represents Low accuracy/inaccurate matches**. If |\n| | you have many yellow and red values in your data, |\n| | you should manually check the results before |\n| | proceeding with analysis. All possible values:\\ |\n| | \\ |\n| | **Subaddress:** A subset of a PointAddress that |\n| | represents a house or building subaddress |\n| | location, such as an apartment unit, floor, or |\n| | individual building within a complex. The |\n| | UnitName, UnitType, LevelName, LevelType, |\n| | BldgName, and BldgType field values help to |\n| | distinguish subaddresses which may be associated |\n| | with the same PointAddress. Reference data |\n| | consists of point features with associated house |\n| | number, street name, and subaddress elements, |\n| | along with administrative divisions and optional |\n| | postal code; for example, 3836 Emerald Ave, Suite |\n| | C, La Verne, CA, 91750.\\ |\n| | \\ |\n| | **PointAddress:** A street address based on |\n| | points that represent house and building |\n| | locations. Typically, this is the most spatially |\n| | accurate match level. Reference data contains |\n| | address points with associated house numbers and |\n| | street names, along with administrative divisions |\n| | and optional postal code. The X / Y |\n| | (`Longitude`/`Latitude`) and `geometry` output |\n| | values for a PointAddress match represent the |\n| | street entry location for the address; this is |\n| | the location used for routing operations. The |\n| | `DisplayX` and `DisplayY` values represent the |\n| | rooftop, or actual, location of the address. |\n| | Example: 380 New York St, Redlands, CA, 92373.\\ |\n| | \\ |\n| | **StreetAddress** --- A street address that |\n| | differs from PointAddress because the house |\n| | number is interpolated from a range of numbers. 
|\n| | Reference data contains street center lines with |\n| | house number ranges, along with administrative |\n| | divisions and optional postal code information, |\n| | for example, 647 Haight St, San Francisco, CA, |\n| | 94117.\\ |\n| | \\ |\n| | **StreetInt:** A street address consisting of a |\n| | street intersection along with city and optional |\n| | state and postal code information. This is |\n| | derived from StreetAddress reference data, for |\n| | example, Redlands Blvd & New York St, Redlands, |\n| | CA, 92373.\\ |\n| | \\ |\n| | **StreetName:** Similar to a street address but |\n| | without the house number. Reference data contains |\n| | street centerlines with associated street names |\n| | (no numbered address ranges), along with |\n| | administrative divisions and optional postal |\n| | code, for example, W Olive Ave, Redlands, CA, |\n| | 92373.\\ |\n| | \\ |\n| | **StreetAddressExt:** An interpolated street |\n| | address match that is returned when parameter |\n| | matchOutOfRange=true and the input house number |\n| | exceeds the house number range for the matched |\n| | street segment.\\ |\n| | \\ |\n| | **DistanceMarker:** A street address that |\n| | represents the linear distance along a street, |\n| | typically in kilometers or miles, from a |\n| | designated origin location. Example: Carr 682 KM |\n| | 4, Barceloneta, 00617.\\ |\n| | \\ |\n| | **POI:** Points of interest. Reference data |\n| | consists of administrative division place-names, |\n| | businesses, landmarks, and geographic features, |\n| | for example, Golden Gate Bridge.\\ |\n| | \\ |\n| | **PostalExt:** A postal code with an additional |\n| | extension, such as the United States Postal |\n| | Service ZIP+4. Reference data is postal code |\n| | points with extensions, for example, 90210-3841.\\ |\n| | \\ |\n| | **Locality:** A place-name representing a |\n| | populated place. The Type output field provides |\n| | more detailed information about the type of |\n| | populated place. Possible Type values for |\n| | Locality matches include Block, Sector, |\n| | Neighborhood, District, City, MetroArea, County, |\n| | State or Province, Territory, Country, and Zone. |\n| | Example: Bogotá, COL.\\ |\n| | \\ |\n| | **PostalLoc:** A combination of postal code and |\n| | city name. Reference data is typically a union of |\n| | postal boundaries and administrative (locality) |\n| | boundaries, for example, 7132 Frauenkirchen.\\ |\n| | \\ |\n| | **Postal:** Postal code. Reference data is postal |\n| | code points, for example, 90210 USA. |\n+---------------+---------------------------------------------------+\n| Score | A number from 1--100 indicating the degree to |\n| | which the input tokens in a geocoding request |\n| | match the address components in a candidate |\n| | record. A score of 100 represents a perfect |\n| | match, while lower scores represent decreasing |\n| | match accuracy. |\n+---------------+---------------------------------------------------+\n| Status | Indicates whether a batch geocode request results |\n| | in a match, tie, or unmatched. Possible values |\n| | include\\ |\n| | \\ |\n| | M - Match. The returned address matches the input |\n| | address and is the highest scoring candidate.\\ |\n| | \\ |\n| | T - Tied. The returned address matches the input |\n| | address but has the same score as one or more |\n| | additional candidates.\\ |\n| | \\ |\n| | U - Unmatched. No addresses match the inputted |\n| | address. 
|\n+---------------+---------------------------------------------------+\n| geometry | The WKT (Well-known text) representation of the |\n| | latitudes and longitudes. This column may be |\n| | useful if you're reading the CSV into R, Python, |\n| | or ArcGIS |\n+---------------+---------------------------------------------------+\n| Region | The state that `Match_addr` is located in |\n+---------------+---------------------------------------------------+\n| RegionAbbr | Abbreviated State Name. For example, CA for |\n| | California |\n+---------------+---------------------------------------------------+\n| Subregion | The county that the input address is located in |\n+---------------+---------------------------------------------------+\n| MetroArea | The name of the Metropolitan area that |\n| | `Match_addr` is located in. This field may be |\n| | blank if the input address is not located within |\n| | a metro area. |\n+---------------+---------------------------------------------------+\n| City | The city that `Match_addr` is located in |\n+---------------+---------------------------------------------------+\n| Nbrhd | The Neighborhood that `Match_addr` is located in. |\n| | Note these are ESRI defined neighborhoods which |\n| | may or may not align with other sources |\n| | neighborhood definitions |\n+---------------+---------------------------------------------------+\n\n\\\n\n# Geospatial Modeling\n\nComing soon!\n\n# Bibliography and references\n\n------------------------------------------------------------------------\n\n```{r session-info}\n\nsessionInfo()\n```\n"},"formats":{"html":{"identifier":{"display-name":"HTML","target-format":"html","base-format":"html"},"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"inline-includes":false,"preserve-yaml":false,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[],"notebook-links":true,"format-links":true},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"mapping.html"},"language":{"toc-title-document":"Table of contents","toc-title-website":"On this page","related-formats-title":"Other 
Formats","related-notebooks-title":"Notebooks","source-notebooks-prefix":"Source","section-title-abstract":"Abstract","section-title-appendices":"Appendices","section-title-footnotes":"Footnotes","section-title-references":"References","section-title-reuse":"Reuse","section-title-copyright":"Copyright","section-title-citation":"Citation","appendix-attribution-cite-as":"For attribution, please cite this work as:","appendix-attribution-bibtex":"BibTeX citation:","title-block-author-single":"Author","title-block-author-plural":"Authors","title-block-affiliation-single":"Affiliation","title-block-affiliation-plural":"Affiliations","title-block-published":"Published","title-block-modified":"Modified","callout-tip-title":"Tip","callout-note-title":"Note","callout-warning-title":"Warning","callout-important-title":"Important","callout-caution-title":"Caution","code-summary":"Code","code-tools-menu-caption":"Code","code-tools-show-all-code":"Show All Code","code-tools-hide-all-code":"Hide All Code","code-tools-view-source":"View Source","code-tools-source-code":"Source Code","code-line":"Line","code-lines":"Lines","copy-button-tooltip":"Copy to Clipboard","copy-button-tooltip-success":"Copied!","repo-action-links-edit":"Edit this page","repo-action-links-source":"View source","repo-action-links-issue":"Report an issue","back-to-top":"Back to top","search-no-results-text":"No results","search-matching-documents-text":"matching documents","search-copy-link-title":"Copy link to search","search-hide-matches-text":"Hide additional matches","search-more-match-text":"more match in this document","search-more-matches-text":"more matches in this document","search-clear-button-title":"Clear","search-detached-cancel-button-title":"Cancel","search-submit-button-title":"Submit","search-label":"Search","toggle-section":"Toggle section","toggle-sidebar":"Toggle sidebar navigation","toggle-dark-mode":"Toggle dark mode","toggle-reader-mode":"Toggle reader mode","toggle-navigation":"Toggle navigation","crossref-fig-title":"Figure","crossref-tbl-title":"Table","crossref-lst-title":"Listing","crossref-thm-title":"Theorem","crossref-lem-title":"Lemma","crossref-cor-title":"Corollary","crossref-prp-title":"Proposition","crossref-cnj-title":"Conjecture","crossref-def-title":"Definition","crossref-exm-title":"Example","crossref-exr-title":"Exercise","crossref-ch-prefix":"Chapter","crossref-apx-prefix":"Appendix","crossref-sec-prefix":"Section","crossref-eq-prefix":"Equation","crossref-lof-title":"List of Figures","crossref-lot-title":"List of Tables","crossref-lol-title":"List of Listings","environment-proof-title":"Proof","environment-remark-title":"Remark","environment-solution-title":"Solution","listing-page-order-by":"Order By","listing-page-order-by-default":"Default","listing-page-order-by-date-asc":"Oldest","listing-page-order-by-date-desc":"Newest","listing-page-order-by-number-desc":"High to Low","listing-page-order-by-number-asc":"Low to High","listing-page-field-date":"Date","listing-page-field-title":"Title","listing-page-field-description":"Description","listing-page-field-author":"Author","listing-page-field-filename":"File Name","listing-page-field-filemodified":"Modified","listing-page-field-subtitle":"Subtitle","listing-page-field-readingtime":"Reading Time","listing-page-field-categories":"Categories","listing-page-minutes-compact":"{0} min","listing-page-category-all":"All","listing-page-no-matches":"No matching 
items"},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.3.433","editor_options":{"markdown":{"wrap":72}}},"extensions":{"book":{"multiFile":true}}}},"projectFormats":["html"]}
\ No newline at end of file
diff --git a/.quarto/idx/optimization.qmd.json b/.quarto/idx/optimization.qmd.json
index baf3ce4..9712261 100644
--- a/.quarto/idx/optimization.qmd.json
+++ b/.quarto/idx/optimization.qmd.json
@@ -1 +1 @@
-{"title":"Introduction","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}}},"headingText":"Introduction","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n```{r markdown-setup, include=FALSE}\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\nknitr::opts_chunk$set(warning = FALSE)\n\noptions(scipen = 999)\n```\n\n\nThis guide outlines tools and tips for improving the speed and execution of R code.\n\nSometimes, simply tweaking a few lines of code can lead to large performance gains in the execution of a program. Other issues may take more time to work through but can be a huge benefit to a project in the long term.\n\nAn important lesson to learn when it comes to optimising an R (or any) program is knowing both if to start and when to stop. You most likely want to optimize your code because it is \"too slow\", but what that means will vary from project to project. Be sure to consider what \"fast enough\" is for your project and how much needs to be optimized. If your program takes an hour to complete, spending 5 hours trying to make it faster can be time well spent if the script will be run regularly, and a complete waste of time if it's an ad-hoc analysis.\n\nFor more information, see the CRAN Task View [High-Performance and Parallel Computing with R](https://CRAN.R-project.org/view=HighPerformanceComputing).\n\nThe \"Performant Code\" section of Hadley Wickham's [Advanced R](http://adv-r.had.co.nz/) is another great resource and provides a deeper dive into what is covered in this guide.\n\n------------------------------------------------------------------------\n\n# Update Your Installation\n\nOne of the easiest ways to improve the performance of R is to update R. In general, R will have a big annual release (i.e., 3.5.0) in the spring and around 3-4 smaller patch releases (i.e., 3.5.1) throughout the rest of the year. If the middle digit of your installation is behind the current release, you should consider updating.\n\nFor instance, R 3.5.0 implemented an improved read from text files. A 5GB file took over 5 minutes to read in 3.4.4:\n\n![](optimization/images/data-load-3-4.png){width=\"75%\"}\n\nWhile 3.5.0 took less than half the time:\n\n![](optimization/images/data-load-3-5.png){width=\"75%\"}\n\nTo see what the R-core development team is up to, check out the [NEWS](https://cran.r-project.org/doc/manuals/r-devel/NEWS.html) file from the R project.\n\n------------------------------------------------------------------------\n\n# Profiling & Benchmarking\n\nIn order to efficiently optimize your code, you'll first need to know where it's running slowest. The `profvis` package provides a nice way of visualizing the execution time and memory useage of your program.\n\n```{r profile-01}\nlibrary(profvis)\nlibrary(dplyr)\n\nprofvis({\n\tdiamonds <- read.csv(\"optimization/data/diamonds.csv\")\n\n\tdiamonds_by_cut <- diamonds %>%\n\t\tgroup_by(cut) %>%\n\t\tsummarise_if(is.numeric, mean)\n\n\twrite.csv(diamonds_by_cut, file = \"optimization/data/diamonds_by_cut.csv\")\n\n})\n\n```\n\nIn this toy example it looks like the `read.csv` function is the bottleneck, so\n\nwork on optimizing that first.\n\nOnce you find the bottleneck that needs to be optimized, it can be useful to\n\nbenchmark different potential solutions. The `microbenchmark` package can help\n\nyou choose between different options. 
Continuing with the simple example with the `diamonds` dataset, we can compare the base `read.csv` function with `read_csv` from the `readr` package.\n\n```{r benchmark-01}\nlibrary(microbenchmark)\n\nmicrobenchmark(\n read.csv(\"optimization/data/diamonds.csv\"),\n readr::read_csv(\"optimization/data/diamonds.csv\")\n)\n```\n\nIn this case, `read_csv` is about twice as fast as the base R implementation.\n\n# Parallel Computing\n\nOften, time-intensive R code can be sped up by breaking the execution of the job across additional cores of your computer. This is called parallel computing.\n\n## Learn `lapply`/`purrr::map`\n\nLearning the `lapply` (and variants) function from Base R or the `map` (and variants) function from the `purrr` package is the first step in learning to run R code in parallel. Once you understand how `lapply` and `map` work, running your code in parallel will be simple.\n\nSay you have a vector of numbers and want to find the square root of each one (ignore for now that `sqrt` is vectorized, which will be covered later). You could write a for loop and iterate over each element of the vector:\n\n```{r apply-01}\nx <- c(1, 4, 9, 16)\n\nout <- vector(\"list\", length(x))\n\nfor (i in seq_along(x)) {\n out[[i]] <- sqrt(x[[i]])\n}\n\nunlist(out)\n```\n\nThe `lapply` function essentially handles the overhead of constructing a for loop for you. The syntax is:\n\n```{r apply-02, eval = FALSE}\nlapply(X, FUN, ...)\n```\n\n`lapply` will then take each element of `X` and apply the `FUN`ction to it. Our simple example then becomes:\n\n```{r apply-03}\nx <- c(1, 4, 9, 16)\n\nout <- lapply(x, sqrt)\n\nunlist(out)\n```\n\nThose working within the `tidyverse` may use `map` from the `purrr` package equivalently:\n\n```{r apply-04}\nlibrary(purrr)\n\nx <- c(1, 4, 9, 16)\n\nout <- map(x, sqrt)\n\nunlist(out)\n```\n\n## Motivating Example\n\nOnce you are comfortable with `lapply` and/or `map`, running the same code in parallel takes just an additional line of code.\n\nFor `lapply` users, the `future.apply` package contains an equivalent `future_lapply` function. Just be sure to call `plan(multisession)` beforehand, which will handle the back-end orchestration needed to run in parallel.\n\n```{r parallel-01}\n# install.packages(\"future.apply\")\nlibrary(future.apply)\n\nplan(multisession)\n\nout <- future_lapply(x, sqrt)\n\nunlist(out)\n```\n\nFor `purrr` users, the `furrr` (i.e., future purrr) package includes an equivalent `future_map` function:\n\n```{r parallel-02}\n# install.packages(\"furrr\")\nlibrary(furrr)\n\nplan(multisession)\n\ny <- future_map(x, sqrt)\n\nunlist(y)\n```\n\nHow much faster did this simple example run in parallel?\n\n```{r parallel-03}\nlibrary(future.apply)\n\nplan(multisession)\n\nx <- c(1, 4, 9, 16)\n\nmicrobenchmark::microbenchmark(\n sequential = lapply(x, sqrt),\n parallel = future_lapply(x, sqrt),\n unit = \"s\"\n)\n```\n\nParallelization was actually slower. In this case, the overhead of setting the code to run in parallel far outweighed any performance gain. In general, parallelization works well on long-running and compute-intensive jobs.\n\n## A (somewhat) More Complex Example\n\nIn this example we'll use the `diamonds` dataset from `ggplot2` and perform k-means clustering. 
We'll use `lapply` to iterate the number of clusters from 2 to 5:\n\n```{r kmeans-01}\ndf <- ggplot2::diamonds\n\ndf <- dplyr::select(df, -c(cut, color, clarity))\n\ncenters <- 2:5\n\nsystem.time(\n lapply(centers,\n function(x) kmeans(df, centers = x, nstart = 500)\n )\n)\n```\n\nAnd now running the same code in parallel:\n\n```{r kmeans-02}\nlibrary(future.apply)\n\nplan(multisession)\n\nsystem.time(\n future_lapply(centers,\n function(x) kmeans(df, centers = x, nstart = 500)\n )\n)\n```\n\nWhile we didn't achieve perfect scaling, we still get a nice bump in execution time.\n\n## Additional Packages\n\nFor the sake of ease and brevity, this guide focused on the `future` framework for parallelization. However, you should be aware that there are a number of other ways to parallelize your code.\n\n### The `parallel` Package\n\nThe `parallel` package is included in your base R installation. It includes analogues of the various `apply` functions:\n\n- `parLapply`\n\n- `mclapply` - not available on Windows\n\nThese functions generally require more setup, especially on Windows machines.
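\n\nFor instance, here is a minimal `parLapply` sketch using a two-worker PSOCK cluster (the kind that also works on Windows):\n\n```{r parallel-pkg-01, eval = FALSE}\nlibrary(parallel)\n\n# Create the cluster, run the job on it, then shut the cluster down\ncl <- makeCluster(2)\nout <- parLapply(cl, c(1, 4, 9, 16), sqrt)\nstopCluster(cl)\n\nunlist(out)\n```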
R uses a global string pool, so `character` vectors are hard\n\nto estimate, but will generally take up more space for element.\n\nConsider the following example:\n\n```{r size-02}\n\nx <- 1:100\n\npryr::object_size(x)\n\npryr::object_size(as.double(x))\n\npryr::object_size(as.character(x))\n\n```\n\nAn incorrect data type can easily cost you a lot of space in memory, especially\n\nat scale. This often happens when reading data from a text or csv file - data\n\nmay have a format such as `c(1.0, 2.0, 3.0)` and will be read in as a `numeric`\n\ncolumn, when `integer` is more appropriate and compact.\n\nYou may also be familiar with `factor` variables within R. Essentially a\n\n`factor` will represent your data as integers, and map them back to their\n\ncharacter representation. This can save memory when you have a compact and\n\nunique level of factors:\n\n```{r size-03}\n\nx <- sample(letters, 10000, replace = TRUE)\n\npryr::object_size(as.character(x))\n\npryr::object_size(as.factor(x))\n\n```\n\nHowever if each element is unique, or if there is not a lot of overlap among\n\nelements, than the overhead will make a factor larger than its character\n\nrepresentation:\n\n```{r size-04}\n\npryr::object_size(as.factor(letters))\n\npryr::object_size(as.character(letters))\n\n```\n\n## Cloud Computing\n\nSometimes, you will have data that are simply too large to ever fit on your\n\nlocal desktop machine. If that is the case, then the Elastic Cloud Computing\n\nEnvironment from the Office of Technology and Data Science can provide you with\n\neasy access to powerful analytic tools for computationally intensive project.\n\nThe Elastic Cloud Computing Environment allows researchers to quickly spin-up\n\nan Amazon Web Services (AWS) Elastic Cloud Compute (EC2) instance. These\n\ninstances offer increased memory to read in large datasets, along with\n\nadditional CPUs to provide the ability to process data in parallel at an\n\nimpressive scale.\n\n| Instance \\| CPU \\| Memory (GB) \\|\n\n\\|----------\\|-----\\|--------\\|\n\n| Desktop \\| 8 \\| 16 \\|\n\n| c5.4xlarge \\| 16 \\| 32 \\|\n\n| c5.9xlarge \\| 36 \\| 72 \\|\n\n| c5.18xlarge \\| 72 \\| 144 \\|\n\n| x1e.8xlarge \\| 32 \\| 976 \\|\n\n| x1e.16xlarge \\| 64 \\| 1952 \\|\n\nFeel free to contact Erika Tyagi (etyagi\\@urban.org) if this would be useful\n\nfor your project.\n\n------------------------------------------------------------------------\n\n# Common Pitfalls\n\n## For Loops and Vector Allocation\n\nA refrain you will often hear is that for loops in R are slow and need to be\n\navoided at all costs. This is not true! Rather, an improperly constructed loop\n\nin R can bring the execution of your program to a near standstill.\n\nA common for loop structure may look something like:\n\n```{r loop-01, eval = FALSE}\n\nx <- 1:100\n\nout <- c()\n\nfor (i in x) {\n\n out <- c(out, sqrt(x))\n\n }\n\n```\n\nThe bottleneck in this loop is with the allocation of the vector `out`. Every\n\ntime we iterate over an item in `x` and append it to `out`, R makes a copy\n\nof all the items already in `out`. 
As the size of the loop grows, your code\n\nwill take longer and longer to run.\n\nA better practice is to pre-allocate `out` to be the correct length, and then\n\ninsert the results as the loop runs.\n\n```{r loop-03, eval = FALSE}\n\nx <- 1:100\n\nout <- rep(NA, length(x))\n\nfor (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n}\n\n```\n\nA quick benchmark shows how much more efficient a loop with a pre-allocated\n\nresults vector is:\n\n```{r loop-04}\n\nbad_loop <- function(x) {\n\n out <- c()\n\n for (i in x) {\n\n out <- c(out, sqrt(x))\n\n }\n\n}\n\ngood_loop <- function(x) {\n\n out <- rep(NA, length(x))\n\n for (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n }\n\n}\n\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(x),\n\n good_loop(x)\n\n)\n\n```\n\nAnd note how performance of the \"bad\" loop degrades as the loop size grows.\n\n```{r loop-05}\n\ny <- 1:250\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(y),\n\n good_loop(y)\n\n)\n\n```\n\n## Vectorized Functions\n\nMany functions in R are vectorized, meaning they can accept an entire vector\n\n(and not just a single value) as input. The `sqrt` function from the\n\nprior examples is one:\n\n```{r vectorised-01}\n\nx <- c(1, 4, 9, 16)\n\nsqrt(x)\n\n```\n\nThis removes the need to use `lapply` or a for loop. Vectorized functions in\n\nR are generally written in a compiled language like C, C++, or FORTRAN, which\n\nmakes their implementation faster.\n\n```{r vectorised-02}\n\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n lapply(x, sqrt),\n\n sqrt(x)\n\n)\n\n```\n"},"formats":{"html":{"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[]},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"optimization.html"},"language":{},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.2.269"},"extensions":{"book":{"multiFile":true}}}}}
\ No newline at end of file
+{"title":"Introduction","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}}},"headingText":"Introduction","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n```{r markdown-setup, include=FALSE}\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\nknitr::opts_chunk$set(warning = FALSE)\n\noptions(scipen = 999)\n```\n\n\nThis guide outlines tools and tips for improving the speed and execution of R code.\n\nSometimes, simply tweaking a few lines of code can lead to large performance gains in the execution of a program. Other issues may take more time to work through but can be a huge benefit to a project in the long term.\n\nAn important lesson to learn when it comes to optimising an R (or any) program is knowing both if to start and when to stop. You most likely want to optimize your code because it is \"too slow\", but what that means will vary from project to project. Be sure to consider what \"fast enough\" is for your project and how much needs to be optimized. If your program takes an hour to complete, spending 5 hours trying to make it faster can be time well spent if the script will be run regularly, and a complete waste of time if it's an ad-hoc analysis.\n\nFor more information, see the CRAN Task View [High-Performance and Parallel Computing with R](https://CRAN.R-project.org/view=HighPerformanceComputing).\n\nThe \"Performant Code\" section of Hadley Wickham's [Advanced R](http://adv-r.had.co.nz/) is another great resource and provides a deeper dive into what is covered in this guide.\n\n------------------------------------------------------------------------\n\n# Update Your Installation\n\nOne of the easiest ways to improve the performance of R is to update R. In general, R will have a big annual release (i.e., 3.5.0) in the spring and around 3-4 smaller patch releases (i.e., 3.5.1) throughout the rest of the year. If the middle digit of your installation is behind the current release, you should consider updating.\n\nFor instance, R 3.5.0 implemented an improved read from text files. A 5GB file took over 5 minutes to read in 3.4.4:\n\n![](optimization/images/data-load-3-4.png){width=\"75%\"}\n\nWhile 3.5.0 took less than half the time:\n\n![](optimization/images/data-load-3-5.png){width=\"75%\"}\n\nTo see what the R-core development team is up to, check out the [NEWS](https://cran.r-project.org/doc/manuals/r-devel/NEWS.html) file from the R project.\n\n------------------------------------------------------------------------\n\n# Profiling & Benchmarking\n\nIn order to efficiently optimize your code, you'll first need to know where it's running slowest. The `profvis` package provides a nice way of visualizing the execution time and memory useage of your program.\n\n```{r profile-01}\nlibrary(profvis)\nlibrary(dplyr)\n\nprofvis({\n\tdiamonds <- read.csv(\"optimization/data/diamonds.csv\")\n\n\tdiamonds_by_cut <- diamonds %>%\n\t\tgroup_by(cut) %>%\n\t\tsummarise_if(is.numeric, mean)\n\n\twrite.csv(diamonds_by_cut, file = \"optimization/data/diamonds_by_cut.csv\")\n\n})\n\n```\n\nIn this toy example it looks like the `read.csv` function is the bottleneck, so\n\nwork on optimizing that first.\n\nOnce you find the bottleneck that needs to be optimized, it can be useful to\n\nbenchmark different potential solutions. The `microbenchmark` package can help\n\nyou choose between different options. 
Continuing with the simple example with\n\nthe `diamonds` dataset, compare the base `read.csv` function with `read_csv`\n\nfrom the `readr` package.\n\n```{r benchmark-01}\n\nlibrary(microbenchmark)\n\nmicrobenchmark(\n\n read.csv(\"optimization/data/diamonds.csv\"),\n\n readr::read_csv(\"optimization/data/diamonds.csv\")\n\n)\n\n```\n\nIn this case, `read_csv` is about twice as fast as the base R implementations.\n\n# Parallel Computing\n\nOften, time-intensive R code can be sped up by breaking the execution of\n\nthe job across additional cores of your computer. This is called parallel computing.\n\n## Learn `lapply`/`purrr::map`\n\nLearning the `lapply` (and variants) function from Base R or the `map` (and variants) function from the `purrr` package is the first step in learning to run R code in parallel. Once you understand how `lapply` and `map` work, running your code in parallel will be simple.\n\nSay you have a vector of numbers and want to find the square root of each one\n\n(ignore for now that `sqrt` is vectorized, which will be covered later).\n\nYou could write a for loop and iterate over each element of the vector:\n\n```{r apply-01}\n\nx <- c(1, 4, 9, 16)\n\nout <- vector(\"list\", length(x))\n\nfor (i in seq_along(x)) {\n\n out[[i]] <- sqrt(x[[i]])\n\n}\n\nunlist(out)\n\n```\n\nThe `lapply` function essentially handles the overhead of constructing a for\n\nloop for you. The syntax is:\n\n```{r apply-02, eval = FALSE}\n\nlapply(X, FUN, ...)\n\n```\n\n`lapply` will then take each element of `X` and apply the `FUN`ction to it.\n\nOur simple example then becomes:\n\n```{r apply-03}\n\nx <- c(1, 4, 9, 16)\n\nout <- lapply(x, sqrt)\n\nunlist(out)\n\n```\n\nThose working within the `tidyverse` may use `map` from the `purrr` package equivalently:\n\n```{r apply-04}\n\nlibrary(purrr)\n\nx <- c(1, 4, 9, 16)\n\nout <- map(x, sqrt)\n\nunlist(out)\n\n```\n\n## Motivating Example\n\nOnce you are comfortable with `lapply` and/or `map`, running the same code in\n\nparallel takes just an additional line of code.\n\nFor `lapply` users, the `future.apply` package contains an equivalent\n\n`future_lapply` function. Just be sure to call `plan(multiprocess)` beforehand,\n\nwhich will handle the back-end orchestration needed to run in parallel.\n\n```{r parallel-01}\n\n# install.packages(\"future.apply\")\n\nlibrary(future.apply)\n\nplan(multisession)\n\nout <- future_lapply(x, sqrt)\n\nunlist(out)\n```\n\nFor `purrr` users, the `furrr` (i.e., future purrr) package includes an\n\nequivalent `future_map` function:\n\n```{r parallel-02}\n\n# install.packages(\"furrr\")\n\nlibrary(furrr)\n\nplan(multisession)\n\ny <- future_map(x, sqrt)\n\nunlist(y)\n\n```\n\nHow much faster did this simple example run in parallel?\n\n```{r parallel-03}\n\nlibrary(future.apply)\n\nplan(multisession)\n\nx <- c(1, 4, 9, 16)\n\nmicrobenchmark::microbenchmark(\n\n sequential = lapply(x, sqrt),\n\n parallel = future_lapply(x, sqrt),\n\n unit = \"s\"\n\n)\n\n```\n\nParallelization was actually slower. In this case, the overhead of\n\nsetting the code to run in parallel far outweighed any performance gain. In\n\ngeneral, parallelization works well on long-running & compute intensive jobs.\n\n## A (somewhat) More Complex Example\n\nIn this example we'll use the `diamonds` dataset from `ggplot2` and perform a\n\nkmeans cluster. 
We'll use `lapply` to iterate the number of clusters from 2 to\n\n5:\n\n```{r kmeans-01}\n\ndf <- ggplot2::diamonds\n\ndf <- dplyr::select(df, -c(cut, color, clarity))\n\ncenters = 2:5\n\nsystem.time(\n\n lapply(centers,\n\n function(x) kmeans(df, centers = x, nstart = 500)\n\n )\n\n )\n\n```\n\nA now running the same code in parallel:\n\n```{r kmeans-02}\n\nlibrary(future.apply)\n\nplan(multisession)\n\nsystem.time(\n\n future_lapply(centers,\n\n function(x) kmeans(df, centers = x, nstart = 500)\n\n )\n\n )\n\n```\n\nWhile we didn't achieve perfect scaling, we still get a nice bump in execution\n\ntime.\n\n## Additional Packages\n\nFor the sake of ease and brevity, this guide focused on the `futures` framework\n\nfor parallelization. However, you should be aware that there are a number of\n\nother ways to parallelize your code.\n\n### The `parallel` Package\n\nThe `parallel` package is included in your base R installation. It includes\n\nanalogues of the various `apply` functions:\n\n- `parLapply`\n\n- `mclapply` - not available on Windows\n\nThese functions generally require more setup, especially on Windows machines.\n\n### The `doParallel` Package\n\nThe `doParallel` package builds off of `parallel` and is\n\nuseful for code that uses for loops instead of `lapply`. Like the parallel\n\npackage, it generally requires more setup, especially on Windows machines.\n\n### Machine Learning - `caret`\n\nFor those running machine learning models, the `caret` package can easily\n\nleverage `doParallel` to speed up the execution of multiple models. Lifting\n\nthe example from the package documentation:\n\n```{r caret-01, eval = FALSE}\n\nlibrary(doParallel)\n\ncl <- makePSOCKcluster(5) # number of cores to use\n\nregisterDoParallel(cl)\n\n## All subsequent models are then run in parallel\n\nmodel <- train(y ~ ., data = training, method = \"rf\")\n\n## When you are done:\n\nstopCluster(cl)\n\n```\n\nBe sure to check out the full\n\n[documentation](http://topepo.github.io/caret/parallel-processing.html)\n\nfor more detail.\n\n------------------------------------------------------------------------\n\n# Big Data\n\nAs data collection and storage becomes easier and cheaper, it is relatively\n\nsimple to obtain relatively large data files. An important point to keep in\n\nmind is that the size of your data will generally expand when it is read\n\nfrom a storage device into R. A general rule of thumb is that a file will take\n\nsomewhere around 3-4 times more space in memory than it does on disk.\n\nFor instance, compare the size of the `iris` data set when it is saved as a\n\n.csv file locally vs the size of the object when it is read in to an R session:\n\n```{r size-01, message = FALSE}\n\nfile.size(\"optimization/data/iris.csv\") / 1000\n\ndf <- readr::read_csv(\"optimization/data/iris.csv\")\n\npryr::object_size(df)\n\n```\n\nThis means that on a standard Urban Institute desktop, you may have issues\n\nreading in files that are larger than 4 GB.\n\n## Object Size\n\nThe type of your data can have a big impact on the size of your data frame\n\nwhen you are dealing with larger files. There are four main types of atomic\n\nvectors in R:\n\n1. `logical`\n\n2. `integer`\n\n3. `double` (also called `numeric`)\n\n4. `character`\n\n## Each of these data types occupies a different amount of space in memory\n\n`logical` and `integer` vectors use 4 bytes per element, while a `double` will\n\noccupy 8 bytes. 
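A quick check makes these per-element sizes concrete. This is a minimal sketch reusing `pryr::object_size` from elsewhere in this guide (the chunk name is ours, and the exact totals include a small fixed per-vector overhead that varies by R version):\n\n```{r size-bytes}\n\nn <- 1e4\n\n# logical and integer vectors: roughly 4 bytes per element\n\npryr::object_size(vector(\"logical\", n))\n\npryr::object_size(vector(\"integer\", n))\n\n# double vectors: roughly 8 bytes per element\n\npryr::object_size(vector(\"double\", n))\n\n```\n\n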
R uses a global string pool, so `character` vectors are hard\n\nto estimate, but will generally take up more space per element.\n\nConsider the following example:\n\n```{r size-02}\n\nx <- 1:100\n\npryr::object_size(x)\n\npryr::object_size(as.double(x))\n\npryr::object_size(as.character(x))\n\n```\n\nAn incorrect data type can easily cost you a lot of space in memory, especially\n\nat scale. This often happens when reading data from a text or csv file: data\n\nmay have a format such as `c(1.0, 2.0, 3.0)` and will be read in as a `numeric`\n\ncolumn, when `integer` is more appropriate and compact.\n\nYou may also be familiar with `factor` variables within R. Essentially, a\n\n`factor` will represent your data as integers, and map them back to their\n\ncharacter representation. This can save memory when you have a small\n\nnumber of unique levels spread across a long vector:\n\n```{r size-03}\n\nx <- sample(letters, 10000, replace = TRUE)\n\npryr::object_size(as.character(x))\n\npryr::object_size(as.factor(x))\n\n```\n\nHowever, if each element is unique, or if there is not a lot of overlap among\n\nelements, then the overhead will make a factor larger than its character\n\nrepresentation:\n\n```{r size-04}\n\npryr::object_size(as.factor(letters))\n\npryr::object_size(as.character(letters))\n\n```\n\n## Cloud Computing\n\nSometimes, you will have data that are simply too large to ever fit on your\n\nlocal desktop machine. If that is the case, then the Elastic Cloud Computing\n\nEnvironment from the Office of Technology and Data Science can provide you with\n\neasy access to powerful analytic tools for computationally intensive projects.\n\nThe Elastic Cloud Computing Environment allows researchers to quickly spin up\n\nan Amazon Web Services (AWS) Elastic Cloud Compute (EC2) instance. These\n\ninstances offer increased memory to read in large datasets, along with\n\nadditional CPUs to provide the ability to process data in parallel at an\n\nimpressive scale.\n\n| Instance | CPU | Memory (GB) |\n|--------------|-----|-------------|\n| Desktop | 8 | 16 |\n| c5.4xlarge | 16 | 32 |\n| c5.9xlarge | 36 | 72 |\n| c5.18xlarge | 72 | 144 |\n| x1e.8xlarge | 32 | 976 |\n| x1e.16xlarge | 64 | 1952 |\n\nFeel free to contact Erika Tyagi (etyagi\@urban.org) if this would be useful\n\nfor your project.\n\n------------------------------------------------------------------------\n\n# Common Pitfalls\n\n## For Loops and Vector Allocation\n\nA refrain you will often hear is that for loops in R are slow and need to be\n\navoided at all costs. This is not true! Rather, an improperly constructed loop\n\nin R can bring the execution of your program to a near standstill.\n\nA common for loop structure may look something like:\n\n```{r loop-01, eval = FALSE}\n\nx <- 1:100\n\nout <- c()\n\nfor (i in x) {\n\n out <- c(out, sqrt(i))\n\n }\n\n```\n\nThe bottleneck in this loop is the allocation of the vector `out`. Every\n\ntime we iterate over an item in `x` and append it to `out`, R makes a copy\n\nof all the items already in `out`. 
As the size of the loop grows, your code\n\nwill take longer and longer to run.\n\nA better practice is to pre-allocate `out` to be the correct length, and then\n\ninsert the results as the loop runs.\n\n```{r loop-03, eval = FALSE}\n\nx <- 1:100\n\nout <- rep(NA, length(x))\n\nfor (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n}\n\n```\n\nA quick benchmark shows how much more efficient a loop with a pre-allocated\n\nresults vector is:\n\n```{r loop-04}\n\nbad_loop <- function(x) {\n\n out <- c()\n\n for (i in x) {\n\n out <- c(out, sqrt(i))\n\n }\n\n}\n\ngood_loop <- function(x) {\n\n out <- rep(NA, length(x))\n\n for (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n }\n\n}\n\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(x),\n\n good_loop(x)\n\n)\n\n```\n\nAnd note how performance of the \"bad\" loop degrades as the loop size grows.\n\n```{r loop-05}\n\ny <- 1:250\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(y),\n\n good_loop(y)\n\n)\n\n```\n\n## Vectorized Functions\n\nMany functions in R are vectorized, meaning they can accept an entire vector\n\n(and not just a single value) as input. The `sqrt` function from the\n\nprior examples is one:\n\n```{r vectorised-01}\n\nx <- c(1, 4, 9, 16)\n\nsqrt(x)\n\n```\n\nThis removes the need to use `lapply` or a for loop. Vectorized functions in\n\nR are generally written in a compiled language like C, C++, or FORTRAN, which\n\nmakes their implementation faster.\n\n```{r vectorised-02}\n\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n lapply(x, sqrt),\n\n sqrt(x)\n\n)\n\n```\n","srcMarkdownNoYaml":"\n\n\n\n::: {#header}\n\n:::\n\n```{r markdown-setup, include=FALSE}\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\nknitr::opts_chunk$set(warning = FALSE)\n\noptions(scipen = 999)\n```\n\n# Introduction\n\nThis guide outlines tools and tips for improving the speed and execution of R code.\n\nSometimes, simply tweaking a few lines of code can lead to large performance gains in the execution of a program. Other issues may take more time to work through but can be a huge benefit to a project in the long term.\n\nAn important lesson to learn when it comes to optimizing an R (or any) program is knowing both when to start and when to stop. You most likely want to optimize your code because it is \"too slow\", but what that means will vary from project to project. Be sure to consider what \"fast enough\" is for your project and how much needs to be optimized. If your program takes an hour to complete, spending 5 hours trying to make it faster can be time well spent if the script will be run regularly, and a complete waste of time if it's an ad-hoc analysis.\n\nFor more information, see the CRAN Task View [High-Performance and Parallel Computing with R](https://CRAN.R-project.org/view=HighPerformanceComputing).\n\nThe \"Performant Code\" section of Hadley Wickham's [Advanced R](http://adv-r.had.co.nz/) is another great resource and provides a deeper dive into what is covered in this guide.\n\n------------------------------------------------------------------------\n\n# Update Your Installation\n\nOne of the easiest ways to improve the performance of R is to update R. In general, R will have a big annual release (e.g., 3.5.0) in the spring and around 3-4 smaller patch releases (e.g., 3.5.1) throughout the rest of the year. If the middle digit of your installation is behind the current release, you should consider updating.\n\nFor instance, R 3.5.0 implemented an improved read from text files. 
A 5GB file took over 5 minutes to read in 3.4.4:\n\n![](optimization/images/data-load-3-4.png){width=\"75%\"}\n\nWhile 3.5.0 took less than half the time:\n\n![](optimization/images/data-load-3-5.png){width=\"75%\"}\n\nTo see what the R-core development team is up to, check out the [NEWS](https://cran.r-project.org/doc/manuals/r-devel/NEWS.html) file from the R project.\n\n------------------------------------------------------------------------\n\n# Profiling & Benchmarking\n\nIn order to efficiently optimize your code, you'll first need to know where it's running slowest. The `profvis` package provides a nice way of visualizing the execution time and memory usage of your program.\n\n```{r profile-01}\nlibrary(profvis)\nlibrary(dplyr)\n\nprofvis({\n\tdiamonds <- read.csv(\"optimization/data/diamonds.csv\")\n\n\tdiamonds_by_cut <- diamonds %>%\n\t\tgroup_by(cut) %>%\n\t\tsummarise_if(is.numeric, mean)\n\n\twrite.csv(diamonds_by_cut, file = \"optimization/data/diamonds_by_cut.csv\")\n\n})\n\n```\n\nIn this toy example, it looks like the `read.csv` function is the bottleneck, so\n\nwork on optimizing that first.\n\nOnce you find the bottleneck that needs to be optimized, it can be useful to\n\nbenchmark different potential solutions. The `microbenchmark` package can help\n\nyou choose between different options. Continuing with the simple example with\n\nthe `diamonds` dataset, compare the base `read.csv` function with `read_csv`\n\nfrom the `readr` package.\n\n```{r benchmark-01}\n\nlibrary(microbenchmark)\n\nmicrobenchmark(\n\n read.csv(\"optimization/data/diamonds.csv\"),\n\n readr::read_csv(\"optimization/data/diamonds.csv\")\n\n)\n\n```\n\nIn this case, `read_csv` is about twice as fast as the base R implementation.\n\n# Parallel Computing\n\nOften, time-intensive R code can be sped up by breaking the execution of\n\nthe job across additional cores of your computer. This is called parallel computing.\n\n## Learn `lapply`/`purrr::map`\n\nLearning the `lapply` (and variants) function from base R or the `map` (and variants) function from the `purrr` package is the first step in learning to run R code in parallel. Once you understand how `lapply` and `map` work, running your code in parallel will be simple.\n\nSay you have a vector of numbers and want to find the square root of each one\n\n(ignore for now that `sqrt` is vectorized, which will be covered later).\n\nYou could write a for loop and iterate over each element of the vector:\n\n```{r apply-01}\n\nx <- c(1, 4, 9, 16)\n\nout <- vector(\"list\", length(x))\n\nfor (i in seq_along(x)) {\n\n out[[i]] <- sqrt(x[[i]])\n\n}\n\nunlist(out)\n\n```\n\nThe `lapply` function essentially handles the overhead of constructing a for\n\nloop for you. The syntax is:\n\n```{r apply-02, eval = FALSE}\n\nlapply(X, FUN, ...)\n\n```\n\n`lapply` will then take each element of `X` and apply the `FUN`ction to it.\n\nOur simple example then becomes:\n\n```{r apply-03}\n\nx <- c(1, 4, 9, 16)\n\nout <- lapply(x, sqrt)\n\nunlist(out)\n\n```\n\nThose working within the `tidyverse` may use `map` from the `purrr` package equivalently:\n\n```{r apply-04}\n\nlibrary(purrr)\n\nx <- c(1, 4, 9, 16)\n\nout <- map(x, sqrt)\n\nunlist(out)\n\n```\n\n## Motivating Example\n\nOnce you are comfortable with `lapply` and/or `map`, running the same code in\n\nparallel takes just an additional line of code.\n\nFor `lapply` users, the `future.apply` package contains an equivalent\n\n`future_lapply` function. 
Just be sure to call `plan(multisession)` beforehand,\n\nwhich will handle the back-end orchestration needed to run in parallel.\n\n```{r parallel-01}\n\n# install.packages(\"future.apply\")\n\nlibrary(future.apply)\n\nplan(multisession)\n\nout <- future_lapply(x, sqrt)\n\nunlist(out)\n\n```\n\nFor `purrr` users, the `furrr` (i.e., future purrr) package includes an\n\nequivalent `future_map` function:\n\n```{r parallel-02}\n\n# install.packages(\"furrr\")\n\nlibrary(furrr)\n\nplan(multisession)\n\ny <- future_map(x, sqrt)\n\nunlist(y)\n\n```\n\nHow much faster did this simple example run in parallel?\n\n```{r parallel-03}\n\nlibrary(future.apply)\n\nplan(multisession)\n\nx <- c(1, 4, 9, 16)\n\nmicrobenchmark::microbenchmark(\n\n sequential = lapply(x, sqrt),\n\n parallel = future_lapply(x, sqrt),\n\n unit = \"s\"\n\n)\n\n```\n\nParallelization was actually slower. In this case, the overhead of\n\nsetting the code to run in parallel far outweighed any performance gain. In\n\ngeneral, parallelization works well on long-running & compute-intensive jobs.\n\n## A (somewhat) More Complex Example\n\nIn this example we'll use the `diamonds` dataset from `ggplot2` and perform\n\nk-means clustering. We'll use `lapply` to iterate the number of clusters from 2 to\n\n5:\n\n```{r kmeans-01}\n\ndf <- ggplot2::diamonds\n\ndf <- dplyr::select(df, -c(cut, color, clarity))\n\ncenters <- 2:5\n\nsystem.time(\n\n lapply(centers,\n\n function(x) kmeans(df, centers = x, nstart = 500)\n\n )\n\n )\n\n```\n\nAnd now running the same code in parallel:\n\n```{r kmeans-02}\n\nlibrary(future.apply)\n\nplan(multisession)\n\nsystem.time(\n\n future_lapply(centers,\n\n function(x) kmeans(df, centers = x, nstart = 500)\n\n )\n\n )\n\n```\n\nWhile we didn't achieve perfect scaling, we still get a nice bump in execution\n\ntime.\n\n## Additional Packages\n\nFor the sake of ease and brevity, this guide focused on the `future` framework\n\nfor parallelization. However, you should be aware that there are a number of\n\nother ways to parallelize your code.\n\n### The `parallel` Package\n\nThe `parallel` package is included in your base R installation. It includes\n\nanalogues of the various `apply` functions:\n\n- `parLapply`\n\n- `mclapply` (not available on Windows)\n\nThese functions generally require more setup, especially on Windows machines.\n\n### The `doParallel` Package\n\nThe `doParallel` package builds on `parallel` and is\n\nuseful for code that uses for loops instead of `lapply`. Like the `parallel`\n\npackage, it generally requires more setup, especially on Windows machines.\n\n### Machine Learning - `caret`\n\nFor those running machine learning models, the `caret` package can easily\n\nleverage `doParallel` to speed up the execution of multiple models. Lifting\n\nthe example from the package documentation:\n\n```{r caret-01, eval = FALSE}\n\nlibrary(doParallel)\n\ncl <- makePSOCKcluster(5) # number of cores to use\n\nregisterDoParallel(cl)\n\n## All subsequent models are then run in parallel\n\nmodel <- train(y ~ ., data = training, method = \"rf\")\n\n## When you are done:\n\nstopCluster(cl)\n\n```\n\nBe sure to check out the full\n\n[documentation](http://topepo.github.io/caret/parallel-processing.html)\n\nfor more detail.\n\n------------------------------------------------------------------------\n\n# Big Data\n\nAs data collection and storage become easier and cheaper, it is relatively\n\nsimple to obtain large data files. 
An important point to keep in\n\nmind is that the size of your data will generally expand when it is read\n\nfrom a storage device into R. A general rule of thumb is that a file will take\n\nsomewhere around 3-4 times more space in memory than it does on disk.\n\nFor instance, compare the size of the `iris` data set when it is saved as a\n\n.csv file locally vs. the size of the object when it is read into an R session:\n\n```{r size-01, message = FALSE}\n\nfile.size(\"optimization/data/iris.csv\") / 1000\n\ndf <- readr::read_csv(\"optimization/data/iris.csv\")\n\npryr::object_size(df)\n\n```\n\nThis means that on a standard Urban Institute desktop, you may have issues\n\nreading in files that are larger than 4 GB.\n\n## Object Size\n\nThe type of your data can have a big impact on the size of your data frame\n\nwhen you are dealing with larger files. There are four main types of atomic\n\nvectors in R:\n\n1. `logical`\n\n2. `integer`\n\n3. `double` (also called `numeric`)\n\n4. `character`\n\nEach of these data types occupies a different amount of space in memory. `logical` and `integer` vectors use 4 bytes per element, while a `double` will\n\noccupy 8 bytes. R uses a global string pool, so `character` vectors are hard\n\nto estimate, but will generally take up more space per element.\n\nConsider the following example:\n\n```{r size-02}\n\nx <- 1:100\n\npryr::object_size(x)\n\npryr::object_size(as.double(x))\n\npryr::object_size(as.character(x))\n\n```\n\nAn incorrect data type can easily cost you a lot of space in memory, especially\n\nat scale. This often happens when reading data from a text or csv file: data\n\nmay have a format such as `c(1.0, 2.0, 3.0)` and will be read in as a `numeric`\n\ncolumn, when `integer` is more appropriate and compact.\n\nYou may also be familiar with `factor` variables within R. Essentially, a\n\n`factor` will represent your data as integers, and map them back to their\n\ncharacter representation. This can save memory when you have a small\n\nnumber of unique levels spread across a long vector:\n\n```{r size-03}\n\nx <- sample(letters, 10000, replace = TRUE)\n\npryr::object_size(as.character(x))\n\npryr::object_size(as.factor(x))\n\n```\n\nHowever, if each element is unique, or if there is not a lot of overlap among\n\nelements, then the overhead will make a factor larger than its character\n\nrepresentation:\n\n```{r size-04}\n\npryr::object_size(as.factor(letters))\n\npryr::object_size(as.character(letters))\n\n```\n\n## Cloud Computing\n\nSometimes, you will have data that are simply too large to ever fit on your\n\nlocal desktop machine. If that is the case, then the Elastic Cloud Computing\n\nEnvironment from the Office of Technology and Data Science can provide you with\n\neasy access to powerful analytic tools for computationally intensive projects.\n\nThe Elastic Cloud Computing Environment allows researchers to quickly spin up\n\nan Amazon Web Services (AWS) Elastic Cloud Compute (EC2) instance. 
These\n\ninstances offer increased memory to read in large datasets, along with\n\nadditional CPUs to provide the ability to process data in parallel at an\n\nimpressive scale.\n\n| Instance | CPU | Memory (GB) |\n|--------------|-----|-------------|\n| Desktop | 8 | 16 |\n| c5.4xlarge | 16 | 32 |\n| c5.9xlarge | 36 | 72 |\n| c5.18xlarge | 72 | 144 |\n| x1e.8xlarge | 32 | 976 |\n| x1e.16xlarge | 64 | 1952 |\n\nFeel free to contact Erika Tyagi (etyagi\@urban.org) if this would be useful\n\nfor your project.\n\n------------------------------------------------------------------------\n\n# Common Pitfalls\n\n## For Loops and Vector Allocation\n\nA refrain you will often hear is that for loops in R are slow and need to be\n\navoided at all costs. This is not true! Rather, an improperly constructed loop\n\nin R can bring the execution of your program to a near standstill.\n\nA common for loop structure may look something like:\n\n```{r loop-01, eval = FALSE}\n\nx <- 1:100\n\nout <- c()\n\nfor (i in x) {\n\n out <- c(out, sqrt(i))\n\n }\n\n```\n\nThe bottleneck in this loop is the allocation of the vector `out`. Every\n\ntime we iterate over an item in `x` and append it to `out`, R makes a copy\n\nof all the items already in `out`. As the size of the loop grows, your code\n\nwill take longer and longer to run.\n\nA better practice is to pre-allocate `out` to be the correct length, and then\n\ninsert the results as the loop runs.\n\n```{r loop-03, eval = FALSE}\n\nx <- 1:100\n\nout <- rep(NA, length(x))\n\nfor (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n}\n\n```\n\nA quick benchmark shows how much more efficient a loop with a pre-allocated\n\nresults vector is:\n\n```{r loop-04}\n\nbad_loop <- function(x) {\n\n out <- c()\n\n for (i in x) {\n\n out <- c(out, sqrt(i))\n\n }\n\n}\n\ngood_loop <- function(x) {\n\n out <- rep(NA, length(x))\n\n for (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n }\n\n}\n\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(x),\n\n good_loop(x)\n\n)\n\n```\n\nAnd note how performance of the \"bad\" loop degrades as the loop size grows.\n\n```{r loop-05}\n\ny <- 1:250\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(y),\n\n good_loop(y)\n\n)\n\n```\n\n## Vectorized Functions\n\nMany functions in R are vectorized, meaning they can accept an entire vector\n\n(and not just a single value) as input. The `sqrt` function from the\n\nprior examples is one:\n\n```{r vectorised-01}\n\nx <- c(1, 4, 9, 16)\n\nsqrt(x)\n\n```\n\nThis removes the need to use `lapply` or a for loop. 
Vectorized functions in\n\nR are generally written in a compiled language like C, C++, or FORTRAN, which\n\nmakes their implementation faster.\n\n```{r vectorised-02}\n\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n lapply(x, sqrt),\n\n sqrt(x)\n\n)\n\n```\n"},"formats":{"html":{"identifier":{"display-name":"HTML","target-format":"html","base-format":"html"},"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"inline-includes":false,"preserve-yaml":false,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[],"notebook-links":true,"format-links":true},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"optimization.html"},"language":{"toc-title-document":"Table of contents","toc-title-website":"On this page","related-formats-title":"Other Formats","related-notebooks-title":"Notebooks","source-notebooks-prefix":"Source","section-title-abstract":"Abstract","section-title-appendices":"Appendices","section-title-footnotes":"Footnotes","section-title-references":"References","section-title-reuse":"Reuse","section-title-copyright":"Copyright","section-title-citation":"Citation","appendix-attribution-cite-as":"For attribution, please cite this work as:","appendix-attribution-bibtex":"BibTeX citation:","title-block-author-single":"Author","title-block-author-plural":"Authors","title-block-affiliation-single":"Affiliation","title-block-affiliation-plural":"Affiliations","title-block-published":"Published","title-block-modified":"Modified","callout-tip-title":"Tip","callout-note-title":"Note","callout-warning-title":"Warning","callout-important-title":"Important","callout-caution-title":"Caution","code-summary":"Code","code-tools-menu-caption":"Code","code-tools-show-all-code":"Show All Code","code-tools-hide-all-code":"Hide All Code","code-tools-view-source":"View Source","code-tools-source-code":"Source Code","code-line":"Line","code-lines":"Lines","copy-button-tooltip":"Copy to Clipboard","copy-button-tooltip-success":"Copied!","repo-action-links-edit":"Edit this page","repo-action-links-source":"View source","repo-action-links-issue":"Report an issue","back-to-top":"Back to top","search-no-results-text":"No results","search-matching-documents-text":"matching documents","search-copy-link-title":"Copy link to search","search-hide-matches-text":"Hide additional matches","search-more-match-text":"more match in this 
document","search-more-matches-text":"more matches in this document","search-clear-button-title":"Clear","search-detached-cancel-button-title":"Cancel","search-submit-button-title":"Submit","search-label":"Search","toggle-section":"Toggle section","toggle-sidebar":"Toggle sidebar navigation","toggle-dark-mode":"Toggle dark mode","toggle-reader-mode":"Toggle reader mode","toggle-navigation":"Toggle navigation","crossref-fig-title":"Figure","crossref-tbl-title":"Table","crossref-lst-title":"Listing","crossref-thm-title":"Theorem","crossref-lem-title":"Lemma","crossref-cor-title":"Corollary","crossref-prp-title":"Proposition","crossref-cnj-title":"Conjecture","crossref-def-title":"Definition","crossref-exm-title":"Example","crossref-exr-title":"Exercise","crossref-ch-prefix":"Chapter","crossref-apx-prefix":"Appendix","crossref-sec-prefix":"Section","crossref-eq-prefix":"Equation","crossref-lof-title":"List of Figures","crossref-lot-title":"List of Tables","crossref-lol-title":"List of Listings","environment-proof-title":"Proof","environment-remark-title":"Remark","environment-solution-title":"Solution","listing-page-order-by":"Order By","listing-page-order-by-default":"Default","listing-page-order-by-date-asc":"Oldest","listing-page-order-by-date-desc":"Newest","listing-page-order-by-number-desc":"High to Low","listing-page-order-by-number-asc":"Low to High","listing-page-field-date":"Date","listing-page-field-title":"Title","listing-page-field-description":"Description","listing-page-field-author":"Author","listing-page-field-filename":"File Name","listing-page-field-filemodified":"Modified","listing-page-field-subtitle":"Subtitle","listing-page-field-readingtime":"Reading Time","listing-page-field-categories":"Categories","listing-page-minutes-compact":"{0} min","listing-page-category-all":"All","listing-page-no-matches":"No matching items"},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.3.433"},"extensions":{"book":{"multiFile":true}}}},"projectFormats":["html"]}
\ No newline at end of file
diff --git a/.quarto/idx/resources.qmd.json b/.quarto/idx/resources.qmd.json
index 6489c7e..66ed437 100644
--- a/.quarto/idx/resources.qmd.json
+++ b/.quarto/idx/resources.qmd.json
@@ -1 +1 @@
-{"title":"Free Books","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}}},"headingText":"Free Books","containsRefs":false,"markdown":"\n\n\n\n
\n\n
\n\n\n### Intro\n\n* [R for Data Science](https://r4ds.had.co.nz/) by Garrett Grolemund and Hadley Wickham\n\n### Data Viz\n\n* [ggplot2: Elegant Graphics for Data Analysis](https://ggplot2-book.org/) by Hadley Wickham\n* [Data Visualization: A Practical Introduction](http://socviz.co/index.html#preface) by Kieran Healy\n\n### *down\n\n* [R Markdown: The Definitive Guide](https://bookdown.org/yihui/rmarkdown/) by Yihui Xie, J. J. Allaire, and Garrett Grolemund\n* [blogdown: Creating Websites with R Markdown](https://bookdown.org/yihui/blogdown/) by Yihui Xie, Amber Thomas, and Alison Presmanes Hill\n* [bookdown: Authoring Books and Technical Documents with R Markdown](https://bookdown.org/yihui/bookdown/) by Yihui Xie\n\n### Statistics\n\n* [Learning Statistics with R](https://learningstatisticswithr.com/) by Danielle Navarro\n* [Introduction to Econometrics with R](https://www.econometrics-with-r.org/) by Christoph Hanck, Martin Arnold, Alexander Gerber, and Martin Schmelzer\n* [An Introduction to Bayesian Thinking](https://statswithr.github.io/book/) by Merlise Clyde et al.\n* [Statistical Inference via Data Science](https://moderndive.com/index.html) by Chester Ismay and Albert Y. Kim\n\n### Machine Learning\n\n* [Hands-On Machine Learning with R](https://bradleyboehmke.github.io/HOML/) by Bradley Boehmke & Brandon Greenwell\n* [Feature Engineering and Selection: A Practical Approach for Predictive Models](http://www.feat.engineering/) by Max Kuhn and Kjell Johnson\n\n### Mapping and Geospatial Analysis\n\n* [Geocomputation with R](https://geocompr.robinlovelace.net/) by Robin Lovelace, Jakub Nowosad, and Jannes Muenchow\n\n### Text Analysis\n\n* [Text Mining with R: A Tidy Approach](https://www.tidytextmining.com/) by Julia Silge and David Robinson\n\n### Programming\n\n* [Advanced R](https://adv-r.hadley.nz/) by Hadley Wickham\n* [R Packages](https://r-pkgs.org/) by Hadley Wickham\n* [Mastering Spark with R](https://therinspark.com/) by Javier Luraschi, Kevin Kuo, and Edgar Ruiz\n* [Functional programming and unit testing for data munging with R](https://b-rodrigues.github.io/fput/) by Bruno Rodrigues\n\n# Websites\n\n* [RStudio Essentials](https://resources.rstudio.com/)\n* [RStudio Education](https://education.rstudio.com/)\n* [R Cheat Sheets](https://rstudio.com/resources/cheatsheets/)\n* Andrew Heiss' free [Data Viz 
Course](https://datavizm20.classes.andrewheiss.com/)\n"},"formats":{"html":{"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"markdown"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[]},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"resources.html"},"language":{},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.2.269"},"extensions":{"book":{"multiFile":true}}}}}
\ No newline at end of file
+{"title":"Free Books","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}}},"headingText":"Free Books","containsRefs":false,"markdown":"\n\n\n\n
\n\n
\n\n\n### Intro\n\n* [R for Data Science](https://r4ds.had.co.nz/) by Garrett Grolemund and Hadley Wickham\n\n### Data Viz\n\n* [ggplot2: Elegant Graphics for Data Analysis](https://ggplot2-book.org/) by Hadley Wickham\n* [Data Visualization: A Practical Introduction](http://socviz.co/index.html#preface) by Kieran Healy\n\n### *down\n\n* [R Markdown: The Definitive Guide](https://bookdown.org/yihui/rmarkdown/) by Yihui Xie, J. J. Allaire, and Garrett Grolemund\n* [blogdown: Creating Websites with R Markdown](https://bookdown.org/yihui/blogdown/) by Yihui Xie, Amber Thomas, and Alison Presmanes Hill\n* [bookdown: Authoring Books and Technical Documents with R Markdown](https://bookdown.org/yihui/bookdown/) by Yihui Xie\n\n### Statistics\n\n* [Learning Statistics with R](https://learningstatisticswithr.com/) by Danielle Navarro\n* [Introduction to Econometrics with R](https://www.econometrics-with-r.org/) by Christoph Hanck, Martin Arnold, Alexander Gerber, and Martin Schmelzer\n* [An Introduction to Bayesian Thinking](https://statswithr.github.io/book/) by Merlise Clyde et al.\n* [Statistical Inference via Data Science](https://moderndive.com/index.html) by Chester Ismay and Albert Y. Kim\n\n### Machine Learning\n\n* [Hands-On Machine Learning with R](https://bradleyboehmke.github.io/HOML/) by Bradley Boehmke & Brandon Greenwell\n* [Feature Engineering and Selection: A Practical Approach for Predictive Models](http://www.feat.engineering/) by Max Kuhn and Kjell Johnson\n\n### Mapping and Geospatial Analysis\n\n* [Geocomputation with R](https://geocompr.robinlovelace.net/) by Robin Lovelace, Jakub Nowosad, and Jannes Muenchow\n\n### Text Analysis\n\n* [Text Mining with R: A Tidy Approach](https://www.tidytextmining.com/) by Julia Silge and David Robinson\n\n### Programming\n\n* [Advanced R](https://adv-r.hadley.nz/) by Hadley Wickham\n* [R Packages](https://r-pkgs.org/) by Hadley Wickham\n* [Mastering Spark with R](https://therinspark.com/) by Javier Luraschi, Kevin Kuo, and Edgar Ruiz\n* [Functional programming and unit testing for data munging with R](https://b-rodrigues.github.io/fput/) by Bruno Rodrigues\n\n# Websites\n\n* [RStudio Essentials](https://resources.rstudio.com/)\n* [RStudio Education](https://education.rstudio.com/)\n* [R Cheat Sheets](https://rstudio.com/resources/cheatsheets/)\n* Andrew Heiss' free [Data Viz Course](https://datavizm20.classes.andrewheiss.com/)\n","srcMarkdownNoYaml":"\n\n\n\n
\n\n
\n\n# Free Books\n\n### Intro\n\n* [R for Data Science](https://r4ds.had.co.nz/) by Garrett Grolemund and Hadley Wickham\n\n### Data Viz\n\n* [ggplot2: Elegant Graphics for Data Analysis](https://ggplot2-book.org/) by Hadley Wickham\n* [Data Visualization: A Practical Introduction](http://socviz.co/index.html#preface) by Kieran Healy\n\n### *down\n\n* [R Markdown: The Definitive Guide](https://bookdown.org/yihui/rmarkdown/) by Yihui Xie, J. J. Allaire, and Garrett Grolemund\n* [blogdown: Creating Websites with R Markdown](https://bookdown.org/yihui/blogdown/) by Yihui Xie, Amber Thomas, and Alison Presmanes Hill\n* [bookdown: Authoring Books and Technical Documents with R Markdown](https://bookdown.org/yihui/bookdown/) by Yihui Xie\n\n### Statistics\n\n* [Learning Statistics with R](https://learningstatisticswithr.com/) by Danielle Navarro\n* [Introduction to Econometrics with R](https://www.econometrics-with-r.org/) by Christoph Hanck, Martin Arnold, Alexander Gerber, and Martin Schmelzer\n* [An Introduction to Bayesian Thinking](https://statswithr.github.io/book/) by Merlise Clyde et al.\n* [Statistical Inference via Data Science](https://moderndive.com/index.html) by Chester Ismay and Albert Y. Kim\n\n### Machine Learning\n\n* [Hands-On Machine Learning with R](https://bradleyboehmke.github.io/HOML/) by Bradley Boehmke & Brandon Greenwell\n* [Feature Engineering and Selection: A Practical Approach for Predictive Models](http://www.feat.engineering/) by Max Kuhn and Kjell Johnson\n\n### Mapping and Geospatial Analysis\n\n* [Geocomputation with R](https://geocompr.robinlovelace.net/) by Robin Lovelace, Jakub Nowosad, and Jannes Muenchow\n\n### Text Analysis\n\n* [Text Mining with R: A Tidy Approach](https://www.tidytextmining.com/) by Julia Silge and David Robinson\n\n### Programming\n\n* [Advanced R](https://adv-r.hadley.nz/) by Hadley Wickham\n* [R Packages](https://r-pkgs.org/) by Hadley Wickham\n* [Mastering Spark with R](https://therinspark.com/) by Javier Luraschi, Kevin Kuo, and Edgar Ruiz\n* [Functional programming and unit testing for data munging with R](https://b-rodrigues.github.io/fput/) by Bruno Rodrigues\n\n# Websites\n\n* [RStudio Essentials](https://resources.rstudio.com/)\n* [RStudio Education](https://education.rstudio.com/)\n* [R Cheat Sheets](https://rstudio.com/resources/cheatsheets/)\n* Andrew Heiss' free [Data Viz 
Course](https://datavizm20.classes.andrewheiss.com/)\n"},"formats":{"html":{"identifier":{"display-name":"HTML","target-format":"html","base-format":"html"},"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"markdown"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"inline-includes":false,"preserve-yaml":false,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[],"notebook-links":true,"format-links":true},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"resources.html"},"language":{"toc-title-document":"Table of contents","toc-title-website":"On this page","related-formats-title":"Other Formats","related-notebooks-title":"Notebooks","source-notebooks-prefix":"Source","section-title-abstract":"Abstract","section-title-appendices":"Appendices","section-title-footnotes":"Footnotes","section-title-references":"References","section-title-reuse":"Reuse","section-title-copyright":"Copyright","section-title-citation":"Citation","appendix-attribution-cite-as":"For attribution, please cite this work as:","appendix-attribution-bibtex":"BibTeX citation:","title-block-author-single":"Author","title-block-author-plural":"Authors","title-block-affiliation-single":"Affiliation","title-block-affiliation-plural":"Affiliations","title-block-published":"Published","title-block-modified":"Modified","callout-tip-title":"Tip","callout-note-title":"Note","callout-warning-title":"Warning","callout-important-title":"Important","callout-caution-title":"Caution","code-summary":"Code","code-tools-menu-caption":"Code","code-tools-show-all-code":"Show All Code","code-tools-hide-all-code":"Hide All Code","code-tools-view-source":"View Source","code-tools-source-code":"Source Code","code-line":"Line","code-lines":"Lines","copy-button-tooltip":"Copy to Clipboard","copy-button-tooltip-success":"Copied!","repo-action-links-edit":"Edit this page","repo-action-links-source":"View source","repo-action-links-issue":"Report an issue","back-to-top":"Back to top","search-no-results-text":"No results","search-matching-documents-text":"matching documents","search-copy-link-title":"Copy link to search","search-hide-matches-text":"Hide additional matches","search-more-match-text":"more match in this document","search-more-matches-text":"more matches in this document","search-clear-button-title":"Clear","search-detached-cancel-button-title":"Cancel","search-submit-button-title":"Submit","search-label":"Search","toggle-section":"Toggle 
section","toggle-sidebar":"Toggle sidebar navigation","toggle-dark-mode":"Toggle dark mode","toggle-reader-mode":"Toggle reader mode","toggle-navigation":"Toggle navigation","crossref-fig-title":"Figure","crossref-tbl-title":"Table","crossref-lst-title":"Listing","crossref-thm-title":"Theorem","crossref-lem-title":"Lemma","crossref-cor-title":"Corollary","crossref-prp-title":"Proposition","crossref-cnj-title":"Conjecture","crossref-def-title":"Definition","crossref-exm-title":"Example","crossref-exr-title":"Exercise","crossref-ch-prefix":"Chapter","crossref-apx-prefix":"Appendix","crossref-sec-prefix":"Section","crossref-eq-prefix":"Equation","crossref-lof-title":"List of Figures","crossref-lot-title":"List of Tables","crossref-lol-title":"List of Listings","environment-proof-title":"Proof","environment-remark-title":"Remark","environment-solution-title":"Solution","listing-page-order-by":"Order By","listing-page-order-by-default":"Default","listing-page-order-by-date-asc":"Oldest","listing-page-order-by-date-desc":"Newest","listing-page-order-by-number-desc":"High to Low","listing-page-order-by-number-asc":"Low to High","listing-page-field-date":"Date","listing-page-field-title":"Title","listing-page-field-description":"Description","listing-page-field-author":"Author","listing-page-field-filename":"File Name","listing-page-field-filemodified":"Modified","listing-page-field-subtitle":"Subtitle","listing-page-field-readingtime":"Reading Time","listing-page-field-categories":"Categories","listing-page-minutes-compact":"{0} min","listing-page-category-all":"All","listing-page-no-matches":"No matching items"},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.3.433"},"extensions":{"book":{"multiFile":true}}}},"projectFormats":["html"]}
\ No newline at end of file
diff --git a/docs/.DS_Store b/docs/.DS_Store
deleted file mode 100644
index 5f93ba0..0000000
Binary files a/docs/.DS_Store and /dev/null differ
diff --git a/docs/getting-data.html b/docs/getting-data.html
index ac4dae6..c51eec7 100644
--- a/docs/getting-data.html
+++ b/docs/getting-data.html
@@ -2,7 +2,7 @@
-
+
@@ -11,10 +11,16 @@
@@ -112,7 +88,8 @@
"search-more-matches-text": "more matches in this document",
"search-clear-button-title": "Clear",
"search-detached-cancel-button-title": "Cancel",
- "search-submit-button-title": "Submit"
+ "search-submit-button-title": "Submit",
+ "search-label": "Search"
}
}
@@ -136,37 +113,48 @@
@@ -176,7 +164,7 @@
We can make any of the previous plots interactive with the powerful and easy-to-use plotly library. All we have to do is wrap a ggplot object in the ggplotly function. Note: you can’t pipe ggplotly onto the end of a ggplot chain; you have to save the ggplot as a variable and then wrap that variable in the function call, as shown below.
+
You can customize the tooltip text by adding a value to text in aes() and then specifying tooltip = "text" in the ggplotly call.
+
+
library(plotly)
+
+stock_plot <- as_tibble(EuStockMarkets) %>%
+  mutate(date = time(EuStockMarkets)) %>%
+  gather(key = "key", value = "value", -date) %>%
+  ggplot(mapping = aes(x = date, y = value, color = key,
+                       # sometimes ggplotly messes with line charts;
+                       # adding a group value usually helps with that
+                       group = key,
+                       # customize the tooltip with the text aes
+                       text = paste0("Value: ", round(value, 2), "<br>",
+                                     "Date: ", round(date, 3), "<br>",
+                                     "Key: ", key))) +
+  geom_line() +
+  scale_x_continuous(expand = expansion(mult = c(0.002, 0)),
+                     limits = c(1991, 1999),
+                     breaks = c(1991, 1993, 1995, 1997, 1999)) +
+  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),
+                     breaks = 0:4 * 2500,
+                     labels = scales::dollar,
+                     limits = c(0, 10000)) +
+  labs(x = "Date",
+       y = "Value")
+
+# make interactive with ggplotly
+# uncomment the pipe to hide the interactive toolbar in the top right
+ggplotly(stock_plot, tooltip = "text") # %>% config(displayModeBar = FALSE)
+
+
+
+
+
+
urbnthemes
@@ -1683,17 +1705,17 @@
Overview
Usage
Use set_urbn_defaults(style = "print") to set the default styles. scatter_grid(), remove_ticks(), add_axis(), and remove_axis() can all be used to improve graphics.
Sometimes it’s important to place a horizontal y-axis title above the plot. urbn_y_title() can be used for this task. The following example goes one step further and adds the title between the legend and the plot.
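Since the original example code isn’t shown here, below is a minimal sketch of that pattern (assuming the urbn_plot(), get_legend(), and remove_legend() helpers from urbnthemes; the data and arguments are illustrative):

library(ggplot2)
library(urbnthemes)

set_urbn_defaults(style = "print")

plot <- ggplot(mtcars, aes(x = wt, y = mpg, color = factor(cyl))) +
  geom_point() +
  scatter_grid()

# stack the legend, a horizontal y-axis title, and the legend-less plot
urbn_plot(get_legend(plot),
          urbn_y_title("Miles per gallon"),
          remove_legend(plot),
          ncol = 1)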
+  const selectorForAnnotation = (cell, annotation) => {
+ let cellAttr = 'data-code-cell="' + cell + '"';
+ let lineAttr = 'data-code-annotation="' + annotation + '"';
+ const selector = 'span[' + cellAttr + '][' + lineAttr + ']';
+ return selector;
+ }
+ const selectCodeLines = (annoteEl) => {
+ const doc = window.document;
+ const targetCell = annoteEl.getAttribute("data-target-cell");
+ const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
+ const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
+ const lines = annoteSpan.getAttribute("data-code-lines").split(",");
+ const lineIds = lines.map((line) => {
+ return targetCell + "-" + line;
+ })
+ let top = null;
+ let height = null;
+ let parent = null;
+ if (lineIds.length > 0) {
+ //compute the position of the single el (top and bottom and make a div)
+ const el = window.document.getElementById(lineIds[0]);
+ top = el.offsetTop;
+ height = el.offsetHeight;
+ parent = el.parentElement.parentElement;
+ if (lineIds.length > 1) {
+ const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
+ const bottom = lastEl.offsetTop + lastEl.offsetHeight;
+ height = bottom - top;
+ }
+ if (top !== null && height !== null && parent !== null) {
+ // cook up a div (if necessary) and position it
+ let div = window.document.getElementById("code-annotation-line-highlight");
+ if (div === null) {
+ div = window.document.createElement("div");
+ div.setAttribute("id", "code-annotation-line-highlight");
+ div.style.position = 'absolute';
+ parent.appendChild(div);
+ }
+ div.style.top = top - 2 + "px";
+ div.style.height = height + 4 + "px";
+ let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
+ if (gutterDiv === null) {
+ gutterDiv = window.document.createElement("div");
+ gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
+ gutterDiv.style.position = 'absolute';
+ const codeCell = window.document.getElementById(targetCell);
+ const gutter = codeCell.querySelector('.code-annotation-gutter');
+ gutter.appendChild(gutterDiv);
+ }
+ gutterDiv.style.top = top - 2 + "px";
+ gutterDiv.style.height = height + 4 + "px";
+ }
+ selectedAnnoteEl = annoteEl;
+ }
+ };
+ const unselectCodeLines = () => {
+ const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
+ elementsIds.forEach((elId) => {
+ const div = window.document.getElementById(elId);
+ if (div) {
+ div.remove();
+ }
+ });
+ selectedAnnoteEl = undefined;
+ };
+ // Attach click handler to the DT
+ const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
+ for (const annoteDlNode of annoteDls) {
+ annoteDlNode.addEventListener('click', (event) => {
+ const clickedEl = event.target;
+ if (clickedEl !== selectedAnnoteEl) {
+ unselectCodeLines();
+ const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
+ if (activeEl) {
+ activeEl.classList.remove('code-annotation-active');
+ }
+ selectCodeLines(clickedEl);
+ clickedEl.classList.add('code-annotation-active');
+ } else {
+ // Unselect the line
+ unselectCodeLines();
+ clickedEl.classList.remove('code-annotation-active');
+ }
+ });
+ }
+ const findCites = (el) => {
+ const parentEl = el.parentElement;
+ if (parentEl) {
+ const cites = parentEl.dataset.cites;
+ if (cites) {
+ return {
+ el,
+ cites: cites.split(' ')
+ };
+ } else {
+ return findCites(el.parentElement)
+ }
+ } else {
+ return undefined;
+ }
+ };
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
for (var i=0; i<bibliorefs.length; i++) {
diff --git a/docs/search.json b/docs/search.json
index b71de2f..f7bca66 100644
--- a/docs/search.json
+++ b/docs/search.json
@@ -1,436 +1,310 @@
[
- {
- "objectID": "intro-to-r.html#what-is-r",
- "href": "intro-to-r.html#what-is-r",
- "title": "R@URBAN",
- "section": "What is R?",
- "text": "What is R?\n\nSource\nR is a free, open-source software for statistical computing. It is known for intuitive, crisp graphics and an extensive, growing library of statistical and analytic methods. Above all, R boasts an enthusiastic community of developers, instructors, and users.\nThe copyright and documentation for R is held by a not-for-profit organization called The R Foundation.\n\nSource, Fair use\nRStudio is a free, open-source integrated development environment (IDE) that runs on top of R. In practice, R users almost exclusively open RStudio and rarely directly open R.\nRStudio is developed by a for-profit company called RStudio. RStudio, the company, employs some of the R community’s most prolific, open-source developers and creates many open-source tools and trainings.\nWhile R code can be written in any text editor, the RStudio IDE is a powerful tool with a console, syntax-highlighting, and debugging tools. This cheatsheet outlines the power of RStudio."
- },
- {
- "objectID": "intro-to-r.html#installation-and-updates",
- "href": "intro-to-r.html#installation-and-updates",
- "title": "R@URBAN",
- "section": "Installation and Updates",
- "text": "Installation and Updates\n\n\nWhen should you update?\nAll Urban computers should come pre-installed with R and Rstudio. However your R version may be out of date and require updating. We recommend having at least R version 3.6.0 or higher. You can check what version of R you have installed by opening Rstudio and submitting the following line of code to the console: R.Version()$version.string.\nIf you’re working on a personal computer, you may not have R or Rstudio installed. So follow this guide to install both on your computer.\n\n\nUpdating/Installing R\n\nVisit https://cran.r-project.org/bin/windows/base/. The latest R version will be the downloadable link at the top. As of 1/1/2020, that R version is 3.6.2. Click on the link at the top and download the R-x.x.x-win.exe file.\nOpen the R-x.x.x-win.exe` file. Click next, accept all the defaults, and install R. After R has been installed, click the Finish button. You should not need admin privileges for this.\nCheck that your version of R has been updated in Rstudio. If Rstudio is already open, first close it. Then open Rstudio and retype in R.Version()$version.string. You should see an updated version number printed out on the console.\nTest that R packages are loading as expected. Packages you already had installed should continue to work with newer versions of R. But in some cases, you may need to re-install the packages to work properly with new versions of R.\n\n\n\nUpdating/Installing Rstudio\n\nOpen Rstudio and go to Help > Check for Updates to see if RStudio is up-to-date\nIf it is out-of-date, download the appropriate update.\nBefore you run the installer, contact IT at helpdesk@urban.org for administrative approval as the program requires admin access.\nRun the installer and accept all defaults.\n\nMoving forward, RStudio will automatically and regularly update on Windows computers at the Urban Institute."
- },
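For reference, a minimal sketch of the version check described above, plus an optional package refresh after updating R (update.packages() is a suggestion here, not an official Urban IT step):

# check the installed version of R
R.Version()$version.string

# optionally re-install packages built under an older version of R
# (run interactively; this can take a while)
update.packages(ask = FALSE, checkBuilt = TRUE)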
- {
- "objectID": "intro-to-r.html#learning-r",
- "href": "intro-to-r.html#learning-r",
- "title": "R@URBAN",
- "section": "Learning R",
- "text": "Learning R\n\n\nWhat to Learn\nThere is often more than one way to accomplish a goal in R because of the language’s flexibility. At first, this flexibility can be overwhelming. That’s why it is useful to pick and master one set of tools in R before branching out and learning everything R.\nFortunately, Hadley Wickham’s tidyverse offers a comprehensive set of tools for data analysis that are good for both beginners and experts. The tidyverse is self-described as “an opinionated collection of R packages designed for data science.” The tidyverse consists of almost two dozen clear and concise tools for every part of an analysis workflow. At first, focus on the function read_csv() for loading data, the package dplyr for manipulating data, and the package ggplot2 for plotting.\nHere’s a quick example that reads a .csv, filters the data, and creates a publishable column plot in just fifteen lines of code:\n\n# load packages and source the Urban Institute ggplot2 theme\nlibrary(tidyverse) # contains read_csv, library(dplyr), and library(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\n# read bankdata.csv\nbank <- read_csv(\"intro-to-r/data/bankdata.csv\") \n\nbank_subset <- bank %>%\n # filter to observations of unmarried mothers less than age 30\n filter(married == \"NO\" & age < 30) %>% \n # drop all variables except children and income\n select(children, income) \n\n# plot!\nbank_subset %>%\n ggplot(mapping = aes(x = children, y = income)) +\n geom_bar(stat = \"summary\", fun.y = \"mean\") +\n scale_y_continuous(expand = c(0, 0), labels = scales::dollar) +\n labs(title = \"Mean income\",\n subtitle = \"Unmarried mothers less than age 30\",\n caption = \"Urban Institute analysis of bank data\",\n x = \"Number of children\",\n y = \"Income\")\n\n\n\n\n\n\nResources for Learning\nR for Data Science by Hadley Wickham and Garrett Grolemund is the best print resource for learning R and the tidyverse. The book is available online for free and begins with visualization which is motivating and practical. R for Data Science contains dozens of worthwhile exercises but no solutions guide. Please check your solutions against the Urban Institute r4ds solutions guide on GitHub and please contribute if the exercise isn’t already in the guide!\nRStudio publishes a number of cheat sheets that cover the tidyverse. The main cheat sheets can be accessed in RStudio at Help > Cheat Sheets. Additional cheat sheets are accessible here on the RStudio website.\nDavid Robinson, a data scientist from Data Camp, has a new video course about the tidyverse. Few people know as much about R and communicate as effectively as David Robinson.\nAdvanced R by Hadley Wickham is a good resource for new R users that have experience with other programming languages and computer science. It is available online for free.\n\n\nLibrary\nIt’s easy to feel overwhelmed by the frenetic development of the extended R universe. Books are an invaluable resource for slowing down and focusing on fully-formed ideas.\nAaron Williams (awilliams@urban.org) has a number of books that can be checked out:\n\nThe Art of R Programming\nggplot2\nEfficient R Programming (Online!)\nText Mining with R (Online!)\nReasoning with Data\nPractical Statistics for Data Scientists\n\n\n\nBuilt-in Data Sets\nR has many built-in data sets that are useful for practice and even more data sets are accessible through R packages.\nSubmitting data() shows a list of all available data sets. 
cars and iris are two classic sets that are used in many examples.\nlibrary(tidyverse) loads many more “tidy” data sets including diamonds and starwars.\n\nlibrary(tidyverse)\nstarwars %>%\n count(species) %>%\n arrange(desc(n)) %>%\n head()\n\n# A tibble: 6 × 2\n species n\n \n1 Human 35\n2 Droid 6\n3 4\n4 Gungan 3\n5 Kaminoan 2\n6 Mirialan 2\n\n\nlibrary(dslabs) by Rafael Irizarry includes varied data sets that are intentionally imperfect that are useful for practice. Students of econometrics will enjoy library(wooldridge). It loads 105 data sets from Introductory Econometrics: A Modern Approach by Jeffrey Wooldridge. Now you can practice estimating your hedonic pricing models in R!\n\nlibrary(wooldridge)\nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nas_tibble(hprice1) %>%\n ggplot(aes(x = sqrft, y = price)) +\n geom_point() +\n scale_y_continuous(expand = c(0, 0), lim = c(0, 800)) +\n labs(title = '\"hprice1\" data from Wooldridge') \n\n\n\n\n\n\nGetting Help\nEven the best R programmers spend hours each week searching the Internet for answers. Here are some of the best ways to find answers:\nSubmit ? and any function name without parentheses (ex. ?mean) to see the function documentation in RStudio.\nWhen Googling, set the search range to the last year to avoid out-of-date solutions and to focus on up-to-date practices.\nStack Overflow contains numerous solutions. Add [r] to any search to limit results to R. If a problem is particularly perplexing, it is simple to submit questions. Exercise caution when submitting questions because the Stack Overflow community has strict norms about questions and loose norms about respecting novices.\nRStudio Community is a new forum for R Users. It has a smaller back catalog than Stack Overflow but users are friendlier than on Stack Overflow.\nFinally, Aaron Williams (awilliams@urban.org) from IBP and Amy Rogin (arogin@urban.org) from METRO are available to solve problems, offer guidance, and share R enthusiasm.\n\n\nCRAN Task Views\nR has sub-communities, frameworks, and tools focused on different subject-matter and and methodological areas. CRAN Task Views is invaluable for understanding these communities and finding the best frameworks and tools for different disciplines in R.\nCRAN Task Views has 35 pages focused on subcategories of R ranging from econometrics to natural language processing. Each page is maintained by a subject-matter expert and contains methods, packages, books, and mailing lists that are useful for researchers.\nThe econometrics page alone contains detailed information on basic linear regression, microeconometrics, instrumental variables, panel data models, further regression models, time series data and models, data sets, CRAN packages, articles, books, and more."
- },
- {
- "objectID": "intro-to-r.html#r-code",
- "href": "intro-to-r.html#r-code",
- "title": "R@URBAN",
- "section": "R Code",
- "text": "R Code\n\nIt’s time to start writing R code. Remember, most R users never open R and exclusively use RStudio. Go ahead and open R once to admire its dated text editor. Then, close R and never directly open it again. Now, open RStudio.\n\nSubmitting Code\nRStudio has four main panels: code editor (top left by default), R console (bottom left by default), environment and history (top right by default), and files, plots, packages, help, and viewer pane (bottom right by default).\nThere are two main ways to submit code:\n\nType code to the right of in the R console and hit enter. Note: R won’t create a long-term record of this code.\nClick in the top left to create a new R script in the code editor panel. Type code in the script. Highlight desired code and either click run the in top right of the code editor panel or type Ctrl/command-enter to run code. Scripts can be saved, so they are the best way to write code that will be used again.\n\nFor practice, submit state.name in the R console to create a vector with all fifty state names (sorry statehood advocates, no Washington, D.C.). Next, create a script, paste state.name, highlight the text, and click run at the top right of the code editor. You should get the same output both times.\n\nstate.name\n\n [1] \"Alabama\" \"Alaska\" \"Arizona\" \"Arkansas\" \n [5] \"California\" \"Colorado\" \"Connecticut\" \"Delaware\" \n [9] \"Florida\" \"Georgia\" \"Hawaii\" \"Idaho\" \n[13] \"Illinois\" \"Indiana\" \"Iowa\" \"Kansas\" \n[17] \"Kentucky\" \"Louisiana\" \"Maine\" \"Maryland\" \n[21] \"Massachusetts\" \"Michigan\" \"Minnesota\" \"Mississippi\" \n[25] \"Missouri\" \"Montana\" \"Nebraska\" \"Nevada\" \n[29] \"New Hampshire\" \"New Jersey\" \"New Mexico\" \"New York\" \n[33] \"North Carolina\" \"North Dakota\" \"Ohio\" \"Oklahoma\" \n[37] \"Oregon\" \"Pennsylvania\" \"Rhode Island\" \"South Carolina\"\n[41] \"South Dakota\" \"Tennessee\" \"Texas\" \"Utah\" \n[45] \"Vermont\" \"Virginia\" \"Washington\" \"West Virginia\" \n[49] \"Wisconsin\" \"Wyoming\" \n\n\n\n\nSyntax\nThe are five fundamental pieces of syntax in R.\n\n<- is the assignment operator. An object created on the right side of an assignment operator is assigned to a name on the left side of an assignment operator. Assignment operators are important for saving the consequences of operations and functions. Operations without assignment operators will typically be printed to the console but not saved.\n# begins a comment. Comments are useful for explaining decisions in scripts. As Haldey Wickham notes in the Tidyverse styleguide, ’In code, use comments to explain the “why” not the “what” or “how”.\nc() combines similar vectors into larger vectors. For example, c(1, 2, 3) is a numeric vector of length three made up of three numeric vectors of length one.\n? in front of any function name without parentheses returns function documentation. For example, ?mean.\n%>% from library(magrittr) and library(tidyverse) is the “pipe operator”. It passes the output from one function to another function. This is useful because strings of operations can be “piped” together instead of each individual operation needing to be assigned to an object.\n\n\n\nVectors\nVectors are the fundamental piece of data in R. R has six vector types (you can’t mix vector types): logical, integer, double, character, complex, and raw. . You can check the type of a vector with typeof() and the length with length()\n\n\nData frames\nData frames are combinations of equal length vectors. 
Data analysis in R is built around the data frames. As a guiding principle working with data frames, you want to have “tidy data” whenever possible. A tidy data frame means that :\n\nEach variable has its own column.\nEach observation has its own row.\nEach value has its own cell.\n\n\n\n\n[Source](https://r4ds.had.co.nz/tidy-data.html)\n\n\nHaving data in a tidy format allows R’s vectorized nature to shine and many of the tidyverse functions are designed for tidy data.\n\n\nMissing values\nR stores missing values as NA. A single NA in a calculation can cause the entire result to return as NA.\n\nsum(c(2, 2, NA))\n\n[1] NA\n\n\nThe contagiousness of NA is good, it makes users explicitly acknowledge dropping missing values with na.rm = TRUE.\n\nsum(c(2, 2, NA), na.rm = TRUE)\n\n[1] 4\n\n\n== NA does not test for missing values. Instead, use is.na().\n\nis.na() and math with booleans\ncomplete.cases\n\n\n\nFunctions\nFunctions in R are collections of code that when called cause certain actions. R contains hundreds of functions and thousands of more functions can be accessed through packages.\nMost functions take arguments. For example, the function mean() has arguments x, trim, na.rm, and .... The first argument in most functions, in this case x, is an input object. Arguments can be passed to functions by name or position. mean(c(1, 2, 3)) is equivalent to mean(x = c(1, 2, 3)).\nNotice how the other three arguments were skipped. Most arguments in functions have default values. The best way to see default values is to submit the function name with a question mark, like ?mean. In this case, trim = 0, na.rm = FALSE, and no further arguments were passed through with ....\nIn the previous example, the c() function was nested inside of the mean() function. It is also possible to assign a vector of 1, 2, and 3 to a name and pass the name to the mean function.\n\napples <- c(1, 2, 3)\n\nmean(apples)\n\nR is a functional programming language. In addition to having many pre-made functions like mean(), R has powerful tools for creating and manipulating custom functions. This is useful because:\n\nIt avoids tedious and error-prone copying-and-pasting and makes iterating processes simple;\nIs a powerful way to organize sets of operations;\nIs a standardized way to save code for later and to share operations with others.\n\nThis last bullet is key to the package system in R.\n\n\nPackages\nOpening RStudio automatically loads “base R”, a fundamental collection of code and functions that handles simple operations like math and system management. R can be extended with collections of code and functions developed by the R community called packages. This sounds wild, but most packages are created and maintained by some of the best statisticians and developers in the world.\nMost packages can be installed with install.packages(\"dplyr\"), where the string between the quotation marks is the name of the package. Packages installed with install.packages() come from CRAN and must pass certain checks for performance and documentation. Popular packages on CRAN, like dplyr, have as much, if not more support, standards, and quality than code in proprietary software packages like Stata or SAS.\nIt is possible, but less common, to install packages from places like GitHub. This is less secure and the functionality of the packages is more likely to change over time. 
install.packages() need only be run once per version of package per machine and should rarely be included in .R scripts.\nPackages are loaded once per R session with the function library(). It is a good idea to include library(package-name) at the top of scripts for each package used in the script. This way it is obvious at the top of the script which packages are installed and loaded.\nNote: install.packages() uses quoted package names and library() uses unquoted package names.\nFor practice, submit the following three lines of code to install RXKCD, load library(RXKCD), and get a random XKCD comic.\n\ninstall.packages(\"RXKCD\")\nlibrary(RXKCD)\ngetXKCD(\"random\")\n\n\n\n\n\n\nPackages are frequently updated, especially around the time R versions change. The easiest way to update packages is Tools > Check for Package Updated in RStudio.\nOccasionally, two loaded packages will have functions with identical names. Any conflicts with be announced when loading packages. See how filter() and lag() from library(tidyverse) and library(stats) conflict:\n In this case, the tidyverse functions are usually favored. If there is ever a conflict or any doubt about which function is used, use the package name and :: to directly call the function. For example, dplyr::select(apples). :: can also be used to call a function without loading the entire package.\n\n\nCRAN\nThe Comprehensive R Archive Network (CRAN) contains almost 12,000 packages contributed over the last two decades by a range of developers. New packages are added to CRAN almost every day.\nCRAN enables R to have all of the benefits of open-source development and the security and predictability of proprietary statistical packages like SAS and Stata. CRAN weds the benefits of broad-based, real-time package development with certain standards for functionality and documentation. Methods and tools make it to R before SAS or Stata, if they ever make it to SAS or Stata, but have standards that generally exceed Python or other open-source languages. (See: Malicious Libraries Found on Python Package Index (PyPI))\nBecause of CRAN’s long history and R’s place in the statistics community, CRAN contains many methods that can’t be accessed, much less duplicated, using proprietary software. In addition to being useful now, this also ensures that R isn’t a temporary fad and will have staying power because of the challenge of replicating or besting CRAN.\nR’s extensible design is important, but most tasks can be accomplished with a handful of packages:\n\nggplot2 data visualization\ndplyr data management\ntidyr data tidying\nreadr data import\npurrr functional programming\ntibble data frames\nhms times\nstringr character strings\nlubridate dates/times\n\nforcats factors\nDBI databases\nhaven SPSS, SAS, and Stata files\nreadxl.xls and .xlsx\nmodelr simple modeling within a pipeline\nbroom turning models into tidy data\ntidyverse loads all of the packages listed up to this point; see Hadley Wichkham’s “tidyverse”"
- },
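The vectors and missing values sections above mention typeof(), length(), is.na(), and complete.cases() without a worked example, so here is a minimal sketch (the vectors are illustrative):

x <- c(1, 4, 9, 16)
typeof(x)         # "double"
length(x)         # 4

y <- c(2, NA, 5)
is.na(y)          # FALSE TRUE FALSE
sum(is.na(y))     # booleans count as 0/1 in math, so this counts NAs: 1
complete.cases(y) # TRUE FALSE TRUE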
- {
- "objectID": "intro-to-r.html#organizing-analyses",
- "href": "intro-to-r.html#organizing-analyses",
- "title": "R@URBAN",
- "section": "Organizing Analyses",
- "text": "Organizing Analyses\n\nThis section outlines how to organize an analysis to get the most out of R. Newer users may want to skip this section and work through R for Data Science until they understand library(readr), library(dplyr), and library(ggplot2).\n\nProjects\nOrganizing scripts, files, and data is one of the most important steps to creating a clear and reproducible analysis.\nR Projects, proper noun, are the best way to organize an analysis. They have several advantages:\n\nThey make it possible to concurrently run multiple RStudio sessions.\nThey allow for project-specific RStudio settings.\nThey integrate well with Git version control.\nThey are the “node” of relative file paths. (more on this in a second)\n\nBefore setting up an R Project, go to Tools > Global Options and uncheck “Restore most recently opened project at startup”.\n\nEvery new analysis in R should start with an R Project. First, create a directory that holds all data, scripts, and files for the analysis. Storing files and data in a sub-directories is encouraged. For example, data can be stored in a folder called data/.\nNext, click “New Project…” in the top right corner.\n\nWhen prompted, turn your recently created “Existing Directory” into a project.\n\nUpon completion, the name of the R Project should now be displayed in the top right corner of RStudio where it previously displayed “Project: (None)”. Once opened, .RProj files do not need to be saved. Double-clicking .Rproj files in the directory is now the best way to open RStudio. This will allow for the concurrent use of multiple R sessions and ensure the portability of file paths. Once an RStudio project is open, scripts can be opened by double-clicking individual files in the computer directory or clicking files in the “Files” tab in the top right of RStudio.\nR Projects make code highly portable because of the way they handle file paths. Here are a few rules:\n\nFilepaths\nNever use \\ in file paths in R. \\ is a regular expression and will complicate an analysis. Fortunately, RStudio understands / in file paths regardless of operating system.\nNever use setwd() in R. It is unnecessary, it makes code unreproducible across machines, and it is rude to collaborators. R Projects create a better framework for file paths. Simply treat the directory where the R Project lives as the working directory and directories inside of that directory as sub-directories.\nFor example, say there’s a .Rproj called starwars-analysis.Rproj in a directory called starwars-analysis. If there is a .csv in that folder called jedi.csv, the file can be loaded with read_csv(\"jedi.csv\") instead of read_csv(\"H:/ibp/analyses/starwars-analysis/diamonds.csv\"). If that file is in a sub-directory of starwars-analysis called data, it can be loaded with read_csv(\"data/jedi.csv\"). The same concepts hold for writing data and graphics.\nThis simplifies code and makes it portable because all relative filepaths will be identical on all computers. To share an analysis, simply send the entire directory to a collaborator or share it with GitHub.\nHere’s an example directory:\n\n\n\nIt isn’t always possible to avoid absolute file paths because of the many different ways the Urban Institute stores data. Avoid absolute paths when possible and be deliberate about where analyses live in relation to where data live.\nFinally, it’s good practice to include a README in the same directory as the .Rproj. 
The README should outline the purpose and the directories and can include information about how to contribute, licenses, dependencies, and acknowledgements. This GitHub page is a good README template.\nCheck out R for Data Science by Hadley Wickham and Garrett Grolemund for a more thorough explanation of this workflow. Jenny Bryan also has a good blogpost about avoiding setwd().\n\n\n\nNaming Conventions\nNaming functions, objects, variables, files, and scripts is one of the toughest and least-taught dimensions of computer programming. Better names can add clarity to code, save time and effort, and minimize errors caused by accidentally overwriting existing functions or other objects.\n\nThere are only two hard things in Computer Science: cache invalidation and naming things. ~ Phil Karlton\n\n\nFunctions and Other Objects\nR is case-sensitive.\nObjects in R can be named anything - even unicode characters. But just because something can be named anything doesn’t mean it should.\nMost functions and objects in R are lowerCamelCase, period.separated, or underscore_separated. As an individual or team, it’s important to pick a style and stick with it, but as this article from 2012 shows, there isn’t much consistency across the R community. Hadley Wickham’s tidyverse uses underscores, so expect to see some consolidation into this style.\nIn general, it’s good practice to name functions with verbs and other objects with nouns.\nVariable and object names that start with numbers, have spaces, or use peculiar syntax require back-ticks.\n\nselect(urban, `R Users Group`)\n\n\nurban$`R Users Group`)\n\nFinally, it’s possible to overwrite existing functions and other objects in R with the assignment operator. Don’t give vectors or data frames the same names as exisiting functions and don’t overwrite existing functions with custom functions.\n\n\nFiles\nNaming conventions for scripts and files is probably the most overlooked dimension in programming and analysis. The first three bullets from this section come from this rich slide deck by Jenny Bryan. This may seem pedantic, but picking a file naming convention now can save a bunch of time and headaches in the future.\n1) Machine readable\nCreate file names that are easily machine readable. Use all lower case letters and skip punctuation other than delimiters. Use underscores as characters for splitting the file name. For example, stringr::str_split_fixed(\"2018-01-10_r-introduction_machine-readable-example_01.csv\", \"[_\\\\.]\", 5) splits the file name on underscores and periods and returns date, project, file name, file number, and file type. This information can then be stored and sorted in a data frame.\n2) Human readable\nCreate file names that are human readable. The example from above is informative without any machine interpretation.\n3) Plays well with default ordering\nIt is often useful to include date or sequence numbers in script and file names. For example, include 2018-10-01 for data collected on January 10th, 2018 or include 3 for the third script a sequence of five .R programs. Starting file names with the date or sequence numbers means files will show up in a logical order by default. 
Be sure to use ISO 8601 standard for dates (YYYY-MM-DD).\n4) Don’t Use File Names for Version Control\nVersion control with file names is unwieldy and usually results in names that are barely human readable and definitely not machine readable.\n\n“2018-01-10_r-introduction_machine-readable-example_01_v2_for-aaron_after-review_before-submission.R”\n\nIterations usually don’t iterate sensibly. For example, what was “v1”, “v2” abandoned for “for-aaron”, “after-review”, “before-submission”. Furthermore, version control with file names is poor for concurrent work and merging.\nThe next section will outline the optimal tool for version control.\n\n\n\nVersion Control\nThe workflow outlined above integrates perfectly with version control like Git and distributed version control repository hosting services like GitHub.\nVersion control is a system for recording changes to files over time. Version control is built around repositories. In this case, the folder containing the .Rproj is the perfect directory to use as a repository. A handful of simple commands are used to track and commit changes to text files (.R, .Rmd, etc.) and data. This record is valuable for testing alternatives, communicating with others and your future self, and documenting progress on projects.\nGitHub is a distributed repository system built on top of Git. GitHub has a number of valuable tools for collaboration and project management. In particular, it makes concurrent collaboration on code simpler with branches and has a slick system for issues. Here are the branches and issues for the Urban Institute R Graphics Guide. It also has free web hosting for websites like the website you are reading right now. GitHub has a quick guide that is a good place to start learning Git.\nThe Urban Institute has a number of legacy models and code bases that span years and have been touched by scores of brilliant researchers. The future value of a record of all code changes and development is borderline unthinkable.\n\n\nCoding Style\n\n“Good coding style is like using correct punctuation. You can manage without it, but it sure makes things easier to read.” ~Hadley Wickham (2014)\n\ngood coding style is like using correct punctuation you can manage without it but it sure makes thing easier to read\nThe details of a coding style are less important than consistently sticking to that style. Be flexible when working with collaborators so the style doesn’t change inside an analysis.\nHere are three good sources for inspiration:\n\nTidyverse Style Guide\nGoogle’s R Style Guide\nHadley Wickham’s R Style Guide"
- },
- {
- "objectID": "intro-to-r.html#putting-it-all-together",
- "href": "intro-to-r.html#putting-it-all-together",
- "title": "R@URBAN",
- "section": "Putting it All Together",
- "text": "Putting it All Together\n\nR can augment or replace a traditional proprietary statistical packages like SAS or Stata with a few extra bells and whistles, but hopefully this guide and other resources show a fuller vision for developing reproducible, accurate, and collaborative analyses.1\nThis research pipeline, to use the phrase by Roger Peng, Jeff Leek, and Brian Caffo, combines the best of traditional economic and social policy research, computer science/software development, and statistics.2 Here are the rules:\n\n1) No steps in an analysis are done by hand and all steps are recorded with executable scripts.\nIt is common to use executable scripts to estimate a regression equation or to tabulate weighted summary statistics. But for some reason, other steps like file management, data munging, and visualization are often done “by hand”. Good science demands that every step of an analysis is recorded - and if possible - with executable scripts.\nFortunately, it is possible to script most steps in R from downloading data from the Internet and accessing APIs to visualizations and drafting manuscripts. This may be challenging at first, but it will save time and result in better research in the long run.\n\n\n2) All code is entirely reproducible and portable.\nExecutable scripts are for communicating with other researchers and our future selves. Scripts lose value if they aren’t portable and can’t be reproduced in the future or by others. Recording every step with execuatble scripts is a start, but scripts aren’t valuable if they require expensive proprietary software,or if researchers have to significantly alter scripts to run an analysis.\nOpen source software, like R, promotes accessibility, portability, and reproducibility. Also, be sure to avoid setwd() and use relative filepaths.\n\n\n3) Local and collaborative version control is used and all repositories include all code and a README.\nUse local version control like Git and a distributed version control repository hosting service like GitHub to track changes and share analyses. The version control should include all scripts and meta information about the analysis in a README.\n\n\n4) Raw data and tidy analytic data are stored in a collaborative location with a code book.\nMany raw data are already stored in collaborative locations like BLS.gov and don’t need to be duplicated. Tidy analytic data, like the data used to estimate a regression equation, should be stored in a collaborative location. This is good practice, but is less essential if executable scripts are flawless and reproducible. Researcher-entered data and data from less-stable sources should be stored in raw and analytic forms.\nSmall data sets can be stored on GitHub without issue. Larger data sets should be stored in collaborative locations accessible by scripting languages. This is only possible for public data and best-practices for private data are less established.\nSave codebooks for data sets as text files or PDFs in repositories. 
Creating codebooks for user-entered data or variables created in executable scripts is often worth the time.\n\n\n5) Code review and issue tracking are used to improve accuracy and computational efficiency.\nGetting stronger programmers and/or methodologists to review code is valuable for limiting programming and analytic mistakes, improving computational efficiency, and learning.\nGitHub issues is a powerful tool for managing, discussing, and collaborating on code.\n\n\n6) Projects rely heavily on literate statistical programming and standard means of distribution for execution, validation, and publishing.\nLiterate statistical programming is the combination of natural language explanations for humans and executable code in one document. The idea was created by Donald Knuth and is embodied by R Markdown.\nR Markdown combines text chunks, code chunks, and output chunks in one script that can be “knitted” using library(knitr) to created PDFs, books, .htmls, and websites like the website where this guide lives.\nThis workflow combines the analytic and narrative process in a tool that is flexible, scalable, reproducible, and less error-prone. R Markdown documents can be used for executing programs, validating models and analyses, and publishing. These documents can be submitted to many academic journals and shared easily with GitHub pages.\n\n\n7) Software versions and dependencies are recorded and all software is cited in publications.\nsessionInfo() reports the R version, locale, packages used, and other important information about an R session. citation() creates a text and BibTex entry of the citation for R. citation() creates a text and BibTex entry for R packages. library(packrat) (outlined here) is a tool for saving R dependencies."
- },
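A minimal sketch of rule 7 in practice (citation("knitr") is just an illustrative package choice; both functions are base R):

# record the R version, packages, and other session details
sessionInfo()

# create text and BibTeX citations for R itself and for a package
citation()
citation("knitr")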
- {
- "objectID": "intro-to-r.html#bibliography-and-references",
- "href": "intro-to-r.html#bibliography-and-references",
- "title": "R@URBAN",
- "section": "Bibliography and References",
- "text": "Bibliography and References\n\nHadley Wickham (2017). tidyverse: Easily Install and Load the ‘Tidyverse’. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\nHadley Wickham and Garrett Grolemund (2017). R For Data Science http://r4ds.had.co.nz/\nHadley Wickham (2014). Advanced R http://adv-r.had.co.nz/Style.html\nHilary S. Parker (2017. Opinionated Analysis Development https://www.rstudio.com/resources/videos/opinionated-analysis-development/\nJenny Bryan (2017).\nProject-oriented workflow https://www.tidyverse.org/articles/2017/12/workflow-vs-script/\nJenny Bryan (2015). naming things. http://www2.stat.duke.edu/~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf\nJJ Allaire, Yihui Xie, Jonathan McPherson, Javier Luraschi, Kevin Ushey, Aron Atkins, Hadley Wickham, Joe Cheng and Winston Chang (2017). rmarkdown: Dynamic Documents for R. R package version 1.8. https://CRAN.R-project.org/package=rmarkdown\nJustin M. Shea (2017). wooldridge: 105 Data Sets from “Introductory Econometrics: A Modern Approach” by Jeffrey M. Wooldridge. R package version 1.2.0. https://CRAN.R-project.org/package=wooldridge\nRoger Peng Reproducible Research Part 2 https://www.coursera.org/learn/reproducible-research/lecture/abevs/reproducible-research-concepts-and-ideas-part-2\nYihui Xie (2017). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.18.\n\nsessionInfo()\n\nR version 4.2.2 (2022-10-31)\nPlatform: aarch64-apple-darwin20 (64-bit)\nRunning under: macOS Monterey 12.5.1\n\nMatrix products: default\nBLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib\nLAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib\n\nlocale:\n[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8\n\nattached base packages:\n[1] stats graphics grDevices datasets utils methods base \n\nother attached packages:\n [1] RXKCD_1.9.2 wooldridge_1.4-2 urbnthemes_0.0.2 forcats_0.5.2 \n [5] stringr_1.4.1 dplyr_1.0.10 purrr_0.3.5 readr_2.1.3 \n [9] tidyr_1.2.1 tibble_3.1.8 ggplot2_3.4.0 tidyverse_1.3.2 \n\nloaded via a namespace (and not attached):\n [1] httr_1.4.4 bit64_4.0.5 vroom_1.6.0 \n [4] jsonlite_1.8.3 modelr_0.1.10 assertthat_0.2.1 \n [7] renv_0.16.0 googlesheets4_1.0.1 cellranger_1.1.0 \n[10] yaml_2.3.6 ggrepel_0.9.2 Rttf2pt1_1.3.11 \n[13] pillar_1.8.1 backports_1.4.1 glue_1.6.2 \n[16] extrafontdb_1.0 digest_0.6.30 rvest_1.0.3 \n[19] colorspace_2.0-3 plyr_1.8.8 htmltools_0.5.4 \n[22] pkgconfig_2.0.3 broom_1.0.1 haven_2.5.1 \n[25] scales_1.2.1 jpeg_0.1-9 tzdb_0.3.0 \n[28] timechange_0.1.1 googledrive_2.0.0 generics_0.1.3 \n[31] farver_2.1.1 ellipsis_0.3.2 withr_2.5.0 \n[34] cli_3.4.1 RJSONIO_1.3-1.6 magrittr_2.0.3 \n[37] crayon_1.5.2 readxl_1.4.1 evaluate_0.18 \n[40] fs_1.5.2 fansi_1.0.3 xml2_1.3.3 \n[43] tools_4.2.2 hms_1.1.2 gargle_1.2.1 \n[46] lifecycle_1.0.3 munsell_0.5.0 reprex_2.0.2 \n[49] compiler_4.2.2 rlang_1.0.6 grid_4.2.2 \n[52] rstudioapi_0.14 htmlwidgets_1.6.1 labeling_0.4.2 \n[55] rmarkdown_2.18 gtable_0.3.1 DBI_1.1.3 \n[58] R6_2.5.1 gridExtra_2.3 lubridate_1.9.0 \n[61] knitr_1.40 fastmap_1.1.0 bit_4.0.5 \n[64] extrafont_0.18 utf8_1.2.2 stringi_1.7.8 \n[67] parallel_4.2.2 Rcpp_1.0.9 png_0.1-7 \n[70] vctrs_0.5.1 dbplyr_2.2.1 tidyselect_1.2.0 \n[73] xfun_0.34"
- },
- {
- "objectID": "getting-data.html#librarytidycensus",
- "href": "getting-data.html#librarytidycensus",
- "title": "R@URBAN",
- "section": "library(tidycensus)",
- "text": "library(tidycensus)\nlibrary(tidycensus) by Kyle Walker (complete intro here) is the best tool for accessing some Census data sets in R from the Census Bureau API. The package returns tidy data frames and can easily pull shapefiles by adding geometry = TRUE.\nYou will need to apply for a Census API key and add it to your R session. Don’t add your API key to your script and don’t add it to a GitHub repository!\nHere is a simple example for one state with shapefiles:\n\nlibrary(tidyverse)\nlibrary(purrr)\nlibrary(tidycensus)\n\n# pull median household income and shapefiles for Census tracts in Alabama\nget_acs(geography = \"tract\", \n variables = \"B19013_001\", \n state = \"01\",\n year = 2015,\n geometry = TRUE,\n progress = FALSE)\n\nSimple feature collection with 1181 features and 5 fields (with 1 geometry empty)\nGeometry type: MULTIPOLYGON\nDimension: XY\nBounding box: xmin: -88.47323 ymin: 30.22333 xmax: -84.88908 ymax: 35.00803\nGeodetic CRS: NAD83\nFirst 10 features:\n GEOID NAME variable\n1 01003010500 Census Tract 105, Baldwin County, Alabama B19013_001\n2 01003011501 Census Tract 115.01, Baldwin County, Alabama B19013_001\n3 01009050500 Census Tract 505, Blount County, Alabama B19013_001\n4 01015981901 Census Tract 9819.01, Calhoun County, Alabama B19013_001\n5 01025957700 Census Tract 9577, Clarke County, Alabama B19013_001\n6 01025958002 Census Tract 9580.02, Clarke County, Alabama B19013_001\n7 01031011000 Census Tract 110, Coffee County, Alabama B19013_001\n8 01033020500 Census Tract 205, Colbert County, Alabama B19013_001\n9 01037961200 Census Tract 9612, Coosa County, Alabama B19013_001\n10 01039961700 Census Tract 9617, Covington County, Alabama B19013_001\n estimate moe geometry\n1 41944 8100 MULTIPOLYGON (((-87.80249 3...\n2 41417 14204 MULTIPOLYGON (((-87.71719 3...\n3 40055 8054 MULTIPOLYGON (((-86.75735 3...\n4 NA NA MULTIPOLYGON (((-86.01323 3...\n5 32708 4806 MULTIPOLYGON (((-88.1805 31...\n6 29048 14759 MULTIPOLYGON (((-87.98623 3...\n7 44732 7640 MULTIPOLYGON (((-85.92018 3...\n8 49052 6543 MULTIPOLYGON (((-87.76733 3...\n9 31957 9954 MULTIPOLYGON (((-86.46069 3...\n10 32697 6021 MULTIPOLYGON (((-86.6998 31...\n\n\nSmaller geographies like Census tracts can only be pulled state-by-state. This example demonstrates how to iterate across FIPS codes to pull Census tracts for multiple states. 
The process is as follows:\n\nPick the variables of interest\nCreate a vector of state FIPS codes for the states of interest\nCreate a custom function that works on a single state FIPS code\nIterate the function along the vector of state FIPS codes with map_df() from library(purrr)\n\nHere is an example that pulls median household income at the Census tract level for multiple states:\n\n# variables of interest\nvars <- c(\n \"B19013_001\" # median household income estimate\n)\n\n# states of interest: alabama, alaska, arizona\nstate_fips <- c(\"01\", \"02\", \"04\")\n \n# create a custom function that works for one state\nget_income <- function(state_fips) {\n \n income_data <- get_acs(geography = \"tract\", \n variables = vars, \n state = state_fips,\n year = 2015)\n \n return(income_data)\n \n}\n\n# iterate the function\nmap_df(.x = state_fips, # iterate along the vector of state fips codes\n .f = get_income) # apply get_income() to each fips_code \n\n# A tibble: 2,874 × 5\n GEOID NAME varia…¹ estim…² moe\n <chr> <chr> <chr> <dbl> <dbl>\n 1 01001020100 Census Tract 201, Autauga County, Alabama B19013… 61838 11900\n 2 01001020200 Census Tract 202, Autauga County, Alabama B19013… 32303 13538\n 3 01001020300 Census Tract 203, Autauga County, Alabama B19013… 44922 5629\n 4 01001020400 Census Tract 204, Autauga County, Alabama B19013… 54329 7003\n 5 01001020500 Census Tract 205, Autauga County, Alabama B19013… 51965 6935\n 6 01001020600 Census Tract 206, Autauga County, Alabama B19013… 63092 9585\n 7 01001020700 Census Tract 207, Autauga County, Alabama B19013… 34821 7867\n 8 01001020801 Census Tract 208.01, Autauga County, Alaba… B19013… 73728 2447\n 9 01001020802 Census Tract 208.02, Autauga County, Alaba… B19013… 60063 8602\n10 01001020900 Census Tract 209, Autauga County, Alabama B19013… 41287 7857\n# … with 2,864 more rows, and abbreviated variable names ¹variable, ²estimate\n\n\nlibrary(tidycensus) works well with library(tidyverse) and enables access to geospatial data, but it is limited to only some Census Bureau data sets. The next package has less functionality but allows for accessing any data available on the Census API."
- },
- {
- "objectID": "getting-data.html#librarycensusapi",
- "href": "getting-data.html#librarycensusapi",
- "title": "R@URBAN",
- "section": "library(censusapi)",
- "text": "library(censusapi)\nlibrary(censusapi) by Hannah Recht (complete intro here) can access any published table that is accessible through the Census Bureau API. A full listing is available here.\nYou will need to apply for a Census API key and add it to your R session. Don’t add your API key to your script and don’t add it to a GitHub repository!\nHere is a simple example that pulls median household income and its margin of error for Census tracts in Alabama:\n\nlibrary(tidyverse)\nlibrary(purrr)\nlibrary(censusapi)\nvars <- c(\n \"B19013_001E\", # median household income estimate\n \"B19013_001M\" # median household income margin of error\n)\n\ngetCensus(name = \"acs/acs5\",\n key = Sys.getenv(\"CENSUS_API_KEY\"),\n vars = vars, \n region = \"tract:*\",\n regionin = \"state:01\",\n vintage = 2015) %>%\n as_tibble()\n\n# A tibble: 1,181 × 5\n state county tract B19013_001E B19013_001M\n \n 1 01 103 005109 29644 4098\n 2 01 103 005106 35864 3443\n 3 01 103 005107 66739 5468\n 4 01 103 005108 64632 9804\n 5 01 103 005701 46306 7926\n 6 01 103 005702 47769 12939\n 7 01 105 686800 30662 7299\n 8 01 009 050102 43325 9484\n 9 01 009 050300 37548 9655\n10 01 009 050700 46452 5167\n# … with 1,171 more rows\n\n\nSmaller geographies like Census tracts can only be pulled state-by-state. This example demonstrates how to iterate across FIPS codes to pull Census tracts for multiple states. The process is as follows:\n\nPick the variables of interest\nCreate a vector of state FIPS codes for the states of interest\nCreate a custom function that works on a single state FIPS code\nIterate the function along the vector of state FIPS codes with map_df() from library(purrr)\n\nHere is an example that pulls median household income at the Census tract level for multiple states:\n\n# variables of interest\nvars <- c(\n \"B19013_001E\", # median household income estimate\n \"B19013_001M\" # median household income margin of error\n)\n\n# states of interest: alabama, alaska, arizona\nstate_fips <- c(\"01\", \"02\", \"04\")\n \n# create a custom function that works for one state\nget_income <- function(state_fips) {\n \n income_data <- getCensus(name = \"acs/acs5\", \n key = Sys.getenv(\"CENSUS_API_KEY\"),\n vars = vars, \n region = \"tract:*\",\n regionin = paste0(\"state:\", state_fips),\n vintage = 2015)\n \n return(income_data)\n \n}\n\n# iterate the function\nmap_df(.x = state_fips, # iterate along the vector of state fips codes\n .f = get_income) %>% # apply get_income() to each fips_code \n as_tibble() \n\n# A tibble: 2,874 × 5\n state county tract B19013_001E B19013_001M\n \n 1 01 103 005109 29644 4098\n 2 01 103 005106 35864 3443\n 3 01 103 005107 66739 5468\n 4 01 103 005108 64632 9804\n 5 01 103 005701 46306 7926\n 6 01 103 005702 47769 12939\n 7 01 105 686800 30662 7299\n 8 01 009 050102 43325 9484\n 9 01 009 050300 37548 9655\n10 01 009 050700 46452 5167\n# … with 2,864 more rows"
- },
- {
- "objectID": "index.html#r-users-group",
- "href": "index.html#r-users-group",
- "title": "R@URBAN",
- "section": "R Users Group",
- "text": "R Users Group\nThis website contains resources for using R at the Urban Institute for analysis, visualization, mapping, and more. Click on the links above to get started learning about R!\nThe Urban Institute R Users Group is committed to exposing researchers to the joy and power of R; developing beginner, intermediate, and advanced R skills; encouraging and supporting novel applications of R to public policy research; and building a diverse and mutually supportive community of R Users.\n\n\ngif credits: Allison Horst"
- },
- {
- "objectID": "index.html#sign-up-for-list-serv",
- "href": "index.html#sign-up-for-list-serv",
- "title": "R@URBAN",
- "section": "Sign up for List Serv!",
- "text": "Sign up for List Serv!\nPlease fill out the following form to receive email updates about upcoming RUG events and trainings. We promise not to spam your inbox:\n\n\n\n\n\n\n\n\n\n\nFill out this Smartsheet form to unsubscribe from the RUG List Serv."
- },
- {
- "objectID": "index.html#contact-info",
- "href": "index.html#contact-info",
- "title": "R@URBAN",
- "section": "Contact Info",
- "text": "Contact Info\nPlease don’t hesitate to contact Aaron Williams (awilliams@urban.org) or Amy Rogin (arogin@urban.org) with any thoughts or questions about R at the Urban Institute."
- },
- {
- "objectID": "index.html#r-lunch-labs",
- "href": "index.html#r-lunch-labs",
- "title": "R@URBAN",
- "section": "R Lunch Labs",
- "text": "R Lunch Labs\nThe Urban Institute R Users Group hosts weekly lunch labs. R Lunch Labs are hands-on trainings for R users of all skill levels and soon-to-be R users. Each meeting begins with a 5-10 minute quick tip. Afterwards, attendees break into small groups and work on a range of topics including introduction to R, data management and plotting, mapping, and machine learning. Most users bring laptops, but there are a few extras for users without laptops.\nWe have currently paused R Lunch Labs, but they will be back soon! If you have an idea for a topic you want to present informally at a lunch lab, please let us know!"
- },
- {
- "objectID": "optimization.html#learn-lapplypurrrmap",
- "href": "optimization.html#learn-lapplypurrrmap",
- "title": "R@URBAN",
- "section": "Learn lapply/purrr::map",
- "text": "Learn lapply/purrr::map\nLearning the lapply (and variants) function from Base R or the map (and variants) function from the purrr package is the first step in learning to run R code in parallel. Once you understand how lapply and map work, running your code in parallel will be simple.\nSay you have a vector of numbers and want to find the square root of each one\n(ignore for now that sqrt is vectorized, which will be covered later).\nYou could write a for loop and iterate over each element of the vector:\n\nx <- c(1, 4, 9, 16)\n\nout <- vector(\"list\", length(x))\n\nfor (i in seq_along(x)) {\n\n out[[i]] <- sqrt(x[[i]])\n\n}\n\nunlist(out)\n\n[1] 1 2 3 4\n\n\nThe lapply function essentially handles the overhead of constructing a for\nloop for you. The syntax is:\n\nlapply(X, FUN, ...)\n\nlapply will then take each element of X and apply the FUNction to it.\nOur simple example then becomes:\n\nx <- c(1, 4, 9, 16)\n\nout <- lapply(x, sqrt)\n\nunlist(out)\n\n[1] 1 2 3 4\n\n\nThose working within the tidyverse may use map from the purrr package equivalently:\n\nlibrary(purrr)\n\nx <- c(1, 4, 9, 16)\n\nout <- map(x, sqrt)\n\nunlist(out)\n\n[1] 1 2 3 4"
- },
- {
- "objectID": "optimization.html#motivating-example",
- "href": "optimization.html#motivating-example",
- "title": "R@URBAN",
- "section": "Motivating Example",
- "text": "Motivating Example\nOnce you are comfortable with lapply and/or map, running the same code in\nparallel takes just an additional line of code.\nFor lapply users, the future.apply package contains an equivalent\nfuture_lapply function. Just be sure to call plan(multiprocess) beforehand,\nwhich will handle the back-end orchestration needed to run in parallel.\n\n# install.packages(\"future.apply\")\n\nlibrary(future.apply)\n\nplan(multisession)\n\nout <- future_lapply(x, sqrt)\n\nunlist(out)\n\n[1] 1 2 3 4\n\n\nFor purrr users, the furrr (i.e., future purrr) package includes an\nequivalent future_map function:\n\n# install.packages(\"furrr\")\n\nlibrary(furrr)\n\nplan(multisession)\n\ny <- future_map(x, sqrt)\n\nunlist(y)\n\n[1] 1 2 3 4\n\n\nHow much faster did this simple example run in parallel?\n\nlibrary(future.apply)\n\nplan(multisession)\n\nx <- c(1, 4, 9, 16)\n\nmicrobenchmark::microbenchmark(\n\n sequential = lapply(x, sqrt),\n\n parallel = future_lapply(x, sqrt),\n\n unit = \"s\"\n\n)\n\nUnit: seconds\n expr min lq mean median uq\n sequential 0.000001066 0.00000123 0.00000177858 0.0000016195 0.000002132\n parallel 0.014658443 0.01518220 0.01828126942 0.0157106875 0.017450646\n max neval\n 0.000006847 100\n 0.198878864 100\n\n\nParallelization was actually slower. In this case, the overhead of\nsetting the code to run in parallel far outweighed any performance gain. In\ngeneral, parallelization works well on long-running & compute intensive jobs."
- },
- {
- "objectID": "optimization.html#a-somewhat-more-complex-example",
- "href": "optimization.html#a-somewhat-more-complex-example",
- "title": "R@URBAN",
- "section": "A (somewhat) More Complex Example",
- "text": "A (somewhat) More Complex Example\nIn this example we’ll use the diamonds dataset from ggplot2 and perform a\nkmeans cluster. We’ll use lapply to iterate the number of clusters from 2 to\n5:\n\ndf <- ggplot2::diamonds\n\ndf <- dplyr::select(df, -c(cut, color, clarity))\n\ncenters = 2:5\n\nsystem.time(\n\n lapply(centers,\n\n function(x) kmeans(df, centers = x, nstart = 500)\n\n )\n\n )\n\n user system elapsed \n 21.846 0.663 22.535 \n\n\nA now running the same code in parallel:\n\nlibrary(future.apply)\n\nplan(multisession)\n\nsystem.time(\n\n future_lapply(centers,\n\n function(x) kmeans(df, centers = x, nstart = 500)\n\n )\n\n )\n\n user system elapsed \n 0.376 0.121 11.164 \n\n\nWhile we didn’t achieve perfect scaling, we still get a nice bump in execution\ntime."
- },
- {
- "objectID": "optimization.html#additional-packages",
- "href": "optimization.html#additional-packages",
- "title": "R@URBAN",
- "section": "Additional Packages",
- "text": "Additional Packages\nFor the sake of ease and brevity, this guide focused on the futures framework\nfor parallelization. However, you should be aware that there are a number of\nother ways to parallelize your code.\n\nThe parallel Package\nThe parallel package is included in your base R installation. It includes\nanalogues of the various apply functions:\n\nparLapply\nmclapply - not available on Windows\n\nThese functions generally require more setup, especially on Windows machines.\n\n\nThe doParallel Package\nThe doParallel package builds off of parallel and is\nuseful for code that uses for loops instead of lapply. Like the parallel\npackage, it generally requires more setup, especially on Windows machines.\n\n\nMachine Learning - caret\nFor those running machine learning models, the caret package can easily\nleverage doParallel to speed up the execution of multiple models. Lifting\nthe example from the package documentation:\n\nlibrary(doParallel)\n\ncl <- makePSOCKcluster(5) # number of cores to use\n\nregisterDoParallel(cl)\n\n## All subsequent models are then run in parallel\n\nmodel <- train(y ~ ., data = training, method = \"rf\")\n\n## When you are done:\n\nstopCluster(cl)\n\nBe sure to check out the full\ndocumentation\nfor more detail."
- },
- {
- "objectID": "optimization.html#object-size",
- "href": "optimization.html#object-size",
- "title": "R@URBAN",
- "section": "Object Size",
- "text": "Object Size\nThe type of your data can have a big impact on the size of your data frame\nwhen you are dealing with larger files. There are four main types of atomic\nvectors in R:\n\nlogical\ninteger\ndouble (also called numeric)\ncharacter"
- },
- {
- "objectID": "optimization.html#each-of-these-data-types-occupies-a-different-amount-of-space-in-memory",
- "href": "optimization.html#each-of-these-data-types-occupies-a-different-amount-of-space-in-memory",
- "title": "R@URBAN",
- "section": "Each of these data types occupies a different amount of space in memory",
- "text": "Each of these data types occupies a different amount of space in memory\nlogical and integer vectors use 4 bytes per element, while a double will\noccupy 8 bytes. R uses a global string pool, so character vectors are hard\nto estimate, but will generally take up more space for element.\nConsider the following example:\n\nx <- 1:100\n\npryr::object_size(x)\n\n680 B\n\npryr::object_size(as.double(x))\n\n680 B\n\npryr::object_size(as.character(x))\n\n1.32 kB\n\n\nAn incorrect data type can easily cost you a lot of space in memory, especially\nat scale. This often happens when reading data from a text or csv file - data\nmay have a format such as c(1.0, 2.0, 3.0) and will be read in as a numeric\ncolumn, when integer is more appropriate and compact.\nYou may also be familiar with factor variables within R. Essentially a\nfactor will represent your data as integers, and map them back to their\ncharacter representation. This can save memory when you have a compact and\nunique level of factors:\n\nx <- sample(letters, 10000, replace = TRUE)\n\npryr::object_size(as.character(x))\n\n81.50 kB\n\npryr::object_size(as.factor(x))\n\n42.10 kB\n\n\nHowever if each element is unique, or if there is not a lot of overlap among\nelements, than the overhead will make a factor larger than its character\nrepresentation:\n\npryr::object_size(as.factor(letters))\n\n2.22 kB\n\npryr::object_size(as.character(letters))\n\n1.71 kB"
- },
- {
- "objectID": "optimization.html#cloud-computing",
- "href": "optimization.html#cloud-computing",
- "title": "R@URBAN",
- "section": "Cloud Computing",
- "text": "Cloud Computing\nSometimes, you will have data that are simply too large to ever fit on your\nlocal desktop machine. If that is the case, then the Elastic Cloud Computing\nEnvironment from the Office of Technology and Data Science can provide you with\neasy access to powerful analytic tools for computationally intensive project.\nThe Elastic Cloud Computing Environment allows researchers to quickly spin-up\nan Amazon Web Services (AWS) Elastic Cloud Compute (EC2) instance. These\ninstances offer increased memory to read in large datasets, along with\nadditional CPUs to provide the ability to process data in parallel at an\nimpressive scale.\nInstance | CPU | Memory (GB) |\n|———-|—–|——–|\nDesktop | 8 | 16 |\nc5.4xlarge | 16 | 32 |\nc5.9xlarge | 36 | 72 |\nc5.18xlarge | 72 | 144 |\nx1e.8xlarge | 32 | 976 |\nx1e.16xlarge | 64 | 1952 |\nFeel free to contact Erika Tyagi (etyagi@urban.org) if this would be useful\nfor your project."
- },
- {
- "objectID": "optimization.html#for-loops-and-vector-allocation",
- "href": "optimization.html#for-loops-and-vector-allocation",
- "title": "R@URBAN",
- "section": "For Loops and Vector Allocation",
- "text": "For Loops and Vector Allocation\nA refrain you will often hear is that for loops in R are slow and need to be\navoided at all costs. This is not true! Rather, an improperly constructed loop\nin R can bring the execution of your program to a near standstill.\nA common for loop structure may look something like:\n\nx <- 1:100\n\nout <- c()\n\nfor (i in x) {\n\n out <- c(out, sqrt(x))\n\n }\n\nThe bottleneck in this loop is with the allocation of the vector out. Every\ntime we iterate over an item in x and append it to out, R makes a copy\nof all the items already in out. As the size of the loop grows, your code\nwill take longer and longer to run.\nA better practice is to pre-allocate out to be the correct length, and then\ninsert the results as the loop runs.\n\nx <- 1:100\n\nout <- rep(NA, length(x))\n\nfor (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n}\n\nA quick benchmark shows how much more efficient a loop with a pre-allocated\nresults vector is:\n\nbad_loop <- function(x) {\n\n out <- c()\n\n for (i in x) {\n\n out <- c(out, sqrt(x))\n\n }\n\n}\n\ngood_loop <- function(x) {\n\n out <- rep(NA, length(x))\n\n for (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n }\n\n}\n\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(x),\n\n good_loop(x)\n\n)\n\nUnit: microseconds\n expr min lq mean median uq max neval\n bad_loop(x) 896.465 967.5590 2027.18719 1054.7250 1132.6660 55959.588 100\n good_loop(x) 4.346 4.7355 21.39134 5.8425 7.9745 1437.009 100\n\n\nAnd note how performance of the “bad” loop degrades as the loop size grows.\n\ny <- 1:250\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(y),\n\n good_loop(y)\n\n)\n\nUnit: microseconds\n expr min lq mean median uq max\n bad_loop(y) 13175.473 17043.3310 18404.84383 17790.6995 18655.1230 65857.726\n good_loop(y) 9.717 10.2705 14.36558 11.3775 16.7485 35.711\n neval\n 100\n 100"
- },
- {
- "objectID": "optimization.html#vectorized-functions",
- "href": "optimization.html#vectorized-functions",
- "title": "R@URBAN",
- "section": "Vectorized Functions",
- "text": "Vectorized Functions\nMany functions in R are vectorized, meaning they can accept an entire vector\n(and not just a single value) as input. The sqrt function from the\nprior examples is one:\n\nx <- c(1, 4, 9, 16)\n\nsqrt(x)\n\n[1] 1 2 3 4\n\n\nThis removes the need to use lapply or a for loop. Vectorized functions in\nR are generally written in a compiled language like C, C++, or FORTRAN, which\nmakes their implementation faster.\n\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n lapply(x, sqrt),\n\n sqrt(x)\n\n)\n\nUnit: nanoseconds\n expr min lq mean median uq max neval\n lapply(x, sqrt) 14801 15047 15325.39 15170 15334 21279 100\n sqrt(x) 205 246 341.53 287 369 1107 100"
- },
{
"objectID": "graphics-guide.html#bar-plots",
"href": "graphics-guide.html#bar-plots",
- "title": "R@URBAN",
+ "title": "Urban Institute R Graphics Guide",
"section": "Bar Plots",
- "text": "Bar Plots\n\n\nOne Color\n\nmtcars %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = factor(cyl), y = n)) +\n geom_col() +\n geom_text(mapping = aes(label = n), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) +\n remove_ticks() +\n remove_axis() \n\n\n\n\n\n\nOne Color (Rotated)\nThis example introduces coord_flip() and remove_axis(axis = \"x\", flip = TRUE). remove_axis() is from library(urbnthemes) and creates a custom theme for rotated bar plots.\n\nmtcars %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = factor(cyl), y = n)) +\n geom_col() +\n geom_text(mapping = aes(label = n), hjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n coord_flip() +\n remove_axis(axis = \"x\", flip = TRUE)\n\n\n\n\n\n\nThree Colors\nThis is identical to the previous plot except colors and a legend are added with fill = cyl. Turning x into a factor with factor(cyl) skips 5 and 7 on the x-axis. Adding fill = cyl without factor() would have created a continuous color scheme and legend.\n\nmtcars %>%\n mutate(cyl = factor(cyl)) %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = cyl, y = n, fill = cyl)) +\n geom_col() +\n geom_text(mapping = aes(label = n), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) +\n remove_ticks() +\n remove_axis()\n\n\n\n\n\n\nStacked Bar Plot\nAn additional aesthetic can easily be added to bar plots by adding fill = categorical variable to the mapping. Here, transmission type subsets each bar showing the count of cars with different numbers of cylinders.\n\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>% \n group_by(am) %>%\n count(cyl) %>%\n group_by(cyl) %>%\n arrange(desc(am)) %>%\n mutate(label_height = cumsum(n)) %>%\n ggplot() +\n geom_col(mapping = aes(x = cyl, y = n, fill = am)) +\n geom_text(aes(x = cyl, y = label_height - 0.5, label = n, color = am)) +\n scale_color_manual(values = c(\"white\", \"black\")) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n remove_axis() +\n guides(color = \"none\")\n\n\n\n\n\n\nStacked Bar Plot With Position = Fill\nThe previous examples used geom_col(), which takes a y value for bar height. This example uses geom_bar() which sums the values and generates a value for bar heights. In this example, position = \"fill\" in geom_bar() changes the y-axis from count to the proportion of each bar.\n\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>% \n ggplot() +\n geom_bar(mapping = aes(x = cyl, fill = am), position = \"fill\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1)), labels = scales::percent) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n guides(color = \"none\")\n\n\n\n\n\n\nDodged Bar Plot\nSubsetted bar charts in ggplot2 are stacked by default. 
position = \"dodge\" in geom_col() expands the bar chart so the bars appear next to each other.\n\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>%\n group_by(am) %>%\n count(cyl) %>%\n ggplot(mapping = aes(cyl, y = n, fill = factor(am))) +\n geom_col(position = \"dodge\") +\n geom_text(aes(label = n), position = position_dodge(width = 0.7), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n remove_axis()\n\n\n\n\n\n\nLollipop plot/Cleveland dot plot\nLollipop plots and Cleveland dot plots are minimalist alternatives to bar plots. The key to both plots is to order the data based on the continuous variable using arrange() and then turn the discrete variable into a factor with the ordered levels of the continuous variable using mutate(). This step “stores” the order of the data.\n\nLollipop plot\n\nmtcars %>%\n rownames_to_column(\"model\") %>%\n arrange(mpg) %>%\n mutate(model = factor(model, levels = .$model)) %>%\n ggplot(aes(mpg, model)) +\n geom_segment(aes(x = 0, xend = mpg, y = model, yend = model)) + \n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0, 0)), limits = c(0, 40)) +\n labs(x = NULL, \n y = \"Miles Per Gallon\")\n\n\n\n\n\n\nCleveland dot plot\n\nmtcars %>%\n rownames_to_column(\"model\") %>%\n arrange(mpg) %>%\n mutate(model = factor(model, levels = .$model)) %>%\n ggplot(aes(mpg, model)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0, 0)), limits = c(0, 40)) +\n labs(x = NULL, \n y = \"Miles Per Gallon\")\n\n\n\n\n\n\n\nDumbell plot"
+ "text": "Bar Plots\n\n\nOne Color\n\nmtcars %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = factor(cyl), y = n)) +\n geom_col() +\n geom_text(mapping = aes(label = n), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) +\n remove_ticks() +\n remove_axis() \n\n\n\n\n\n\nOne Color (Rotated)\nThis example introduces coord_flip() and remove_axis(axis = \"x\", flip = TRUE). remove_axis() is from library(urbnthemes) and creates a custom theme for rotated bar plots.\n\nmtcars %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = factor(cyl), y = n)) +\n geom_col() +\n geom_text(mapping = aes(label = n), hjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n coord_flip() +\n remove_axis(axis = \"x\", flip = TRUE)\n\n\n\n\n\n\nThree Colors\nThis is identical to the previous plot except colors and a legend are added with fill = cyl. Turning x into a factor with factor(cyl) skips 5 and 7 on the x-axis. Adding fill = cyl without factor() would have created a continuous color scheme and legend.\n\nmtcars %>%\n mutate(cyl = factor(cyl)) %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = cyl, y = n, fill = cyl)) +\n geom_col() +\n geom_text(mapping = aes(label = n), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) +\n remove_ticks() +\n remove_axis()\n\n\n\n\n\n\nStacked Bar Plot\nAn additional aesthetic can easily be added to bar plots by adding fill = categorical variable to the mapping. Here, transmission type subsets each bar showing the count of cars with different numbers of cylinders.\n\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>% \n group_by(am) %>%\n count(cyl) %>%\n group_by(cyl) %>%\n arrange(desc(am)) %>%\n mutate(label_height = cumsum(n)) %>%\n ggplot() +\n geom_col(mapping = aes(x = cyl, y = n, fill = am)) +\n geom_text(aes(x = cyl, y = label_height - 0.5, label = n, color = am)) +\n scale_color_manual(values = c(\"white\", \"black\")) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n remove_axis() +\n guides(color = \"none\")\n\n\n\n\n\n\nStacked Bar Plot With Position = Fill\nThe previous examples used geom_col(), which takes a y value for bar height. This example uses geom_bar() which sums the values and generates a value for bar heights. In this example, position = \"fill\" in geom_bar() changes the y-axis from count to the proportion of each bar.\n\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>% \n ggplot() +\n geom_bar(mapping = aes(x = cyl, fill = am), position = \"fill\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1)), labels = scales::percent) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n guides(color = \"none\")\n\n\n\n\n\n\nDodged Bar Plot\nSubsetted bar charts in ggplot2 are stacked by default. 
position = \"dodge\" in geom_col() expands the bar chart so the bars appear next to each other.\n\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>%\n group_by(am) %>%\n count(cyl) %>%\n ggplot(mapping = aes(cyl, y = n, fill = factor(am))) +\n geom_col(position = \"dodge\") +\n geom_text(aes(label = n), position = position_dodge(width = 0.7), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n remove_axis()\n\n\n\n\n\n\nLollipop plot/Cleveland dot plot\nLollipop plots and Cleveland dot plots are minimalist alternatives to bar plots. The key to both plots is to order the data based on the continuous variable using arrange() and then turn the discrete variable into a factor with the ordered levels of the continuous variable using mutate(). This step “stores” the order of the data.\n\nLollipop plot\n\nmtcars %>%\n rownames_to_column(\"model\") %>%\n arrange(mpg) %>%\n mutate(model = factor(model, levels = .$model)) %>%\n ggplot(aes(mpg, model)) +\n geom_segment(aes(x = 0, xend = mpg, y = model, yend = model)) + \n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0, 0)), limits = c(0, 40)) +\n labs(x = NULL, \n y = \"Miles Per Gallon\")\n\n\n\n\n\n\nCleveland dot plot\n\nmtcars %>%\n rownames_to_column(\"model\") %>%\n arrange(mpg) %>%\n mutate(model = factor(model, levels = .$model)) %>%\n ggplot(aes(mpg, model)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0, 0)), limits = c(0, 40)) +\n labs(x = NULL, \n y = \"Miles Per Gallon\")\n\n\n\n\n\n\n\nDumbell plot"
},
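+ A sketch for the \"Dumbbell plot\" heading that closes the entry above: dumbbell plots connect paired values with a segment so change between two periods is easy to compare across categories. One common approach uses geom_segment() plus two geom_point() layers; the data below are invented for illustration and are not from the guide.
+
+ library(tidyverse)
+
+ # made-up homeownership rates at two time points (illustrative only)
+ rates <- tribble(
+   ~state,             ~rate_2010, ~rate_2020,
+   "Maryland",               0.67,       0.71,
+   "Virginia",               0.68,       0.66,
+   "Washington, D.C.",       0.42,       0.45
+ )
+
+ rates %>%
+   ggplot() +
+   # the segment spans the change between the two periods
+   geom_segment(aes(x = rate_2010, xend = rate_2020, y = state, yend = state)) +
+   # one point per period; mapping color to a constant string builds the legend
+   geom_point(aes(x = rate_2010, y = state, color = "2010")) +
+   geom_point(aes(x = rate_2020, y = state, color = "2020")) +
+   scale_x_continuous(labels = scales::percent) +
+   labs(x = "Homeownership rate", y = NULL, color = NULL)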
{
"objectID": "graphics-guide.html#scatter-plots",
"href": "graphics-guide.html#scatter-plots",
- "title": "R@URBAN",
+ "title": "Urban Institute R Graphics Guide",
"section": "Scatter Plots",
- "text": "Scatter Plots\n\n\nOne Color Scatter Plot\nScatter plots are useful for showing relationships between two or more variables. Use scatter_grid() from library(urbnthemes) to easily add vertical grid lines for scatter plots.\n\nmtcars %>%\n ggplot(mapping = aes(x = wt, y = mpg)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:8 * 5) +\n labs(x = \"Weight (thousands of pounds)\",\n y = \"City MPG\") +\n scatter_grid()\n\n\n\n\n\n\nHigh-Density Scatter Plot with Transparency\nLarge numbers of observations can sometimes make scatter plots tough to interpret because points overlap. Adding alpha = with a number between 0 and 1 adds transparency to points and clarity to plots. Now it’s easy to see that jewelry stores are probably rounding up but not rounding down carats!\n\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 20000),\n breaks = 0:4 * 5000,\n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n\n\n\n\n\n\nHex Scatter Plot\nSometimes transparency isn’t enough to bring clarity to a scatter plot with many observations. As n increases into the hundreds of thousands and even millions, geom_hex can be one of the best ways to display relationships between two variables.\n\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_hex(mapping = aes(fill = after_stat(count))) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 20000),\n breaks = 0:4 * 5000,\n labels = scales::dollar) +\n scale_fill_gradientn(labels = scales::comma) + \n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid() +\n theme(legend.position = \"right\",\n legend.direction = \"vertical\")\n\n\n\n\n\n\nScatter Plots With Random Noise\nSometimes scatter plots have many overlapping points but a reasonable number of observations. geom_jitter adds a small amount of random noise so points are less likely to overlap. width and height control the amount of noise that is added. In the following before-and-after, notice how many more points are visible after adding jitter.\n\nBefore\n\nmpg %>%\n ggplot(mapping = aes(x = displ, y = cty)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 8),\n breaks = 0:8) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:4 * 10) +\n labs(x = \"Displacement\",\n y = \"City MPG\") +\n scatter_grid()\n\n\n\n\n\n\nAfter\n\nset.seed(2017)\nmpg %>%\n ggplot(mapping = aes(x = displ, y = cty)) +\n geom_jitter() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 8),\n breaks = 0:8) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:4 * 10) +\n labs(x = \"Displacement\",\n y = \"City MPG\") +\n scatter_grid()\n\n\n\n\n\n\n\nScatter Plots with Varying Point Size\nWeights and populations can be mapped in scatter plots to the size of the points. Here, the number of households in each state is mapped to the size of each point using aes(size = hhpop). 
Note: ggplot2::geom_point() is used instead of geom_point().\n\nurbnmapr::statedata %>%\n ggplot(mapping = aes(x = medhhincome, y = horate)) +\n ggplot2::geom_point(mapping = aes(size = hhpop), alpha = 0.3) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(30000, 80000),\n breaks = 3:8 * 10000,\n labels = scales::dollar) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 0.8),\n breaks = 0:4 * 0.2) +\n scale_radius(range = c(3, 15),\n breaks = c(2500000, 7500000, 12500000), \n labels = scales::comma) +\n labs(x = \"Household income\",\n y = \"Homeownership rate\") +\n scatter_grid() +\n theme(plot.margin = margin(r = 20))\n\n\n\n\n\n\nScatter Plots with Fill\nA third aesthetic can be added to scatter plots. Here, color signifies the number of cylinders in each car. Before ggplot() is called, Cylinders is created using library(dplyr) and the piping operator %>%.\n\nmtcars %>%\n mutate(cyl = paste(cyl, \"cylinders\")) %>%\n ggplot(aes(x = wt, y = mpg, color = cyl)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:8 * 5) +\n labs(x = \"Weight (thousands of pounds)\",\n y = \"City MPG\") +\n scatter_grid()"
+ "text": "Scatter Plots\n\n\nOne Color Scatter Plot\nScatter plots are useful for showing relationships between two or more variables. Use scatter_grid() from library(urbnthemes) to easily add vertical grid lines for scatter plots.\n\nmtcars %>%\n ggplot(mapping = aes(x = wt, y = mpg)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:8 * 5) +\n labs(x = \"Weight (thousands of pounds)\",\n y = \"City MPG\") +\n scatter_grid()\n\n\n\n\n\n\nHigh-Density Scatter Plot with Transparency\nLarge numbers of observations can sometimes make scatter plots tough to interpret because points overlap. Adding alpha = with a number between 0 and 1 adds transparency to points and clarity to plots. Now it’s easy to see that jewelry stores are probably rounding up but not rounding down carats!\n\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 20000),\n breaks = 0:4 * 5000,\n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n\n\n\n\n\n\nHex Scatter Plot\nSometimes transparency isn’t enough to bring clarity to a scatter plot with many observations. As n increases into the hundreds of thousands and even millions, geom_hex can be one of the best ways to display relationships between two variables.\n\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_hex(mapping = aes(fill = after_stat(count))) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 20000),\n breaks = 0:4 * 5000,\n labels = scales::dollar) +\n scale_fill_gradientn(labels = scales::comma) + \n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid() +\n theme(legend.position = \"right\",\n legend.direction = \"vertical\")\n\n\n\n\n\n\nScatter Plots With Random Noise\nSometimes scatter plots have many overlapping points but a reasonable number of observations. geom_jitter adds a small amount of random noise so points are less likely to overlap. width and height control the amount of noise that is added. In the following before-and-after, notice how many more points are visible after adding jitter.\n\nBefore\n\nmpg %>%\n ggplot(mapping = aes(x = displ, y = cty)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 8),\n breaks = 0:8) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:4 * 10) +\n labs(x = \"Displacement\",\n y = \"City MPG\") +\n scatter_grid()\n\n\n\n\n\n\nAfter\n\nset.seed(2017)\nmpg %>%\n ggplot(mapping = aes(x = displ, y = cty)) +\n geom_jitter() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 8),\n breaks = 0:8) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:4 * 10) +\n labs(x = \"Displacement\",\n y = \"City MPG\") +\n scatter_grid()\n\n\n\n\n\n\n\nScatter Plots with Varying Point Size\nWeights and populations can be mapped in scatter plots to the size of the points. Here, the number of households in each state is mapped to the size of each point using aes(size = hhpop). 
Note: ggplot2::geom_point() is used instead of geom_point().\n\nurbnmapr::statedata %>%\n ggplot(mapping = aes(x = medhhincome, y = horate)) +\n ggplot2::geom_point(mapping = aes(size = hhpop), alpha = 0.3) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(30000, 80000),\n breaks = 3:8 * 10000,\n labels = scales::dollar) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 0.8),\n breaks = 0:4 * 0.2) +\n scale_radius(range = c(3, 15),\n breaks = c(2500000, 7500000, 12500000), \n labels = scales::comma) +\n labs(x = \"Household income\",\n y = \"Homeownership rate\") +\n scatter_grid() +\n theme(plot.margin = margin(r = 20))\n\n\n\n\n\n\nScatter Plots with Fill\nA third aesthetic can be added to scatter plots. Here, color signifies the number of cylinders in each car. Before ggplot() is called, Cylinders is created using library(dplyr) and the piping operator %>%.\n\nmtcars %>%\n mutate(cyl = paste(cyl, \"cylinders\")) %>%\n ggplot(aes(x = wt, y = mpg, color = cyl)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:8 * 5) +\n labs(x = \"Weight (thousands of pounds)\",\n y = \"City MPG\") +\n scatter_grid()"
},
{
"objectID": "graphics-guide.html#line-plots",
"href": "graphics-guide.html#line-plots",
- "title": "R@URBAN",
+ "title": "Urban Institute R Graphics Guide",
"section": "Line Plots",
- "text": "Line Plots\n\n\neconomics %>%\n ggplot(mapping = aes(x = date, y = unemploy)) +\n geom_line() +\n scale_x_date(expand = expansion(mult = c(0.002, 0)), \n breaks = \"10 years\",\n limits = c(as.Date(\"1961-01-01\"), as.Date(\"2020-01-01\")),\n date_labels = \"%Y\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 4000,\n limits = c(0, 16000),\n labels = scales::comma) +\n labs(x = \"Year\", \n y = \"Number Unemployed (1,000s)\")\n\n\n\n\n\nLines Plots With Multiple Lines\n\nlibrary(gapminder)\n\ngapminder %>%\n filter(country %in% c(\"Australia\", \"Canada\", \"New Zealand\")) %>%\n mutate(country = factor(country, levels = c(\"Canada\", \"Australia\", \"New Zealand\"))) %>%\n ggplot(aes(year, gdpPercap, color = country)) +\n geom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n breaks = c(1952 + 0:12 * 5), \n limits = c(1952, 2007)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:8 * 5000,\n labels = scales::dollar, \n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Per capita GDP (US dollars)\")\n\n\n\n\nPlotting more than one variable can be useful for seeing the relationship of variables over time, but it takes a small amount of data munging.\nThis is because ggplot2 wants data in a “long” format instead of a “wide” format for line plots with multiple lines. gather() and spread() from the tidyr package make switching back-and-forth between “long” and “wide” painless. Essentially, variable titles go into “key” and variable values go into “value”. Then ggplot2, turns the different levels of the key variable (population, unemployment) into colors.\n\nas_tibble(EuStockMarkets) %>%\n mutate(date = time(EuStockMarkets)) %>%\n gather(key = \"key\", value = \"value\", -date) %>%\n ggplot(mapping = aes(x = date, y = value, color = key)) +\n geom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(1991, 1999), \n breaks = c(1991, 1993, 1995, 1997, 1999)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 2500,\n labels = scales::dollar, \n limits = c(0, 10000)) + \n labs(x = \"Date\",\n y = \"Value\")\n\n\n\n\n\n\nStep plot\ngeom_line() connects coordinates with the shortest possible straight line. Sometimes step plots are necessary because y values don’t change between coordinates. For example, the upper-bound of the Federal Funds Rate is set at regular intervals and remains constant until it is changed.\n\n# downloaded from FRED on 2018-12-06\n\n# https://fred.stlouisfed.org/series/DFEDTARU\n\nfed_fund_rate <- read_csv(\n \"date, fed_funds_rate\n 2014-01-01,0.0025\n 2015-12-16,0.0050\n 2016-12-14,0.0075\n 2017-03-16,0.0100\n 2017-06-15,0.0125\n 2017-12-14,0.0150\n 2018-03-22,0.0175\n 2018-06-14,0.0200\n 2018-09-27,0.0225\n 2018-12-06,0.0225\")\n\nfed_fund_rate %>%\n ggplot(mapping = aes(x = date, y = fed_funds_rate)) + \n geom_step() +\n scale_x_date(expand = expansion(mult = c(0.002, 0)), \n breaks = \"1 year\",\n limits = c(as.Date(\"2014-01-01\"), as.Date(\"2019-01-01\")),\n date_labels = \"%Y\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = c(0, 0.01, 0.02, 0.03),\n limits = c(0, 0.03),\n labels = scales::percent) + \n labs(x = \"Date\",\n y = \"Upper-bound of the Federal Funds Rate\")\n\n\n\n\n\n\nPath plot\nThe Beveridge curve is a macroeconomic plot that displays a relationship between the unemployment rate and the vacancy rate. 
Movements along the curve indicate changes in the business cyle and horizontal shifts of the curve suggest structural changes in the labor market.\nLines in Beveridge curves do not monotonically move from left to right. Therefore, it is necessary to use geom_path().\n\n# seasonally-adjusted, quarterly vacancy rate - JOLTS # seasonally-adjusted, quarterly unemployment rate - CPS\n\n# pulled from FRED on April 11, 2018. \n\nlibrary(ggrepel)\n\nbeveridge <- read_csv(\n \"quarter, vacanacy_rate, unempoyment_rate\n 2006-01-01,0.0310,0.0473\n 2006-04-01,0.0316,0.0463\n 2006-07-01,0.0313,0.0463\n 2006-10-01,0.0310,0.0443\n 2007-01-01,0.0323,0.0450\n 2007-04-01,0.0326,0.0450\n 2007-07-01,0.0316,0.0466\n 2007-10-01,0.0293,0.0480\n 2008-01-01,0.0286,0.0500\n 2008-04-01,0.0280,0.0533\n 2008-07-01,0.0253,0.0600\n 2008-10-01,0.0220,0.0686\n 2009-01-01,0.0196,0.0826\n 2009-04-01,0.0180,0.0930\n 2009-07-01,0.0176,0.0963\n 2009-10-01,0.0180,0.0993\n 2010-01-01,0.0196,0.0983\n 2010-04-01,0.0220,0.0963\n 2010-07-01,0.0216,0.0946\n 2010-10-01,0.0220,0.0950\n 2011-01-01,0.0226,0.0903\n 2011-04-01,0.0236,0.0906\n 2011-07-01,0.0250,0.0900\n 2011-10-01,0.0243,0.0863\n 2012-01-01,0.0270,0.0826\n 2012-04-01,0.0270,0.0820\n 2012-07-01,0.0266,0.0803\n 2012-10-01,0.0260,0.0780\n 2013-01-01,0.0276,0.0773\n 2013-04-01,0.0280,0.0753\n 2013-07-01,0.0280,0.0723\n 2013-10-01,0.0276,0.0693\n 2014-01-01,0.0290,0.0666\n 2014-04-01,0.0323,0.0623\n 2014-07-01,0.0326,0.0610\n 2014-10-01,0.0330,0.0570\n 2015-01-01,0.0350,0.0556\n 2015-04-01,0.0366,0.0540\n 2015-07-01,0.0373,0.0510\n 2015-10-01,0.0360,0.0500\n 2016-01-01,0.0386,0.0493\n 2016-04-01,0.0383,0.0486\n 2016-07-01,0.0383,0.0493\n 2016-10-01,0.0363,0.0473\n 2017-01-01,0.0366,0.0466\n 2017-04-01,0.0390,0.0433\n 2017-07-01,0.0406,0.0430\n 2017-10-01,0.0386,0.0410\")\n\nlabels <- beveridge %>%\n filter(lubridate::month(quarter) == 1)\n\nbeveridge %>%\n ggplot() +\n geom_path(mapping = aes(x = unempoyment_rate, y = vacanacy_rate), alpha = 0.5) +\n geom_point(data = labels, mapping = aes(x = unempoyment_rate, y = vacanacy_rate)) +\n geom_text_repel(data = labels, mapping = aes(x = unempoyment_rate, y = vacanacy_rate, label = lubridate::year(quarter))) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0.04, 0.1),\n labels = scales::percent) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = c(0, 0.01, 0.02, 0.03, 0.04, 0.05),\n limits = c(0, 0.05),\n labels = scales::percent) + \n labs(x = \"Seasonally-adjusted unemployment rate\",\n y = \"Seasonally-adjusted vacancy rate\") + \n scatter_grid()\n\n\n\n\n\n\nSlope plots\n\n# https://www.bls.gov/lau/\nlibrary(ggrepel)\n\nunemployment <- tibble(\n time = c(\"October 2009\", \"October 2009\", \"October 2009\", \"August 2017\", \"August 2017\", \"August 2017\"),\n rate = c(7.4, 7.1, 10.0, 3.9, 3.8, 6.4),\n state = c(\"Maryland\", \"Virginia\", \"Washington, D.C.\", \"Maryland\", \"Virginia\", \"Washington, D.C.\")\n)\n\nlabel <- tibble(label = c(\"October 2009\", \"August 2017\"))\noctober <- filter(unemployment, time == \"October 2009\")\naugust <- filter(unemployment, time == \"August 2017\")\n\nunemployment %>%\n mutate(time = factor(time, levels = c(\"October 2009\", \"August 2017\")),\n state = factor(state, levels = c(\"Washington, D.C.\", \"Maryland\", \"Virginia\"))) %>%\n ggplot() + \n geom_line(aes(time, rate, group = state, color = state), show.legend = FALSE) +\n geom_point(aes(x = time, y = rate, color = state)) +\n labs(subtitle = \"Unemployment Rate\") +\n 
theme(axis.ticks.x = element_blank(),\n axis.title.x = element_blank(),\n axis.ticks.y = element_blank(),\n axis.title.y = element_blank(), \n axis.text.y = element_blank(),\n panel.grid.major.y = element_blank(),\n panel.grid.minor.y = element_blank(),\n panel.grid.major.x = element_blank(),\n axis.line = element_blank()) +\n geom_text_repel(data = october, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = -0.06) + \n geom_text_repel(data = august, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = 0.06)"
+ "text": "Line Plots\n\n\neconomics %>%\n ggplot(mapping = aes(x = date, y = unemploy)) +\n geom_line() +\n scale_x_date(expand = expansion(mult = c(0.002, 0)), \n breaks = \"10 years\",\n limits = c(as.Date(\"1961-01-01\"), as.Date(\"2020-01-01\")),\n date_labels = \"%Y\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 4000,\n limits = c(0, 16000),\n labels = scales::comma) +\n labs(x = \"Year\", \n y = \"Number Unemployed (1,000s)\")\n\n\n\n\n\nLines Plots With Multiple Lines\n\nlibrary(gapminder)\n\ngapminder %>%\n filter(country %in% c(\"Australia\", \"Canada\", \"New Zealand\")) %>%\n mutate(country = factor(country, levels = c(\"Canada\", \"Australia\", \"New Zealand\"))) %>%\n ggplot(aes(year, gdpPercap, color = country)) +\n geom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n breaks = c(1952 + 0:12 * 5), \n limits = c(1952, 2007)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:8 * 5000,\n labels = scales::dollar, \n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Per capita GDP (US dollars)\")\n\n\n\n\nPlotting more than one variable can be useful for seeing the relationship of variables over time, but it takes a small amount of data munging.\nThis is because ggplot2 wants data in a “long” format instead of a “wide” format for line plots with multiple lines. gather() and spread() from the tidyr package make switching back-and-forth between “long” and “wide” painless. Essentially, variable titles go into “key” and variable values go into “value”. Then ggplot2, turns the different levels of the key variable (population, unemployment) into colors.\n\nas_tibble(EuStockMarkets) %>%\n mutate(date = time(EuStockMarkets)) %>%\n gather(key = \"key\", value = \"value\", -date) %>%\n ggplot(mapping = aes(x = date, y = value, color = key)) +\n geom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(1991, 1999), \n breaks = c(1991, 1993, 1995, 1997, 1999)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 2500,\n labels = scales::dollar, \n limits = c(0, 10000)) + \n labs(x = \"Date\",\n y = \"Value\")\n\n\n\n\n\n\nStep plot\ngeom_line() connects coordinates with the shortest possible straight line. Sometimes step plots are necessary because y values don’t change between coordinates. For example, the upper-bound of the Federal Funds Rate is set at regular intervals and remains constant until it is changed.\n\n# downloaded from FRED on 2018-12-06\n\n# https://fred.stlouisfed.org/series/DFEDTARU\n\nfed_fund_rate <- read_csv(\n \"date, fed_funds_rate\n 2014-01-01,0.0025\n 2015-12-16,0.0050\n 2016-12-14,0.0075\n 2017-03-16,0.0100\n 2017-06-15,0.0125\n 2017-12-14,0.0150\n 2018-03-22,0.0175\n 2018-06-14,0.0200\n 2018-09-27,0.0225\n 2018-12-06,0.0225\")\n\nfed_fund_rate %>%\n ggplot(mapping = aes(x = date, y = fed_funds_rate)) + \n geom_step() +\n scale_x_date(expand = expansion(mult = c(0.002, 0)), \n breaks = \"1 year\",\n limits = c(as.Date(\"2014-01-01\"), as.Date(\"2019-01-01\")),\n date_labels = \"%Y\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = c(0, 0.01, 0.02, 0.03),\n limits = c(0, 0.03),\n labels = scales::percent) + \n labs(x = \"Date\",\n y = \"Upper-bound of the Federal Funds Rate\")\n\n\n\n\n\n\nPath plot\nThe Beveridge curve is a macroeconomic plot that displays a relationship between the unemployment rate and the vacancy rate. 
Movements along the curve indicate changes in the business cycle and horizontal shifts of the curve suggest structural changes in the labor market.\nLines in Beveridge curves do not monotonically move from left to right. Therefore, it is necessary to use geom_path().\n\n# seasonally-adjusted, quarterly vacancy rate - JOLTS # seasonally-adjusted, quarterly unemployment rate - CPS\n\n# pulled from FRED on April 11, 2018. \n\nlibrary(ggrepel)\n\nbeveridge <- read_csv(\n \"quarter, vacancy_rate, unemployment_rate\n 2006-01-01,0.0310,0.0473\n 2006-04-01,0.0316,0.0463\n 2006-07-01,0.0313,0.0463\n 2006-10-01,0.0310,0.0443\n 2007-01-01,0.0323,0.0450\n 2007-04-01,0.0326,0.0450\n 2007-07-01,0.0316,0.0466\n 2007-10-01,0.0293,0.0480\n 2008-01-01,0.0286,0.0500\n 2008-04-01,0.0280,0.0533\n 2008-07-01,0.0253,0.0600\n 2008-10-01,0.0220,0.0686\n 2009-01-01,0.0196,0.0826\n 2009-04-01,0.0180,0.0930\n 2009-07-01,0.0176,0.0963\n 2009-10-01,0.0180,0.0993\n 2010-01-01,0.0196,0.0983\n 2010-04-01,0.0220,0.0963\n 2010-07-01,0.0216,0.0946\n 2010-10-01,0.0220,0.0950\n 2011-01-01,0.0226,0.0903\n 2011-04-01,0.0236,0.0906\n 2011-07-01,0.0250,0.0900\n 2011-10-01,0.0243,0.0863\n 2012-01-01,0.0270,0.0826\n 2012-04-01,0.0270,0.0820\n 2012-07-01,0.0266,0.0803\n 2012-10-01,0.0260,0.0780\n 2013-01-01,0.0276,0.0773\n 2013-04-01,0.0280,0.0753\n 2013-07-01,0.0280,0.0723\n 2013-10-01,0.0276,0.0693\n 2014-01-01,0.0290,0.0666\n 2014-04-01,0.0323,0.0623\n 2014-07-01,0.0326,0.0610\n 2014-10-01,0.0330,0.0570\n 2015-01-01,0.0350,0.0556\n 2015-04-01,0.0366,0.0540\n 2015-07-01,0.0373,0.0510\n 2015-10-01,0.0360,0.0500\n 2016-01-01,0.0386,0.0493\n 2016-04-01,0.0383,0.0486\n 2016-07-01,0.0383,0.0493\n 2016-10-01,0.0363,0.0473\n 2017-01-01,0.0366,0.0466\n 2017-04-01,0.0390,0.0433\n 2017-07-01,0.0406,0.0430\n 2017-10-01,0.0386,0.0410\")\n\nlabels <- beveridge %>%\n filter(lubridate::month(quarter) == 1)\n\nbeveridge %>%\n ggplot() +\n geom_path(mapping = aes(x = unemployment_rate, y = vacancy_rate), alpha = 0.5) +\n geom_point(data = labels, mapping = aes(x = unemployment_rate, y = vacancy_rate)) +\n geom_text_repel(data = labels, mapping = aes(x = unemployment_rate, y = vacancy_rate, label = lubridate::year(quarter))) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0.04, 0.1),\n labels = scales::percent) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = c(0, 0.01, 0.02, 0.03, 0.04, 0.05),\n limits = c(0, 0.05),\n labels = scales::percent) + \n labs(x = \"Seasonally-adjusted unemployment rate\",\n y = \"Seasonally-adjusted vacancy rate\") + \n scatter_grid()\n\n\n\n\n\n\nSlope plots\n\n# https://www.bls.gov/lau/\nlibrary(ggrepel)\n\nunemployment <- tibble(\n time = c(\"October 2009\", \"October 2009\", \"October 2009\", \"August 2017\", \"August 2017\", \"August 2017\"),\n rate = c(7.4, 7.1, 10.0, 3.9, 3.8, 6.4),\n state = c(\"Maryland\", \"Virginia\", \"Washington, D.C.\", \"Maryland\", \"Virginia\", \"Washington, D.C.\")\n)\n\nlabel <- tibble(label = c(\"October 2009\", \"August 2017\"))\noctober <- filter(unemployment, time == \"October 2009\")\naugust <- filter(unemployment, time == \"August 2017\")\n\nunemployment %>%\n mutate(time = factor(time, levels = c(\"October 2009\", \"August 2017\")),\n state = factor(state, levels = c(\"Washington, D.C.\", \"Maryland\", \"Virginia\"))) %>%\n ggplot() + \n geom_line(aes(time, rate, group = state, color = state), show.legend = FALSE) +\n geom_point(aes(x = time, y = rate, color = state)) +\n labs(subtitle = \"Unemployment Rate\") +\n 
theme(axis.ticks.x = element_blank(),\n axis.title.x = element_blank(),\n axis.ticks.y = element_blank(),\n axis.title.y = element_blank(), \n axis.text.y = element_blank(),\n panel.grid.major.y = element_blank(),\n panel.grid.minor.y = element_blank(),\n panel.grid.major.x = element_blank(),\n axis.line = element_blank()) +\n geom_text_repel(data = october, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = -0.06) + \n geom_text_repel(data = august, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = 0.06)"
},
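+ As a side note to the reshaping step in the entry above: gather() and spread() still work, but tidyr (1.0.0 and later) recommends pivot_longer() and pivot_wider() for the same job. A minimal sketch of the equivalent "wide" to "long" reshape, under that version assumption:
+
+ library(tidyverse)
+
+ # same reshape as the gather() call above; as.numeric() keeps date a plain
+ # numeric column instead of a ts object
+ as_tibble(EuStockMarkets) %>%
+   mutate(date = as.numeric(time(EuStockMarkets))) %>%
+   pivot_longer(cols = -date, names_to = "key", values_to = "value")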
{
"objectID": "graphics-guide.html#univariate",
"href": "graphics-guide.html#univariate",
- "title": "R@URBAN",
+ "title": "Urban Institute R Graphics Guide",
"section": "Univariate",
- "text": "Univariate\n\nThere are a number of ways to explore the distributions of univariate data in R. Some methods, like strip charts, show all data points. Other methods, like the box and whisker plot, show selected data points that communicate key values like the median and 25th percentile. Finally, some methods don’t show any of the underlying data but calculate density estimates. Each method has advantages and disadvantages, so it is worthwhile to understand the different forms. For more information, read 40 years of boxplots by Hadley Wickham and Lisa Stryjewski.\n\nStrip Chart\nStrip charts, the simplest univariate plot, show the distribution of values along one axis. Strip charts work best with variables that have plenty of variation. If not, the points tend to cluster on top of each other. Even if the variable has plenty of variation, it is often important to add transparency to the points with alpha = so overlapping values are visible.\n\nmsleep %>%\n ggplot(aes(x = sleep_total, y = factor(1))) +\n geom_point(alpha = 0.2, size = 5) +\n labs(y = NULL) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) +\n scale_y_discrete(labels = NULL) +\n labs(title = \"Total Sleep Time of Different Mammals\",\n x = \"Total sleep time (hours)\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n\n\n\n\n\n\nStrip Chart with Highlighting\nBecause strip charts show all values, they are useful for showing where selected points lie in the distribution of a variable. The clearest way to do this is by adding geom_point() twice with filter() in the data argument. This way, the highlighted values show up on top of unhighlighted values.\n\nggplot() +\n geom_point(data = filter(msleep, name != \"Red fox\"), \n aes(x = sleep_total, \n y = factor(1)),\n alpha = 0.2, \n size = 5,\n color = \"grey50\") +\n geom_point(data = filter(msleep, name == \"Red fox\"),\n aes(x = sleep_total, \n y = factor(1), \n color = name),\n alpha = 0.8,\n size = 5) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) + \n scale_y_discrete(labels = NULL) +\n labs(title = \"Total Sleep Time of Different Mammals\",\n x = \"Total sleep time (hours)\",\n y = NULL,\n legend) +\n guides(color = guide_legend(title = NULL)) +\n theme(axis.ticks.y = element_blank())\n\n\n\n\n\n\nSubsetted Strip Chart\nAdd a y variable to see the distributions of the continuous variable in subsets of a categorical variable.\n\nlibrary(forcats)\n\nmsleep %>%\n filter(!is.na(vore)) %>%\n mutate(vore = fct_recode(vore, \n \"Insectivore\" = \"insecti\",\n \"Omnivore\" = \"omni\", \n \"Herbivore\" = \"herbi\", \n \"Carnivore\" = \"carni\"\n )) %>%\n ggplot(aes(x = sleep_total, y = vore)) +\n geom_point(alpha = 0.2, size = 5) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) + \n labs(title = \"Total Sleep Time of Different Mammals by Diet\",\n x = \"Total sleep time (hours)\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n\n\n\n\n\n\nBeeswarm Plots\nBeesward plots are a variation of strip charts that shows the distribution of data, but without the points overlaping.\n\nlibrary(ggbeeswarm)\n\ntxhousing %>%\n filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>% \n ggplot(aes(x = median, y = city)) +\n geom_beeswarm(alpha = 0.2, size = 5) + \n scale_x_continuous(labels = scales::dollar) +\n labs(title = \"Household Sale Price by City\",\n x = \"Sale 
Price\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n\n\n\n\n\n\nHistograms\nHistograms divide the distribution of a variable into n equal-sized bins and then count and display the number of observations in each bin. Histograms are sensitive to bin width. As ?geom_histogram notes, “You should always override [the default binwidth] value, exploring multiple widths to find the best to illustrate the stories in your data.”\n\nggplot(data = diamonds, mapping = aes(x = depth)) + \n geom_histogram(bins = 100) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 100)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), labels = scales::comma) +\n labs(x = \"Depth\",\n y = \"Count\")\n\n\n\n\n\n\nBoxplots\nBoxplots were invented in the 1970s by John Tukey1. Instead of showing the underlying data or binned counts of the underlying data, they focus on important values like the 25th percentile, median, and 75th percentile.\n\nInsectSprays %>%\n ggplot(mapping = aes(x = spray, y = count)) +\n geom_boxplot() +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Type of insect spray\",\n y = \"Number of dead insects\") +\n remove_ticks()\n\n\n\n\n\n\nSmoothed Kernel Density Plots\nContinuous variables with smooth distributions are sometimes better represented with smoothed kernel density estimates than histograms or boxplots. geom_density() computes and plots a kernel density estimate. Notice the lumps around integers and halves in the following distribution because of rounding.\n\ndiamonds %>%\n ggplot(mapping = aes(carat)) +\n geom_density(color = NA) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, NA)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Carat\",\n y = \"Density\")\n\n\n\n\n\ndiamonds %>%\n mutate(cost = ifelse(price > 5500, \"More than $5,500 +\", \"$0 to $5,500\")) %>%\n ggplot(mapping = aes(carat, fill = cost)) +\n geom_density(alpha = 0.25, color = NA) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, NA)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Carat\",\n y = \"Density\")\n\n\n\n\n\n\nRidgeline Plots\nRidgeline plots are partially overlapping smoothed kernel density plots faceted by a categorical variable that pack a lot of information into one elegant plot.\n\nlibrary(ggridges)\n\nggplot(diamonds, mapping = aes(x = price, y = cut)) +\n geom_density_ridges(fill = \"#1696d2\") +\n labs(x = \"Price\",\n y = \"Cut\")\n\n\n\n\n\n\nViolin Plots\nViolin plots are symmetrical displays of smooth kernel density plots.\n\nInsectSprays %>%\n ggplot(mapping = aes(x = spray, y = count, fill = spray)) +\n geom_violin(color = NA) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Type of insect spray\",\n y = \"Number of dead insects\") +\n remove_ticks()\n\n\n\n\n\n\nBean Plot\nIndividual outliers and important summary values are not visible in violin plots or smoothed kernel density plots. 
Bean plots, created by Peter Kampstra in 2008, are violin plots with data shown as small lines in a one-dimensional sstrip plot and larger lines for the mean.\n\nmsleep %>%\n filter(!is.na(vore)) %>%\n mutate(vore = fct_recode(vore, \n \"Insectivore\" = \"insecti\",\n \"Omnivore\" = \"omni\", \n \"Herbivore\" = \"herbi\", \n \"Carnivore\" = \"carni\"\n )) %>%\n ggplot(aes(x = vore, y = sleep_total, fill = vore)) +\n stat_summary(fun = \"mean\",\n colour = \"black\", \n size = 30,\n shape = 95,\n geom = \"point\") +\n geom_violin(color = NA) +\n geom_jitter(width = 0,\n height = 0.05,\n alpha = 0.4,\n shape = \"-\",\n size = 10,\n color = \"grey50\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) + \n labs(x = NULL,\n y = \"Total sleep time (hours)\") +\n theme(legend.position = \"none\") +\n remove_ticks()"
+ "text": "Univariate\n\nThere are a number of ways to explore the distributions of univariate data in R. Some methods, like strip charts, show all data points. Other methods, like the box and whisker plot, show selected data points that communicate key values like the median and 25th percentile. Finally, some methods don’t show any of the underlying data but calculate density estimates. Each method has advantages and disadvantages, so it is worthwhile to understand the different forms. For more information, read 40 years of boxplots by Hadley Wickham and Lisa Stryjewski.\n\nStrip Chart\nStrip charts, the simplest univariate plot, show the distribution of values along one axis. Strip charts work best with variables that have plenty of variation. If not, the points tend to cluster on top of each other. Even if the variable has plenty of variation, it is often important to add transparency to the points with alpha = so overlapping values are visible.\n\nmsleep %>%\n ggplot(aes(x = sleep_total, y = factor(1))) +\n geom_point(alpha = 0.2, size = 5) +\n labs(y = NULL) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) +\n scale_y_discrete(labels = NULL) +\n labs(title = \"Total Sleep Time of Different Mammals\",\n x = \"Total sleep time (hours)\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n\n\n\n\n\n\nStrip Chart with Highlighting\nBecause strip charts show all values, they are useful for showing where selected points lie in the distribution of a variable. The clearest way to do this is by adding geom_point() twice with filter() in the data argument. This way, the highlighted values show up on top of unhighlighted values.\n\nggplot() +\n geom_point(data = filter(msleep, name != \"Red fox\"), \n aes(x = sleep_total, \n y = factor(1)),\n alpha = 0.2, \n size = 5,\n color = \"grey50\") +\n geom_point(data = filter(msleep, name == \"Red fox\"),\n aes(x = sleep_total, \n y = factor(1), \n color = name),\n alpha = 0.8,\n size = 5) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) + \n scale_y_discrete(labels = NULL) +\n labs(title = \"Total Sleep Time of Different Mammals\",\n x = \"Total sleep time (hours)\",\n y = NULL,\n legend) +\n guides(color = guide_legend(title = NULL)) +\n theme(axis.ticks.y = element_blank())\n\n\n\n\n\n\nSubsetted Strip Chart\nAdd a y variable to see the distributions of the continuous variable in subsets of a categorical variable.\n\nlibrary(forcats)\n\nmsleep %>%\n filter(!is.na(vore)) %>%\n mutate(vore = fct_recode(vore, \n \"Insectivore\" = \"insecti\",\n \"Omnivore\" = \"omni\", \n \"Herbivore\" = \"herbi\", \n \"Carnivore\" = \"carni\"\n )) %>%\n ggplot(aes(x = sleep_total, y = vore)) +\n geom_point(alpha = 0.2, size = 5) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) + \n labs(title = \"Total Sleep Time of Different Mammals by Diet\",\n x = \"Total sleep time (hours)\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n\n\n\n\n\n\nBeeswarm Plots\nBeesward plots are a variation of strip charts that shows the distribution of data, but without the points overlaping.\n\nlibrary(ggbeeswarm)\n\ntxhousing %>%\n filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>% \n ggplot(aes(x = median, y = city)) +\n geom_beeswarm(alpha = 0.2, size = 5) + \n scale_x_continuous(labels = scales::dollar) +\n labs(title = \"Household Sale Price by City\",\n x = \"Sale 
Price\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n\n\n\n\n\n\nHistograms\nHistograms divide the distribution of a variable into n equal-sized bins and then count and display the number of observations in each bin. Histograms are sensitive to bin width. As ?geom_histogram notes, “You should always override [the default binwidth] value, exploring multiple widths to find the best to illustrate the stories in your data.”\n\nggplot(data = diamonds, mapping = aes(x = depth)) + \n geom_histogram(bins = 100) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 100)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), labels = scales::comma) +\n labs(x = \"Depth\",\n y = \"Count\")\n\n\n\n\n\n\nBoxplots\nBoxplots were invented in the 1970s by John Tukey1. Instead of showing the underlying data or binned counts of the underlying data, they focus on important values like the 25th percentile, median, and 75th percentile.\n\nInsectSprays %>%\n ggplot(mapping = aes(x = spray, y = count)) +\n geom_boxplot() +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Type of insect spray\",\n y = \"Number of dead insects\") +\n remove_ticks()\n\n\n\n\n\n\nSmoothed Kernel Density Plots\nContinuous variables with smooth distributions are sometimes better represented with smoothed kernel density estimates than histograms or boxplots. geom_density() computes and plots a kernel density estimate. Notice the lumps around integers and halves in the following distribution because of rounding.\n\ndiamonds %>%\n ggplot(mapping = aes(carat)) +\n geom_density(color = NA) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, NA)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Carat\",\n y = \"Density\")\n\n\n\n\n\ndiamonds %>%\n mutate(cost = ifelse(price > 5500, \"More than $5,500 +\", \"$0 to $5,500\")) %>%\n ggplot(mapping = aes(carat, fill = cost)) +\n geom_density(alpha = 0.25, color = NA) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, NA)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Carat\",\n y = \"Density\")\n\n\n\n\n\n\nRidgeline Plots\nRidgeline plots are partially overlapping smoothed kernel density plots faceted by a categorical variable that pack a lot of information into one elegant plot.\n\nlibrary(ggridges)\n\nggplot(diamonds, mapping = aes(x = price, y = cut)) +\n geom_density_ridges(fill = \"#1696d2\") +\n labs(x = \"Price\",\n y = \"Cut\")\n\n\n\n\n\n\nViolin Plots\nViolin plots are symmetrical displays of smooth kernel density plots.\n\nInsectSprays %>%\n ggplot(mapping = aes(x = spray, y = count, fill = spray)) +\n geom_violin(color = NA) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Type of insect spray\",\n y = \"Number of dead insects\") +\n remove_ticks()\n\n\n\n\n\n\nBean Plot\nIndividual outliers and important summary values are not visible in violin plots or smoothed kernel density plots. 
Bean plots, created by Peter Kampstra in 2008, are violin plots with data shown as small lines in a one-dimensional strip plot and larger lines for the mean.\n\nmsleep %>%\n filter(!is.na(vore)) %>%\n mutate(vore = fct_recode(vore, \n \"Insectivore\" = \"insecti\",\n \"Omnivore\" = \"omni\", \n \"Herbivore\" = \"herbi\", \n \"Carnivore\" = \"carni\"\n )) %>%\n ggplot(aes(x = vore, y = sleep_total, fill = vore)) +\n stat_summary(fun = \"mean\",\n colour = \"black\", \n size = 30,\n shape = 95,\n geom = \"point\") +\n geom_violin(color = NA) +\n geom_jitter(width = 0,\n height = 0.05,\n alpha = 0.4,\n shape = \"-\",\n size = 10,\n color = \"grey50\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) + \n labs(x = NULL,\n y = \"Total sleep time (hours)\") +\n theme(legend.position = \"none\") +\n remove_ticks()"
},
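The advice above about exploring multiple binwidths is easy to act on. Below is a minimal sketch, assuming only ggplot2; the three binwidth values tried are arbitrary choices for illustration, not recommendations.

```r
# compare several binwidths before settling on one;
# the values tried here are arbitrary
library(ggplot2)

for (bw in c(0.1, 0.5, 1)) {
  print(
    ggplot(diamonds, aes(x = depth)) +
      geom_histogram(binwidth = bw) +
      labs(title = paste("binwidth =", bw), x = "Depth", y = "Count")
  )
}
```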
{
"objectID": "graphics-guide.html#area-plot",
"href": "graphics-guide.html#area-plot",
- "title": "R@URBAN",
+ "title": "Urban Institute R Graphics Guide",
"section": "Area Plot",
- "text": "Area Plot\n\n\nStacked Area\n\ntxhousing %>%\n filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>%\n group_by(city, year) %>%\n summarize(sales = sum(sales)) %>%\n ggplot(aes(x = year, y = sales, fill = city)) +\n geom_area(position = \"stack\") +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(2000, 2015),\n breaks = 2000 + 0:15) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n labels = scales::comma) +\n labs(x = \"Year\",\n y = \"Home sales\")\n\n\n\n\n\n\nFilled Area\n\ntxhousing %>%\n filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>%\n group_by(city, year) %>%\n summarize(sales = sum(sales)) %>%\n ggplot(aes(x = year, y = sales, fill = city)) +\n geom_area(position = \"fill\") +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(2000, 2015),\n breaks = 2000 + 0:15) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.02)),\n breaks = c(0, 0.25, 0.5, 0.75, 1),\n labels = scales::percent) +\n labs(x = \"Year\",\n y = \"Home sales\")"
+ "text": "Area Plot\n\n\nStacked Area\n\ntxhousing %>%\n filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>%\n group_by(city, year) %>%\n summarize(sales = sum(sales)) %>%\n ggplot(aes(x = year, y = sales, fill = city)) +\n geom_area(position = \"stack\") +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(2000, 2015),\n breaks = 2000 + 0:15) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n labels = scales::comma) +\n labs(x = \"Year\",\n y = \"Home sales\")\n\n\n\n\n\n\nFilled Area\n\ntxhousing %>%\n filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>%\n group_by(city, year) %>%\n summarize(sales = sum(sales)) %>%\n ggplot(aes(x = year, y = sales, fill = city)) +\n geom_area(position = \"fill\") +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(2000, 2015),\n breaks = 2000 + 0:15) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.02)),\n breaks = c(0, 0.25, 0.5, 0.75, 1),\n labels = scales::percent) +\n labs(x = \"Year\",\n y = \"Home sales\")"
+ },
+ {
+ "objectID": "graphics-guide.html#sankey-plot",
+ "href": "graphics-guide.html#sankey-plot",
+ "title": "Urban Institute R Graphics Guide",
+ "section": "Sankey Plot",
+ "text": "Sankey Plot\n\nSankey plots visualize flows from one set of variables to another. This can be useful for showing outcomes from the start of a program to the end. You’ll need to install the ggsankey package to create Sankey plots in R. In this example I make a dummy data set of housing status prior to program start and at exit to show the flow of people between outcomes. A key step is to transform your data set using the make_long function from the package. This creates a data frame that specifies each of the initial nodes and how they flow into the next stage.\n\n# load ggsankey package\nremotes::install_github(\"davidsjoberg/ggsankey\")\nlibrary(ggsankey)\n\n# create a dummy dataset of housing status\ndf <- data_frame(entry_status = c(rep(\"Housed\", 7), rep(\"Unhoused\", 15), rep(\"Staying w/ Family\", 8)), \n exit_status = c(rep(\"Housed\", 15), rep(\"Unhoused\", 2), rep(\"Staying w/ Family\", 13))) %>% \n # transform the data frame into the proper format for the sankey plot\n make_long(entry_status, exit_status) %>% \n # recode the labels to be cleaner in the plot \n mutate(x = recode(x, entry_status = \"Prior Housing Status\", exit_status = \"Exit Housing Status\"),\n next_x = recode(next_x, entry_status = \"Prior Housing Status\", exit_status = \"Exit Housing Status\"))\n\n# create sankey plot\nggplot(df, aes(x = x, \n next_x = next_x, \n node = node, \n next_node = next_node,\n fill = factor(node), \n label = node)) +\n geom_sankey(flow.alpha = 0.5, node.color = 1, show.legend = FALSE) +\n # add labels to plot and style\n geom_sankey_label(size = 3.5, color = 1, fill = \"white\") +\n theme_sankey(base_size = 16)+\n labs(x = NULL)"
},
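To make the make_long() step above more concrete, here is a minimal sketch that prints the reshaped data instead of plotting it. The two-row toy data set is invented for illustration, and it assumes ggsankey is already installed as shown above.

```r
library(dplyr)
library(ggsankey)

# a tiny invented flow: two people, their entry and exit statuses
toy <- tibble(
  entry_status = c("Housed", "Unhoused"),
  exit_status  = c("Housed", "Housed")
)

# make_long() returns one row per node with x, node, next_x, and
# next_node columns, which geom_sankey() maps to stages and flows
toy %>%
  make_long(entry_status, exit_status)
```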
{
"objectID": "graphics-guide.html#heat-map",
"href": "graphics-guide.html#heat-map",
- "title": "R@URBAN",
+ "title": "Urban Institute R Graphics Guide",
"section": "Heat Map",
- "text": "Heat Map\n\n\nlibrary(fivethirtyeight)\n\nbad_drivers %>%\n filter(state %in% c(\"Maine\", \"New Hampshire\", \"Vermont\", \"Massachusetts\", \"Connecticut\", \"New York\")) %>%\n mutate(`Number of\\nDrivers` = scale(num_drivers),\n `Percent\\nSpeeding` = scale(perc_speeding),\n `Percent\\nAlcohol` = scale(perc_alcohol),\n `Percent Not\\nDistracted` = scale(perc_not_distracted),\n `Percent No\\nPrevious` = scale(perc_no_previous),\n state = factor(state, levels = rev(state))\n ) %>%\n select(-insurance_premiums, -losses, -(num_drivers:losses)) %>%\n gather(`Number of\\nDrivers`:`Percent No\\nPrevious`, key = \"variable\", value = \"SD's from Mean\") %>%\n ggplot(aes(variable, state)) +\n geom_tile(aes(fill = `SD's from Mean`)) +\n labs(x = NULL,\n y = NULL) + \n scale_fill_gradientn() +\n theme(legend.position = \"right\",\n legend.direction = \"vertical\",\n axis.line.x = element_blank(),\n panel.grid.major.y = element_blank()) +\n remove_ticks()\n\n\n\n#https://learnr.wordpress.com/2010/01/26/ggplot2-quick-heatmap-plotting/"
+ "text": "Heat Map\n\n\nlibrary(fivethirtyeight)\n\nbad_drivers %>%\n filter(state %in% c(\"Maine\", \"New Hampshire\", \"Vermont\", \"Massachusetts\", \"Connecticut\", \"New York\")) %>%\n mutate(`Number of\\nDrivers` = scale(num_drivers),\n `Percent\\nSpeeding` = scale(perc_speeding),\n `Percent\\nAlcohol` = scale(perc_alcohol),\n `Percent Not\\nDistracted` = scale(perc_not_distracted),\n `Percent No\\nPrevious` = scale(perc_no_previous),\n state = factor(state, levels = rev(state))\n ) %>%\n select(-insurance_premiums, -losses, -(num_drivers:losses)) %>%\n gather(`Number of\\nDrivers`:`Percent No\\nPrevious`, key = \"variable\", value = \"SD's from Mean\") %>%\n ggplot(aes(variable, state)) +\n geom_tile(aes(fill = `SD's from Mean`)) +\n labs(x = NULL,\n y = NULL) + \n scale_fill_gradientn() +\n theme(legend.position = \"right\",\n legend.direction = \"vertical\",\n axis.line.x = element_blank(),\n panel.grid.major.y = element_blank()) +\n remove_ticks()\n\n\n\n#https://learnr.wordpress.com/2010/01/26/ggplot2-quick-heatmap-plotting/"
},
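The fill variable in the heat map above is built with scale(), which centers each variable and rescales it to standard-deviation units so all of the columns share one color scale. A quick sketch of what scale() returns:

```r
# scale() returns a one-column matrix of z-scores
scale(c(2, 4, 6))

# wrap in as.numeric() if a plain vector is preferred inside mutate()
as.numeric(scale(c(2, 4, 6)))
```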
{
"objectID": "graphics-guide.html#faceting-and-small-multiples",
"href": "graphics-guide.html#faceting-and-small-multiples",
- "title": "R@URBAN",
+ "title": "Urban Institute R Graphics Guide",
"section": "Faceting and Small Multiples",
- "text": "Faceting and Small Multiples\n\n\nfacet_wrap()\nR’s faceting system is a powerful way to make “small multiples”.\nSome edits to the theme may be necessary depending upon how many rows and columns are in the plot.\n\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n facet_wrap(~cut, ncol = 5) +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 6)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 20000), \n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n\n\n\n\n\n\nfacet_grid()\n\ndiamonds %>%\n filter(color %in% c(\"D\", \"E\", \"F\", \"G\")) %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n facet_grid(color ~ cut) +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 4)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 20000), \n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n theme(panel.spacing = unit(20L, \"pt\")) +\n scatter_grid()"
+ "text": "Faceting and Small Multiples\n\n\nfacet_wrap()\nR’s faceting system is a powerful way to make “small multiples”.\nSome edits to the theme may be necessary depending upon how many rows and columns are in the plot.\n\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n facet_wrap(~cut, ncol = 5) +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 6)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 20000), \n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n\n\n\n\n\n\nfacet_grid()\n\ndiamonds %>%\n filter(color %in% c(\"D\", \"E\", \"F\", \"G\")) %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n facet_grid(color ~ cut) +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 4)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 20000), \n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n theme(panel.spacing = unit(20L, \"pt\")) +\n scatter_grid()"
},
{
"objectID": "graphics-guide.html#smoothers",
"href": "graphics-guide.html#smoothers",
- "title": "R@URBAN",
+ "title": "Urban Institute R Graphics Guide",
"section": "Smoothers",
- "text": "Smoothers\n\ngeom_smooth() fits and plots models to data with two or more dimensions.\nUnderstanding and manipulating defaults is more important for geom_smooth() than other geoms because it contains a number of assumptions. geom_smooth() automatically uses loess for datasets with fewer than 1,000 observations and a generalized additive model with formula = y ~ s(x, bs = \"cs\") for datasets with greater than 1,000 observations. Both default to a 95% confidence interval with the confidence interval displayed.\nModels are chosen with method = and can be set to lm(), glm(), gam(), loess(), rlm(), and more. Formulas can be specified with formula = and y ~ x syntax. Plotting the standard error is toggled with se = TRUE and se = FALSE, and level is specificed with level =. As always, more information can be seen in RStudio with ?geom_smooth().\ngeom_point() adds a scatterplot to geom_smooth(). The order of the function calls is important. The function called second will be layed on top of the function called first.\n\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n geom_smooth(color = \"#ec008b\") +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 5),\n breaks = 0:5) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 20000), \n labels = scales::dollar) + \n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n\n\n\n\ngeom_smooth can be subset by categorical and factor variables. This requires subgroups to have a decent number of observations and and a fair amount of variability across the x-axis. Confidence intervals often widen at the ends so special care is needed for the chart to be meaningful and readable.\nThis example uses Loess with MPG = displacement.\n\nggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = factor(cyl))) +\n geom_point(alpha = 0.2) +\n geom_smooth() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 7),\n breaks = 0:7) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 60)) + \n labs(x = \"Engine displacement\",\n y = \"Highway MPG\") +\n scatter_grid()\n\n\n\n\nThis example uses linear models with MPG = displacement.\n\nggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = factor(cyl))) +\n geom_point(alpha = 0.2) +\n geom_smooth(method = \"lm\") +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 7),\n breaks = 0:7) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 60)) + \n labs(x = \"Engine displacement\",\n y = \"Highway MPG\") +\n scatter_grid()"
+ "text": "Smoothers\n\ngeom_smooth() fits and plots models to data with two or more dimensions.\nUnderstanding and manipulating defaults is more important for geom_smooth() than other geoms because it contains a number of assumptions. geom_smooth() automatically uses loess for datasets with fewer than 1,000 observations and a generalized additive model with formula = y ~ s(x, bs = \"cs\") for datasets with greater than 1,000 observations. Both default to a 95% confidence interval with the confidence interval displayed.\nModels are chosen with method = and can be set to lm(), glm(), gam(), loess(), rlm(), and more. Formulas can be specified with formula = and y ~ x syntax. Plotting the standard error is toggled with se = TRUE and se = FALSE, and level is specificed with level =. As always, more information can be seen in RStudio with ?geom_smooth().\ngeom_point() adds a scatterplot to geom_smooth(). The order of the function calls is important. The function called second will be layed on top of the function called first.\n\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n geom_smooth(color = \"#ec008b\") +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 5),\n breaks = 0:5) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 20000), \n labels = scales::dollar) + \n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n\n\n\n\ngeom_smooth can be subset by categorical and factor variables. This requires subgroups to have a decent number of observations and and a fair amount of variability across the x-axis. Confidence intervals often widen at the ends so special care is needed for the chart to be meaningful and readable.\nThis example uses Loess with MPG = displacement.\n\nggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = factor(cyl))) +\n geom_point(alpha = 0.2) +\n geom_smooth() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 7),\n breaks = 0:7) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 60)) + \n labs(x = \"Engine displacement\",\n y = \"Highway MPG\") +\n scatter_grid()\n\n\n\n\nThis example uses linear models with MPG = displacement.\n\nggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = factor(cyl))) +\n geom_point(alpha = 0.2) +\n geom_smooth(method = \"lm\") +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 7),\n breaks = 0:7) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 60)) + \n labs(x = \"Engine displacement\",\n y = \"Highway MPG\") +\n scatter_grid()"
},
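As a minimal sketch of overriding those defaults all at once, the call below sets the model, formula, and interval explicitly; the 90% level is an arbitrary choice for illustration.

```r
library(ggplot2)

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(alpha = 0.2) +
  # explicit model, formula, and a 90% confidence interval
  geom_smooth(method = "lm", formula = y ~ x, se = TRUE, level = 0.90)
```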
{
"objectID": "graphics-guide.html#highlighting",
"href": "graphics-guide.html#highlighting",
- "title": "R@URBAN",
+ "title": "Urban Institute R Graphics Guide",
"section": "Highlighting",
- "text": "Highlighting\n\nlibrary(gghighlight) enables the intuitive highlighting of ggplot2 plots. gghighlight modifies existing ggplot2 objects, so no other code should change. All of the highlighting is handled by the function gghighlight(), which can handle all types of geoms.\nWarning: R will throw an error if too many colors are highlighted because of the design of urbnthemes. Simply decrease the number of highlighted geoms to solve this issue.\nThere are two main ways to highlight.\n\nThreshold\nThe first way to highlight is with a threshold. Add a logical test to gghighlight() to describe which lines should be highlighted. Here, lines with maximum change in per-capita Gross Domestic Product greater than $35,000 are highlighted by gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE).\n\nlibrary(gghighlight)\nlibrary(gapminder)\n\ndata <- gapminder %>%\n filter(continent %in% c(\"Europe\")) %>%\n group_by(country) %>%\n mutate(pcgpd_change = ifelse(year == 1952, 0, gdpPercap - lag(gdpPercap))) %>%\n mutate(pcgpd_change = cumsum(pcgpd_change))\n \ndata %>%\n ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\")\n\n\n\n\n\n\nRank\nThe second way to highlight is by rank. Here, the countries with the first highest values for change in per-capita Gross Domestic Product are highlighted with gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE).\n\ndata %>%\n ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\")\n\n\n\n\n\n\nFaceting\ngghighlight() works well with ggplot2’s faceting system.\n\ndata %>%\n ggplot(aes(year, pcgpd_change, group = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change), max_highlight = 4, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\") +\n facet_wrap(~ country) +\n theme(panel.spacing = unit(20L, \"pt\"))"
+ "text": "Highlighting\n\nlibrary(gghighlight) enables the intuitive highlighting of ggplot2 plots. gghighlight modifies existing ggplot2 objects, so no other code should change. All of the highlighting is handled by the function gghighlight(), which can handle all types of geoms.\nWarning: R will throw an error if too many colors are highlighted because of the design of urbnthemes. Simply decrease the number of highlighted geoms to solve this issue.\nThere are two main ways to highlight.\n\nThreshold\nThe first way to highlight is with a threshold. Add a logical test to gghighlight() to describe which lines should be highlighted. Here, lines with maximum change in per-capita Gross Domestic Product greater than $35,000 are highlighted by gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE).\n\nlibrary(gghighlight)\nlibrary(gapminder)\n\ndata <- gapminder %>%\n filter(continent %in% c(\"Europe\")) %>%\n group_by(country) %>%\n mutate(pcgpd_change = ifelse(year == 1952, 0, gdpPercap - lag(gdpPercap))) %>%\n mutate(pcgpd_change = cumsum(pcgpd_change))\n \ndata %>%\n ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\")\n\n\n\n\n\n\nRank\nThe second way to highlight is by rank. Here, the countries with the first highest values for change in per-capita Gross Domestic Product are highlighted with gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE).\n\ndata %>%\n ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\")\n\n\n\n\n\n\nFaceting\ngghighlight() works well with ggplot2’s faceting system.\n\ndata %>%\n ggplot(aes(year, pcgpd_change, group = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change), max_highlight = 4, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\") +\n facet_wrap(~ country) +\n theme(panel.spacing = unit(20L, \"pt\"))"
},
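Since gghighlight() handles all types of geoms, the same pattern works on scatterplots. A minimal sketch with an arbitrary threshold:

```r
library(ggplot2)
library(gghighlight)

ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  # highlight only the high-mpg cars; 30 is an arbitrary cutoff
  gghighlight(mpg > 30, use_direct_label = FALSE)
```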
{
"objectID": "graphics-guide.html#text-and-annotation",
"href": "graphics-guide.html#text-and-annotation",
- "title": "R@URBAN",
+ "title": "Urban Institute R Graphics Guide",
"section": "Text and Annotation",
- "text": "Text and Annotation\n\nSeveral functions can be used to annotate, label, and highlight different parts of plots. geom_text() and geom_text_repel() both display variables from data frames. annotate(), which has several different uses, displays variables and values included in the function call.\n\ngeom_text()\ngeom_text() turns text variables in data sets into geometric objects. This is useful for labeling data in plots. Both functions need x values and y values to determine placement on the coordinate plane, and a text vector of labels.\nThis can be used to label geom_bar().\n\ndiamonds %>%\n group_by(cut) %>%\n summarize(price = mean(price)) %>%\n ggplot(aes(cut, price)) +\n geom_bar(stat = \"identity\") +\n geom_text(aes(label = scales::dollar(price)), vjust = -1) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2)),\n labels = scales::dollar) +\n labs(title = \"Average Diamond Price by Diamond Cut\",\n x = \"Cut\",\n y = \"Price\") +\n remove_ticks()\n\n\n\n\nIt can also be used to label points in a scatter plot.\nIt’s rarely useful to label every point in a scatter plot. Use filter() to create a second data set that is subsetted and pass it into the labelling function.\n\nlabels <- mtcars %>%\n rownames_to_column(\"model\") %>%\n filter(model %in% c(\"Toyota Corolla\", \"Merc 240D\", \"Datsun 710\"))\n\nmtcars %>%\n ggplot() +\n geom_point(mapping = aes(x = wt, y = mpg)) +\n geom_text(data = labels, mapping = aes(x = wt, y = mpg, label = model), nudge_x = 0.38) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 6)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 40)) + \n labs(x = \"Weight (Tons)\",\n y = \"Miles per gallon (MPG)\") +\n scatter_grid()\n\n\n\n\nText too often overlaps with other text or geoms when using geom_text(). library(ggrepel) is a library(ggplot2) add-on that automatically positions text so it doesn’t overlap with geoms or other text. To add this functionality, install and load library(ggrepel) and then use geom_text_repel() with the same syntax as geom_text().\n\n\ngeom_text_repel()\n\nlibrary(ggrepel)\n\nlabels <- mtcars %>%\n rownames_to_column(\"model\") %>%\n top_n(5, mpg)\n\nmtcars %>%\n ggplot(mapping = aes(x = wt, y = mpg)) +\n geom_point() +\n geom_text_repel(data = labels, \n mapping = aes(label = model), \n nudge_x = 0.38) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 6)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 40)) + \n labs(x = \"Weight (Tons)\",\n y = \"Miles per gallon (MPG)\") +\n scatter_grid()\n\n\n\n\n\n\nannotate()\nannotate() doesn’t use data frames. Instead, it takes values for x = and y =. 
It can add text, rectangles, segments, and pointrange.\n\nmsleep %>%\n filter(bodywt <= 1000) %>%\n ggplot(aes(bodywt, sleep_total)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(-10, 1000),\n labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 25)) + \n annotate(\"text\", x = 500, y = 12, label = \"These data suggest that heavy \\n animals sleep less than light animals\") +\n labs(x = \"Body weight (pounds)\",\n y = \"Sleep time (hours)\") +\n scatter_grid() \n\n\n\n\n\nlibrary(AmesHousing)\n\names <- make_ames()\n\names %>%\n mutate(square_footage = Total_Bsmt_SF - Bsmt_Unf_SF + First_Flr_SF + Second_Flr_SF) %>%\n mutate(Sale_Price = Sale_Price / 1000) %>% \n ggplot(aes(square_footage, Sale_Price)) +\n geom_point(alpha = 0.2) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(-10, 12000),\n labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 800),\n labels = scales::dollar) + \n annotate(\"rect\", xmin = 6800, xmax = 11500, ymin = 145, ymax = 210, alpha = 0.1) +\n annotate(\"text\", x = 8750, y = 230, label = \"Unfinished homes\") +\n labs(x = \"Square footage\", \n y = \"Sale price (thousands)\") +\n scatter_grid()"
+ "text": "Text and Annotation\n\nSeveral functions can be used to annotate, label, and highlight different parts of plots. geom_text() and geom_text_repel() both display variables from data frames. annotate(), which has several different uses, displays variables and values included in the function call.\n\ngeom_text()\ngeom_text() turns text variables in data sets into geometric objects. This is useful for labeling data in plots. Both functions need x values and y values to determine placement on the coordinate plane, and a text vector of labels.\nThis can be used to label geom_bar().\n\ndiamonds %>%\n group_by(cut) %>%\n summarize(price = mean(price)) %>%\n ggplot(aes(cut, price)) +\n geom_bar(stat = \"identity\") +\n geom_text(aes(label = scales::dollar(price)), vjust = -1) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2)),\n labels = scales::dollar) +\n labs(title = \"Average Diamond Price by Diamond Cut\",\n x = \"Cut\",\n y = \"Price\") +\n remove_ticks()\n\n\n\n\nIt can also be used to label points in a scatter plot.\nIt’s rarely useful to label every point in a scatter plot. Use filter() to create a second data set that is subsetted and pass it into the labelling function.\n\nlabels <- mtcars %>%\n rownames_to_column(\"model\") %>%\n filter(model %in% c(\"Toyota Corolla\", \"Merc 240D\", \"Datsun 710\"))\n\nmtcars %>%\n ggplot() +\n geom_point(mapping = aes(x = wt, y = mpg)) +\n geom_text(data = labels, mapping = aes(x = wt, y = mpg, label = model), nudge_x = 0.38) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 6)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 40)) + \n labs(x = \"Weight (Tons)\",\n y = \"Miles per gallon (MPG)\") +\n scatter_grid()\n\n\n\n\nText too often overlaps with other text or geoms when using geom_text(). library(ggrepel) is a library(ggplot2) add-on that automatically positions text so it doesn’t overlap with geoms or other text. To add this functionality, install and load library(ggrepel) and then use geom_text_repel() with the same syntax as geom_text().\n\n\ngeom_text_repel()\n\nlibrary(ggrepel)\n\nlabels <- mtcars %>%\n rownames_to_column(\"model\") %>%\n top_n(5, mpg)\n\nmtcars %>%\n ggplot(mapping = aes(x = wt, y = mpg)) +\n geom_point() +\n geom_text_repel(data = labels, \n mapping = aes(label = model), \n nudge_x = 0.38) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 6)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 40)) + \n labs(x = \"Weight (Tons)\",\n y = \"Miles per gallon (MPG)\") +\n scatter_grid()\n\n\n\n\n\n\nannotate()\nannotate() doesn’t use data frames. Instead, it takes values for x = and y =. 
It can add text, rectangles, segments, and pointrange.\n\nmsleep %>%\n filter(bodywt <= 1000) %>%\n ggplot(aes(bodywt, sleep_total)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(-10, 1000),\n labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 25)) + \n annotate(\"text\", x = 500, y = 12, label = \"These data suggest that heavy \\n animals sleep less than light animals\") +\n labs(x = \"Body weight (pounds)\",\n y = \"Sleep time (hours)\") +\n scatter_grid() \n\n\n\n\n\nlibrary(AmesHousing)\n\names <- make_ames()\n\names %>%\n mutate(square_footage = Total_Bsmt_SF - Bsmt_Unf_SF + First_Flr_SF + Second_Flr_SF) %>%\n mutate(Sale_Price = Sale_Price / 1000) %>% \n ggplot(aes(square_footage, Sale_Price)) +\n geom_point(alpha = 0.2) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(-10, 12000),\n labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 800),\n labels = scales::dollar) + \n annotate(\"rect\", xmin = 6800, xmax = 11500, ymin = 145, ymax = 210, alpha = 0.1) +\n annotate(\"text\", x = 8750, y = 230, label = \"Unfinished homes\") +\n labs(x = \"Square footage\", \n y = \"Sale price (thousands)\") +\n scatter_grid()"
},
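annotate() also accepts "segment" as a geom. A minimal sketch with invented coordinates:

```r
library(ggplot2)

ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  # a line segment plus a label, placed at arbitrary coordinates
  annotate("segment", x = 2, xend = 4, y = 30, yend = 20) +
  annotate("text", x = 4.3, y = 19, label = "heavier cars, lower MPG")
```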
{
"objectID": "graphics-guide.html#layered-geoms",
"href": "graphics-guide.html#layered-geoms",
- "title": "R@URBAN",
+ "title": "Urban Institute R Graphics Guide",
"section": "Layered Geoms",
- "text": "Layered Geoms\n\nGeoms can be layered in ggplot2. This is useful for design and analysis.\nIt is often useful to add points to line plots with a small number of values across the x-axis. This example from R for Data Science shows how changing the line to grey can be appealing.\n\nDesign\n\nBefore\n\ntable1 %>%\n ggplot(aes(x = year, y = cases)) +\n geom_line(aes(color = country)) +\n geom_point(aes(color = country)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n labels = scales::comma) +\n scale_x_continuous(breaks = c(1999, 2000)) +\n labs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n\n\n\n\n\n\nAfter\n\ntable1 %>%\n ggplot(aes(year, cases)) +\n geom_line(aes(group = country), color = \"grey50\") +\n geom_point(aes(color = country)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n labels = scales::comma) +\n scale_x_continuous(breaks = c(1999, 2000)) +\n labs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n\n\n\n\n\n\n\nCentroids\n\nmpg_summary <- mpg %>%\n group_by(cyl) %>%\n summarize(displ = mean(displ), cty = mean(cty))\n\nmpg %>%\n ggplot() +\n geom_point(aes(x = displ, y = cty, color = factor(cyl)), alpha = 0.5) +\n geom_point(data = mpg_summary, aes(x = displ, y = cty), size = 5, color = \"#ec008b\") +\n geom_text(data = mpg_summary, aes(x = displ, y = cty, label = cyl)) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 8)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0)), \n limits = c(0, 40)) +\n labs(x = \"Displacement\",\n y = \"City MPG\") +\n scatter_grid()"
+ "text": "Layered Geoms\n\nGeoms can be layered in ggplot2. This is useful for design and analysis.\nIt is often useful to add points to line plots with a small number of values across the x-axis. This example from R for Data Science shows how changing the line to grey can be appealing.\n\nDesign\n\nBefore\n\ntable1 %>%\n ggplot(aes(x = year, y = cases)) +\n geom_line(aes(color = country)) +\n geom_point(aes(color = country)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n labels = scales::comma) +\n scale_x_continuous(breaks = c(1999, 2000)) +\n labs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n\n\n\n\n\n\nAfter\n\ntable1 %>%\n ggplot(aes(year, cases)) +\n geom_line(aes(group = country), color = \"grey50\") +\n geom_point(aes(color = country)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n labels = scales::comma) +\n scale_x_continuous(breaks = c(1999, 2000)) +\n labs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n\n\n\n\n\n\n\nCentroids\n\nmpg_summary <- mpg %>%\n group_by(cyl) %>%\n summarize(displ = mean(displ), cty = mean(cty))\n\nmpg %>%\n ggplot() +\n geom_point(aes(x = displ, y = cty, color = factor(cyl)), alpha = 0.5) +\n geom_point(data = mpg_summary, aes(x = displ, y = cty), size = 5, color = \"#ec008b\") +\n geom_text(data = mpg_summary, aes(x = displ, y = cty, label = cyl)) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 8)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0)), \n limits = c(0, 40)) +\n labs(x = \"Displacement\",\n y = \"City MPG\") +\n scatter_grid()"
},
{
"objectID": "graphics-guide.html#saving-plots",
"href": "graphics-guide.html#saving-plots",
- "title": "R@URBAN",
+ "title": "Urban Institute R Graphics Guide",
"section": "Saving Plots",
- "text": "Saving Plots\n\nggsave() exports ggplot2 plots. The function can be used in two ways. If plot = isn’t specified in the function call, then ggsave() automatically saves the plot that was last displayed in the Viewer window. Second, if plot = is specified, then ggsave() saves the specified plot. ggsave() guesses the type of graphics device to use in export (.png, .pdf, .svg, etc.) from the file extension in the filename.\nmtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.png\")\n\nplot2 <- mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.png\", plot = plot2)\nExported plots rarely look identical to the plots that show up in the Viewer window in RStudio because the overall size and aspect ratio of the Viewer is often different than the defaults for ggsave(). Specific sizes, aspect ratios, and resolutions can be controlled with arguments in ggsave(). RStudio has a useful cheatsheet called “How Big is Your Graph?” that should help with choosing the best size, aspect ratio, and resolution.\nFonts are not embedded in PDFs by default. To embed fonts in PDFs, include device = cairo_pdf in ggsave().\nplot <- mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.pdf\", plot = plot2, width = 6.5, height = 4, device = cairo_pdf)"
+ "text": "Saving Plots\n\nggsave() exports ggplot2 plots. The function can be used in two ways. If plot = isn’t specified in the function call, then ggsave() automatically saves the plot that was last displayed in the Viewer window. Second, if plot = is specified, then ggsave() saves the specified plot. ggsave() guesses the type of graphics device to use in export (.png, .pdf, .svg, etc.) from the file extension in the filename.\nmtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.png\")\n\nplot2 <- mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.png\", plot = plot2)\nExported plots rarely look identical to the plots that show up in the Viewer window in RStudio because the overall size and aspect ratio of the Viewer is often different than the defaults for ggsave(). Specific sizes, aspect ratios, and resolutions can be controlled with arguments in ggsave(). RStudio has a useful cheatsheet called “How Big is Your Graph?” that should help with choosing the best size, aspect ratio, and resolution.\nFonts are not embedded in PDFs by default. To embed fonts in PDFs, include device = cairo_pdf in ggsave().\nplot <- mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.pdf\", plot = plot2, width = 6.5, height = 4, device = cairo_pdf)"
+ },
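As a sketch of controlling size and resolution directly, the call below pins the dimensions and dpi instead of relying on Viewer defaults; the 6.5in by 4in and 300 dpi values are illustrative, and plot2 is the object created in the example above.

```r
# export at an explicit size and resolution
ggsave(filename = "cars.png", plot = plot2,
       width = 6.5, height = 4, units = "in", dpi = 300)
```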
+ {
+ "objectID": "graphics-guide.html#interactive-plots",
+ "href": "graphics-guide.html#interactive-plots",
+ "title": "Urban Institute R Graphics Guide",
+ "section": "Interactive Plots",
+ "text": "Interactive Plots\nWe can make any of the previous plots interactive with the powerful and easy plotly library. All we have to do is wrap a ggplot object in the ggplotly function. Note: You can’t add ggplotly to the end of a ggplot object, but have to actually save the ggplot as a variable and then wrap that in the function call as shown below.\nYou can customize the tooltip text by adding a value to text in aes() and then specifying tooltip = \"text\" in the ggplotly call.\n\nlibrary(plotly)\n\nstock_plot <- as_tibble(EuStockMarkets) %>% \n mutate(date = time(EuStockMarkets)) %>% \n gather(key = \"key\", value = \"value\", -date) %>% \n ggplot(mapping = aes(x = date, y = value, color = key,\n # sometimes ggplotly messes with line charts,\n # adding a group value usually helps with that\n group = key,\n # customize the tooltip with the text aes\n text = paste0(\"Value: \", round(value, 2), \"<br>\",\n \"Date: \", round(date, 3), \"<br>\",\n \"Key: \", key))\n ) +\n geom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(1991, 1999), \n breaks = c(1991, 1993, 1995, 1997, 1999)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 2500,\n labels = scales::dollar, \n limits = c(0, 10000)) + \n labs(x = \"Date\",\n y = \"Value\")\n\n# make interactive with gggplotly\n# Uncomment pipe to hide the interative toolbar in the top right \nggplotly(stock_plot, tooltip = \"text\") # %>% config(displayModeBar = FALSE)"
},
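If you need to share the interactive version outside of RStudio, one option is saving it as a standalone HTML file with htmlwidgets. A minimal sketch, assuming stock_plot from the example above; the file name is arbitrary.

```r
library(plotly)
library(htmlwidgets)

# write the interactive chart to a self-contained HTML file
saveWidget(ggplotly(stock_plot, tooltip = "text"), "stocks.html")
```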
{
"objectID": "graphics-guide.html#urbnthemes",
"href": "graphics-guide.html#urbnthemes",
- "title": "R@URBAN",
+ "title": "Urban Institute R Graphics Guide",
"section": "urbnthemes",
- "text": "urbnthemes\n\nOverview\nurbnthemes is a set of tools for creating Urban Institute-themed plots and maps in R. The package extends ggplot2 with print and map themes as well as tools that make plotting easier at the Urban Institute. urbnthemes replaces the urban_R_theme.\nAlways load library(urbnthemes) after library(ggplot2) or library(tidyverse).\n\n\nUsage\nUse set_urbn_defaults(style = \"print\") to set the default styles. scatter_grid(), remove_ticks(), add_axis(), and remove_axis() can all be used to improve graphics.\n\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nggplot(data = mtcars, mapping = aes(factor(cyl))) +\n geom_bar() + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Number of Cylinders\",\n y = \"Count\") +\n remove_ticks()\n\n\n\n\n\n\nCombining elements\nlibrary(urbnthemes) contains functions for combining plot elements into graphics. urbn_plot() brings all of the elements together.\n\nurbn_logo_text()\nremove_ticks()\nremove_axis()\nscatter_grid()\nadd_axis()\nurbn_geofacet\n\n\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nplot <- ggplot(data = mtcars, mapping = aes(factor(cyl))) +\n geom_bar() + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Number of Cylinders\",\n y = \"Count\") +\n remove_ticks()\n\nurbn_plot(plot, urbn_logo_text(), ncol = 1, heights = c(30, 1))\n\n\n\n\nSometimes it’s important to horizontally add the y-axis title above the plot. urbn_y_title() can be sued for this task. The following example goes one step further and adds the title between the legend and the plot.\n\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults()\n\nplot <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = factor(cyl))) +\n geom_point() + \n scale_x_continuous(expand = c(0, 0),\n limits = c(0, 8)) +\n scale_y_continuous(expand = c(0, 0),\n limits = c(0, 40)) +\n remove_ticks() +\n labs(\"\") +\n scatter_grid()\n\nurbn_plot(get_legend(plot),\n urbn_y_title(\"Miles per gallon\"),\n remove_legend(plot), \n urbn_logo_text(), \n ncol = 1, \n heights = c(3, 1, 30, 1))\n\n\n\n\n\n\nPalettes\nurbnthemes contains many quick-access color palettes from the Urban Institute Data Visualization Style Guide. These palettes can be used to quickly overwrite default color palettes from urbnthemes.\n\npalette_urbn_main is the eight color discrete palette of the Urban Institute with cyan, yellow, black, gray, magenta, green, space gray, and red.\npalette_urbn_diverging is an eight color diverging palette.\npalette_urbn_quintile is a five color blue palette that is good for quintiles.\npalette_urbn_politics is a two color palette with blue for Democrats and red for Republicans.\n\nThere are seven palettes that are continuous palettes of the seven unique colors in the discrete Urban Institute color palette:\n\npalette_urbn_cyan\npalette_urbn_gray\npalette_urbn_yellow\npalette_urbn_magenta\npalette_urbn_green\npalette_urbn_spacegray\npalette_urbn_red\n\nUse view_palette() to see the palette:\n\nview_palette(palette_urbn_magenta)\n\n[1] \"c(#351123, #761548, #af1f6b, #e90989, #e54096, #e46aa7, #eb99c2, #f5cbdf)\"\n\n\n\n\n\nThe vectors can be subset using base R syntax. 
This allows for the quick selection of specific colors from a palette.\n\npalette_urbn_main[1:4]\n\n cyan yellow black gray \n\"#1696d2\" \"#fdbf11\" \"#000000\" \"#d2d2d2\" \n\n\n\npalette_urbn_spacegray[1:5]\n\n[1] \"#d5d5d4\" \"#adabac\" \"#848081\" \"#5c5859\" \"#332d2f\"\n\n\n\n\nUtility functions\nlibrary(urbnthemes) contains four functions that are helpful with managing font instalations:\n\nlato_test()\nlato_install()\nfontawesome_test()\nfontawesome_install()"
+ "text": "urbnthemes\n\nOverview\nurbnthemes is a set of tools for creating Urban Institute-themed plots and maps in R. The package extends ggplot2 with print and map themes as well as tools that make plotting easier at the Urban Institute. urbnthemes replaces the urban_R_theme.\nAlways load library(urbnthemes) after library(ggplot2) or library(tidyverse).\n\n\nUsage\nUse set_urbn_defaults(style = \"print\") to set the default styles. scatter_grid(), remove_ticks(), add_axis(), and remove_axis() can all be used to improve graphics.\n\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nggplot(data = mtcars, mapping = aes(factor(cyl))) +\n geom_bar() + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Number of Cylinders\",\n y = \"Count\") +\n remove_ticks()\n\n\n\n\n\n\nCombining elements\nlibrary(urbnthemes) contains functions for combining plot elements into graphics. urbn_plot() brings all of the elements together.\n\nurbn_logo_text()\nremove_ticks()\nremove_axis()\nscatter_grid()\nadd_axis()\nurbn_geofacet\n\n\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nplot <- ggplot(data = mtcars, mapping = aes(factor(cyl))) +\n geom_bar() + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Number of Cylinders\",\n y = \"Count\") +\n remove_ticks()\n\nurbn_plot(plot, urbn_logo_text(), ncol = 1, heights = c(30, 1))\n\n\n\n\nSometimes it’s important to horizontally add the y-axis title above the plot. urbn_y_title() can be sued for this task. The following example goes one step further and adds the title between the legend and the plot.\n\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults()\n\nplot <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = factor(cyl))) +\n geom_point() + \n scale_x_continuous(expand = c(0, 0),\n limits = c(0, 8)) +\n scale_y_continuous(expand = c(0, 0),\n limits = c(0, 40)) +\n remove_ticks() +\n labs(\"\") +\n scatter_grid()\n\nurbn_plot(get_legend(plot),\n urbn_y_title(\"Miles per gallon\"),\n remove_legend(plot), \n urbn_logo_text(), \n ncol = 1, \n heights = c(3, 1, 30, 1))\n\n\n\n\n\n\nPalettes\nurbnthemes contains many quick-access color palettes from the Urban Institute Data Visualization Style Guide. These palettes can be used to quickly overwrite default color palettes from urbnthemes.\n\npalette_urbn_main is the eight color discrete palette of the Urban Institute with cyan, yellow, black, gray, magenta, green, space gray, and red.\npalette_urbn_diverging is an eight color diverging palette.\npalette_urbn_quintile is a five color blue palette that is good for quintiles.\npalette_urbn_politics is a two color palette with blue for Democrats and red for Republicans.\n\nThere are seven palettes that are continuous palettes of the seven unique colors in the discrete Urban Institute color palette:\n\npalette_urbn_cyan\npalette_urbn_gray\npalette_urbn_yellow\npalette_urbn_magenta\npalette_urbn_green\npalette_urbn_spacegray\npalette_urbn_red\n\nUse view_palette() to see the palette:\n\nview_palette(palette_urbn_magenta)\n\n[1] \"c(#351123, #761548, #af1f6b, #e90989, #e54096, #e46aa7, #eb99c2, #f5cbdf)\"\n\n\n\n\n\nThe vectors can be subset using base R syntax. 
This allows for the quick selection of specific colors from a palette.\n\npalette_urbn_main[1:4]\n\n cyan yellow black gray \n\"#1696d2\" \"#fdbf11\" \"#000000\" \"#d2d2d2\" \n\n\n\npalette_urbn_spacegray[1:5]\n\n[1] \"#d5d5d4\" \"#adabac\" \"#848081\" \"#5c5859\" \"#332d2f\"\n\n\n\n\nUtility functions\nlibrary(urbnthemes) contains four functions that are helpful with managing font installations:\n\nlato_test()\nlato_install()\nfontawesome_test()\nfontawesome_install()"
},
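The subsetted palette vectors can be passed straight to manual scales. A minimal sketch, assuming set_urbn_defaults() has already been called as above; unname() strips the color names so they aren't matched against factor levels.

```r
library(ggplot2)
library(urbnthemes)

ggplot(mtcars, aes(x = factor(cyl), fill = factor(cyl))) +
  geom_bar() +
  # use the first three Urban palette colors for the three cyl levels
  scale_fill_manual(values = unname(palette_urbn_main[1:3]))
```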
{
"objectID": "graphics-guide.html#bibliography-and-session-information",
"href": "graphics-guide.html#bibliography-and-session-information",
- "title": "R@URBAN",
+ "title": "Urban Institute R Graphics Guide",
"section": "Bibliography and Session Information",
- "text": "Bibliography and Session Information\n\nNote: Examples present in this document by Aaron Williams were created during personal time.\nBob Rudis and Dave Gandy (2017). waffle: Create Waffle Chart Visualizations in R. R package version 0.7.0. https://CRAN.R-project.org/package=waffle\nChester Ismay and Jennifer Chunn (2017). fivethirtyeight: Data and Code Behind the Stories and Interactives at ‘FiveThirtyEight’. R package version 0.3.0. https://CRAN.R-project.org/package=fivethirtyeight\nHadley Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2009.\nHadley Wickham (2017). tidyverse: Easily Install and Load the ‘Tidyverse’. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\nHadley Wickham (2017). forcats: Tools for Working with Categorical Variables (Factors). R package version 0.2.0. https://CRAN.R-project.org/package=forcats\nJennifer Bryan (2017). gapminder: Data from Gapminder. R package version 0.3.0. https://CRAN.R-project.org/package=gapminder\nKamil Slowikowski (2017). ggrepel: Repulsive Text and Label Geoms for ‘ggplot2’. R package version 0.7.0. https://CRAN.R-project.org/package=ggrepel\nMax Kuhn (2017). AmesHousing: The Ames Iowa Housing Data. R package version 0.0.3. https://CRAN.R-project.org/package=AmesHousing\nPeter Kampstra (2008). Beanplot: A Boxplot Alternative for Visual Comparison of Distributions, Journal of Statistical Software, 2008. https://www.jstatsoft.org/article/view/v028c01\nR Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.\nWinston Chang, (2014). extrafont: Tools for using fonts. R package version 0.17. https://CRAN.R-project.org/package=extrafont\nYihui Xie (2018). knitr: A General-Purpose Package for Dynamic Report Generation in R. 
R package version 1.19.\n\nsessionInfo()\n\nR version 4.2.2 (2022-10-31)\nPlatform: aarch64-apple-darwin20 (64-bit)\nRunning under: macOS Monterey 12.5.1\n\nMatrix products: default\nBLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib\nLAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib\n\nlocale:\n[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8\n\nattached base packages:\n[1] stats graphics grDevices datasets utils methods base \n\nother attached packages:\n [1] AmesHousing_0.0.4 gghighlight_0.4.0 fivethirtyeight_0.6.2\n [4] ggsankey_0.0.99999 ggridges_0.5.4 ggbeeswarm_0.7.1 \n [7] ggrepel_0.9.2 gapminder_0.3.0 urbnthemes_0.0.2 \n[10] forcats_0.5.2 stringr_1.4.1 dplyr_1.0.10 \n[13] purrr_0.3.5 readr_2.1.3 tidyr_1.2.1 \n[16] tibble_3.1.8 ggplot2_3.4.0 tidyverse_1.3.2 \n[19] knitr_1.40 \n\nloaded via a namespace (and not attached):\n [1] nlme_3.1-160 fs_1.5.2 lubridate_1.9.0 \n [4] bit64_4.0.5 httr_1.4.4 tools_4.2.2 \n [7] backports_1.4.1 utf8_1.2.2 R6_2.5.1 \n[10] vipor_0.4.5 DBI_1.1.3 mgcv_1.8-41 \n[13] colorspace_2.0-3 withr_2.5.0 tidyselect_1.2.0 \n[16] gridExtra_2.3 bit_4.0.5 curl_4.3.3 \n[19] compiler_4.2.2 extrafontdb_1.0 cli_3.4.1 \n[22] rvest_1.0.3 xml2_1.3.3 labeling_0.4.2 \n[25] scales_1.2.1 hexbin_1.28.2 digest_0.6.30 \n[28] rmarkdown_2.18 pkgconfig_2.0.3 htmltools_0.5.4 \n[31] extrafont_0.18 dbplyr_2.2.1 fastmap_1.1.0 \n[34] htmlwidgets_1.6.1 rlang_1.0.6 readxl_1.4.1 \n[37] rstudioapi_0.14 farver_2.1.1 generics_0.1.3 \n[40] jsonlite_1.8.3 vroom_1.6.0 googlesheets4_1.0.1\n[43] magrittr_2.0.3 Matrix_1.5-1 Rcpp_1.0.9 \n[46] munsell_0.5.0 fansi_1.0.3 lifecycle_1.0.3 \n[49] stringi_1.7.8 yaml_2.3.6 grid_4.2.2 \n[52] parallel_4.2.2 crayon_1.5.2 lattice_0.20-45 \n[55] splines_4.2.2 haven_2.5.1 hms_1.1.2 \n[58] pillar_1.8.1 urbnmapr_0.0.0.9002 reprex_2.0.2 \n[61] glue_1.6.2 evaluate_0.18 remotes_2.4.2 \n[64] renv_0.16.0 modelr_0.1.10 vctrs_0.5.1 \n[67] tzdb_0.3.0 Rttf2pt1_1.3.11 cellranger_1.1.0 \n[70] gtable_0.3.1 assertthat_0.2.1 xfun_0.34 \n[73] broom_1.0.1 googledrive_2.0.0 gargle_1.2.1 \n[76] beeswarm_0.4.0 timechange_0.1.1 ellipsis_0.3.2"
+ "text": "Bibliography and Session Information\n\nNote: Examples present in this document by Aaron Williams were created during personal time.\nBob Rudis and Dave Gandy (2017). waffle: Create Waffle Chart Visualizations in R. R package version 0.7.0. https://CRAN.R-project.org/package=waffle\nChester Ismay and Jennifer Chunn (2017). fivethirtyeight: Data and Code Behind the Stories and Interactives at ‘FiveThirtyEight’. R package version 0.3.0. https://CRAN.R-project.org/package=fivethirtyeight\nHadley Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2009.\nHadley Wickham (2017). tidyverse: Easily Install and Load the ‘Tidyverse’. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\nHadley Wickham (2017). forcats: Tools for Working with Categorical Variables (Factors). R package version 0.2.0. https://CRAN.R-project.org/package=forcats\nJennifer Bryan (2017). gapminder: Data from Gapminder. R package version 0.3.0. https://CRAN.R-project.org/package=gapminder\nKamil Slowikowski (2017). ggrepel: Repulsive Text and Label Geoms for ‘ggplot2’. R package version 0.7.0. https://CRAN.R-project.org/package=ggrepel\nMax Kuhn (2017). AmesHousing: The Ames Iowa Housing Data. R package version 0.0.3. https://CRAN.R-project.org/package=AmesHousing\nPeter Kampstra (2008). Beanplot: A Boxplot Alternative for Visual Comparison of Distributions, Journal of Statistical Software, 2008. https://www.jstatsoft.org/article/view/v028c01\nR Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.\nWinston Chang, (2014). extrafont: Tools for using fonts. R package version 0.17. https://CRAN.R-project.org/package=extrafont\nYihui Xie (2018). knitr: A General-Purpose Package for Dynamic Report Generation in R. 
R package version 1.19.\n\nsessionInfo()\n\nR version 4.2.2 (2022-10-31)\nPlatform: aarch64-apple-darwin20 (64-bit)\nRunning under: macOS Monterey 12.5.1\n\nMatrix products: default\nBLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib\nLAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib\n\nlocale:\n[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8\n\nattached base packages:\n[1] stats graphics grDevices datasets utils methods base \n\nother attached packages:\n [1] plotly_4.10.4 AmesHousing_0.0.4 gghighlight_0.4.0 \n [4] fivethirtyeight_0.6.2 ggsankey_0.0.99999 ggridges_0.5.4 \n [7] ggbeeswarm_0.7.1 ggrepel_0.9.2 gapminder_0.3.0 \n[10] urbnthemes_0.0.2 forcats_1.0.0 stringr_1.5.1 \n[13] dplyr_1.1.4 purrr_1.0.2 readr_2.1.3 \n[16] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.0 \n[19] tidyverse_1.3.2 knitr_1.40 \n\nloaded via a namespace (and not attached):\n [1] nlme_3.1-160 fs_1.5.2 lubridate_1.9.0 \n [4] bit64_4.0.5 httr_1.4.4 tools_4.2.2 \n [7] backports_1.4.1 utf8_1.2.4 R6_2.5.1 \n[10] vipor_0.4.5 lazyeval_0.2.2 DBI_1.1.3 \n[13] mgcv_1.8-41 colorspace_2.1-0 withr_3.0.0 \n[16] tidyselect_1.2.1 gridExtra_2.3 bit_4.0.5 \n[19] curl_4.3.3 compiler_4.2.2 extrafontdb_1.0 \n[22] cli_3.6.2 rvest_1.0.3 xml2_1.3.3 \n[25] labeling_0.4.3 scales_1.3.0 hexbin_1.28.2 \n[28] digest_0.6.30 rmarkdown_2.18 pkgconfig_2.0.3 \n[31] htmltools_0.5.4 extrafont_0.18 dbplyr_2.2.1 \n[34] fastmap_1.1.0 htmlwidgets_1.6.1 rlang_1.1.3 \n[37] readxl_1.4.1 rstudioapi_0.14 farver_2.1.1 \n[40] generics_0.1.3 jsonlite_1.8.3 crosstalk_1.2.0 \n[43] vroom_1.6.0 googlesheets4_1.0.1 magrittr_2.0.3 \n[46] Matrix_1.5-1 Rcpp_1.0.9 munsell_0.5.1 \n[49] fansi_1.0.6 lifecycle_1.0.4 stringi_1.8.3 \n[52] yaml_2.3.6 grid_4.2.2 parallel_4.2.2 \n[55] crayon_1.5.2 lattice_0.20-45 splines_4.2.2 \n[58] haven_2.5.1 hms_1.1.2 pillar_1.9.0 \n[61] urbnmapr_0.0.0.9002 reprex_2.0.2 glue_1.7.0 \n[64] evaluate_0.18 data.table_1.14.4 remotes_2.4.2 \n[67] renv_0.16.0 modelr_0.1.10 vctrs_0.6.5 \n[70] tzdb_0.3.0 Rttf2pt1_1.3.11 cellranger_1.1.0 \n[73] gtable_0.3.4 assertthat_0.2.1 xfun_0.34 \n[76] broom_1.0.1 viridisLite_0.4.2 googledrive_2.0.0 \n[79] gargle_1.2.1 beeswarm_0.4.0 timechange_0.1.1 \n[82] ellipsis_0.3.2"
},
{
- "objectID": "resources.html",
- "href": "resources.html",
- "title": "R@URBAN",
- "section": "",
- "text": "Free Books\n\nIntro\n\nR for Data Science by Garrett Grolemund and Hadley Wickham\n\n\n\nData Viz\n\nggplot2: Elegant Graphics for Data Analysis by Hadley Wickham\nData Visualization - A practical introduction by Kieran Healy\n\n\n\n*down\n\nR Markdown: The Definitive Guide by Yihui Xie, J. J. Allaire, and Garrett Grolemund\nblogdown: Creating Websites with R Markdown by Yihui Xie, Amber Thomas, and Alison Presmanes Hill\nbookdown: Authoring Books and Technical Documents with R Markdown by Yihui Xie\n\n\n\nStatistics\n\nLearning Statistics with R by Danielle Navarro\nIntroduction to Econometrics with R by Christoph Hanck, Martin Arnold, Alexander Gerber and Martin Schmelzer\nAn Introduction to Bayesian Thinking by Merlise Clyde et. al.\nStatistical Inference via Data Science by Chester Ismay and Albert Y. Kim\n\n\n\nMachine Learning\n\nHands-On Machine Learning with R by Bradley Boehmke & Brandon Greenwell\nFeature Engineering and Selection: A Practical Approach for Predictive Models by Max Kuhn and Kjell Johnson\n\n\n\nMapping and Geospatial Analysis\n\nGeocomputation with R by Robin Lovelace, Jakub Nowosad, Jannes Muenchow\n\n\n\nText Analysis\n\nText Mining with R A Tidy Approach by Julia Silge and David Robinson\n\n\n\nProgramming\n\nAdvanced R by Hadley Wickham\nR Packages by Hadley Wickham\nMaster Spark with R by Javier Luraschi, Kevin Kuo, and Edgar Ruiz\nFunctional programming and unit testing for data munging with R by Bruno Rodrigues\n\n\n\n\nWebsites\n\nRStudio Essentials\nRStudio Education\nR Cheat Sheets\nAndrew Heiss’ free Data Viz Course"
+ "objectID": "graphics-guide.html#footnotes",
+ "href": "graphics-guide.html#footnotes",
+ "title": "Urban Institute R Graphics Guide",
+ "section": "Footnotes",
+ "text": "Footnotes\n\n\nWickham, H., & Stryjewski, L. (2011). 40 years of boxplots.↩︎"
},
{
"objectID": "mapping.html#geospatial-workflow",
"href": "mapping.html#geospatial-workflow",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Geospatial Workflow",
"text": "Geospatial Workflow\nThis picture below outlines what we think are the main steps in a geospatial workflow. This guide will be split into sections describing each of the steps."
},
{
"objectID": "mapping.html#should-this-be-a-map",
"href": "mapping.html#should-this-be-a-map",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Should this be a map?",
"text": "Should this be a map?\nThe Urban Institute Data Visualization Style Guide offers some blunt but useful suggestions for maps:\n\nJust because you’ve got geographic data, doesn’t mean that you have to make a map. Many times, there are more efficient storyforms that will get your point across more clearly. If your data shows a very clear geographic trend or if the absolute location of a place or event matters, maps might be the best approach, but sometimes the reflexive impulse to map the data can make you forget that showing the data in another form might answer other—and sometimes more important—questions.\n\nSo we would encourage you to think critically before making a map."
},
{
"objectID": "mapping.html#why-map-with-r",
"href": "mapping.html#why-map-with-r",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Why map with R?",
"text": "Why map with R?\nR can have a steeper learning curve than point-and-click tools - like QGIS or ArcGIS - for geospatial analysis and mapping. But creating maps in R has many advantages including:\n\nReproducibility: By creating maps with R code, you can easily share the outputs and the code that generated the output with collaborators, allowing them to replicate your work and catch errors easily.\nIteration: With point and click software like ArcGIS, making 50 maps would be 50 times the work/time. But using R, we can easily make make many iterations of the same map with a few changes to the code.\nEasy Updates: Writing code provides a roadmap for others (and future you!) to quickly update parts of the map as needed. Say for example a collaborator wanted to change the legend colors of 50 state maps. With R, this is possible in just a few seconds!\nAn Expansive ecosystem: There are several R packages that make it very easy to get spatial data, create static and interactive maps, and perform spatial analyses. This feature rich package ecosystem which all play nice together is frankly unmatched by other programming languages and even point and click tools like QGIS and ArcGIS. Some of these R packages include:\n\nsf: For managing and analyzing spatial dataframes\ntigris: For downloading in Census geographies\nggplot2: For making publication ready static maps\nurbnmapr: For automatically adding Urban styling to static maps\nmapview: For making expxploratory interactive maps\n\nCost: Most point-and-click tools for geospatial analysis are proprietary and expensive. R is free open-source software. The software and most of its packages can be used for free by anyone for almost any use case."
},
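To make the iteration advantage above concrete, here is a minimal sketch of generating one map per state from a single template. It assumes the states sf dataframe from urbnmapr; the maps/ output folder is hypothetical and must exist before running:

```r
library(tidyverse)
library(sf)
library(urbnmapr)

states <- get_urbn_map("states", sf = TRUE)

# Loop over every state abbreviation and save one map per state
walk(states$state_abbv, function(abbv) {
  state_map <- states %>%
    filter(state_abbv == abbv) %>%
    ggplot() +
    geom_sf(mapping = aes())

  # "maps/" is a hypothetical output folder; create it before running
  ggsave(filename = paste0("maps/", abbv, ".png"), plot = state_map)
})
```

Changing the template once (say, a fill color) updates every map on the next run, which is the point of the iteration argument above.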
{
"objectID": "mapping.html#helpful-learning-resources",
"href": "mapping.html#helpful-learning-resources",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Helpful Learning Resources",
"text": "Helpful Learning Resources\nIn addition to this guide, you may want to look at these other helpful resources:\n\nThe Urban Institute mapping training series (with video lectures and notes)\nChapters 5, 6, and 7 from Kyle Walker’s Analyzing US Census Data book.\nAndrew Heiss’ fantastic mapping guide\nAll of the vignettes for the sf package\nGeocomputation with R: A book by Robin Lovelace and others\nUChicago’s R Spatial Workshops: https://spatialanalysis.github.io/tutorials/"
},
{
"objectID": "mapping.html#librarysf",
"href": "mapping.html#librarysf",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "library(sf)",
- "text": "library(sf)\n\nThe short version\nlibrary(sf) stores geospatial data, which are points (a single longitude/latitude), lines (a pair of connected points), or polygons (a collection of points which make a polygon) in a geometry column within R dataframes\n\nThis is what sf dataframe looks like in the console:\n\ndc_parks <- st_read(\"mapping/data/dc_parks.geojson\", \n quiet = TRUE)\n\n# Print just the NAME and geometry column\ndc_parks %>%\n select(NAME) %>%\n head(2)\n\nSimple feature collection with 2 features and 1 field\nGeometry type: MULTIPOLYGON\nDimension: XY\nBounding box: xmin: -77.01063 ymin: 38.81718 xmax: -76.9625 ymax: 38.89723\nGeodetic CRS: WGS 84\n NAME geometry\n1 Kingman and Heritage Islands MULTIPOLYGON (((-76.96566 3...\n2 Bald Eagle Hill MULTIPOLYGON (((-77.01063 3...\n\n\n\n\nThe long version\nThe sf library is a key tool for reading in, managing, and working with spatial data in R. sf stands for simple features (not San Francisco you Bay Area folks) and denotes a way to describe the spatial attributes of real life objects. The R object you will be working with most frequently for mapping is an sf dataframe. An sf dataframe is essentially a regular R dataframe, with a couple of extra features for use in mapping. These extra features exclusive to sf dataframes include:\n\nsticky geometry columns\nattached coordinate reference systems\nsome other spatial metadata\n\nThe most important of the above list is the sticky geometry column, which is a magical column that contains all of the geographic information for each row of data. Say for example you had a sf dataframe of all DC census tracts. Then the geometry column would contain all of the geographic points used to define DC census tract polygons. The stickiness of this column means that no matter what data munging/filtering you do, you will not be able to drop or delete the geometry column. 
Below is a graphic to help you understand this:\n\ncredits: @allisonhorst\nThis is what an sf dataframe looks like in the console:\n\n# Read in spatial data about DC parks from DC Open Data Portal\ndc_parks <- st_read(\"https://opendata.arcgis.com/api/v3/datasets/287eaa2ecbff4d699762bbc6795ffdca_9/downloads/data?format=geojson&spatialRefId=4326\",\n quiet = TRUE)\n\n# dc_parks <- st_read(\"mapping/data/dc_parks.geojson\")\n\n# Select just a few columns for readability\ndc_parks <- dc_parks %>%\n select(NAME, geometry)\n\n# Print to the console\ndc_parks\n\nSimple feature collection with 256 features and 1 field\nGeometry type: MULTIPOLYGON\nDimension: XY\nBounding box: xmin: -77.11113 ymin: 38.81718 xmax: -76.91108 ymax: 38.98811\nGeodetic CRS: WGS 84\nFirst 10 features:\n NAME geometry\n1 Plymouth Circle MULTIPOLYGON (((-77.04677 3...\n2 Triangle Park RES 0566 MULTIPOLYGON (((-77.04481 3...\n3 Shepherd Field MULTIPOLYGON (((-77.03528 3...\n4 Marvin Caplan Memorial Park MULTIPOLYGON (((-77.03027 3...\n5 Pinehurst Circle MULTIPOLYGON (((-77.06643 3...\n6 Triangle Park 3278 0801 MULTIPOLYGON (((-77.01759 3...\n7 Fort Stevens MULTIPOLYGON (((-77.02988 3...\n8 Takoma Recreation Center MULTIPOLYGON (((-77.01794 3...\n9 Takoma Community Center MULTIPOLYGON (((-77.01716 3...\n10 Triangle Park RES 0648 MULTIPOLYGON (((-77.03362 3...\n\n\nNote that there is some spatial metadata such as the Geometry Type, Bounding Box, and CRS which shows up as a header before the actual contents of the dataframe.\nSince sf dataframes operate similarly to regular dataframes, we can use all our familiar tidyverse functions for data wrangling, including select, filter, rename, mutate, group_by and summarize. The sf package also has many functions that provide easy ways to replicate common tasks done in other GIS software like spatial joins, clipping, and buffering. Almost all of the mapping and geospatial analysis methods described in this guide rely on you having an sf dataframe. So let’s talk about how to get one!"
+ "text": "library(sf)\n\nThe short version\nlibrary(sf) stores geospatial data, which are points (a single longitude/latitude), lines (a pair of connected points), or polygons (a collection of points which make a polygon) in a geometry column within R dataframes\n\nThis is what sf dataframe looks like in the console:\n\ndc_parks <- st_read(\"mapping/data/dc_parks.geojson\", \n quiet = TRUE)\n\n# Print just the NAME and geometry column\ndc_parks %>%\n select(NAME) %>%\n head(2)\n\nSimple feature collection with 2 features and 1 field\nGeometry type: MULTIPOLYGON\nDimension: XY\nBounding box: xmin: -77.01063 ymin: 38.81718 xmax: -76.9625 ymax: 38.89723\nGeodetic CRS: WGS 84\n NAME geometry\n1 Kingman and Heritage Islands MULTIPOLYGON (((-76.96566 3...\n2 Bald Eagle Hill MULTIPOLYGON (((-77.01063 3...\n\n\n\n\nThe long version\nThe sf library is a key tool for reading in, managing, and working with spatial data in R. sf stands for simple features (not San Francisco you Bay Area folks) and denotes a way to describe the spatial attributes of real life objects. The R object you will be working with most frequently for mapping is an sf dataframe. An sf dataframe is essentially a regular R dataframe, with a couple of extra features for use in mapping. These extra features exclusive to sf dataframes include:\n\nsticky geometry columns\nattached coordinate reference systems\nsome other spatial metadata\n\nThe most important of the above list is the sticky geometry column, which is a magical column that contains all of the geographic information for each row of data. Say for example you had a sf dataframe of all DC census tracts. Then the geometry column would contain all of the geographic points used to define DC census tract polygons. The stickiness of this column means that no matter what data munging/filtering you do, you will not be able to drop or delete the geometry column. 
Below is a graphic to help you understand this:\n\ncredits: @allisonhorst\nThis is what an sf dataframe looks like in the console:\n\n# Read in spatial data about DC parks from DC Open Data Portal\ndc_parks <- st_read(\"https://opendata.arcgis.com/api/v3/datasets/287eaa2ecbff4d699762bbc6795ffdca_9/downloads/data?format=geojson&spatialRefId=4326\",\n quiet = TRUE)\n\n# dc_parks <- st_read(\"mapping/data/dc_parks.geojson\")\n\n# Select just a few columns for readability\ndc_parks <- dc_parks %>%\n select(NAME, geometry)\n\n# Print to the console\ndc_parks\n\nSimple feature collection with 256 features and 1 field\nGeometry type: MULTIPOLYGON\nDimension: XY\nBounding box: xmin: -77.11113 ymin: 38.81718 xmax: -76.91108 ymax: 38.98811\nGeodetic CRS: WGS 84\nFirst 10 features:\n NAME geometry\n1 Plymouth Circle MULTIPOLYGON (((-77.04677 3...\n2 Triangle Park RES 0566 MULTIPOLYGON (((-77.04481 3...\n3 Shepherd Field MULTIPOLYGON (((-77.03528 3...\n4 Marvin Caplan Memorial Park MULTIPOLYGON (((-77.03027 3...\n5 Pinehurst Circle MULTIPOLYGON (((-77.06643 3...\n6 Triangle Park 3278 0801 MULTIPOLYGON (((-77.01759 3...\n7 Fort Stevens MULTIPOLYGON (((-77.02988 3...\n8 Takoma Recreation Center MULTIPOLYGON (((-77.01794 3...\n9 Takoma Community Center MULTIPOLYGON (((-77.01716 3...\n10 Triangle Park RES 0648 MULTIPOLYGON (((-77.03362 3...\n\n\nNote that there is some spatial metadata such as the Geometry Type, Bounding Box, and CRS which shows up as a header before the actual contents of the dataframe.\nSince sf dataframes operate similarly to regular dataframes, we can use all our familiar tidyverse functions for data wrangling, including select, filter, rename, mutate, group_by and summarize. The sf package also has many functions that provide easy ways to replicate common tasks done in other GIS software like spatial joins, clipping, and buffering. Almost all of the mapping and geospatial analysis methods described in this guide rely on you having an sf dataframe. So let’s talk about how to get one!"
},
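To see the stickiness for yourself, try to deselect the geometry column; st_drop_geometry() is the explicit escape hatch when you truly want a plain dataframe. A minimal sketch, reusing the dc_parks sf dataframe from the entry above:

```r
library(sf)
library(dplyr)

# select() keeps the sticky geometry column even when we try to deselect it
dc_parks %>%
  select(-geometry) %>%
  head(2)

# st_drop_geometry() is the explicit way to get back a regular dataframe
dc_parks %>%
  st_drop_geometry() %>%
  head(2)
```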
{
"objectID": "mapping.html#importing-spatial-data",
"href": "mapping.html#importing-spatial-data",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Importing spatial data",
- "text": "Importing spatial data\nGetting an sf dataframe is always the first step in the geospatial workflow. Here’s how to import spatial data for…\n\nStates and counties\nWe highly recommend using the library(urbnmapr) package, which was created by folks here at Urban to easily create state and county level maps. The get_urbn_map() function in the package allows you to read in spatial data on states and counties, with options to include territories. Importantly, it will also display AL and HI as insets on the map in accordance with the Urban Institute Data Visualization Style Guide. For information on how to install urbnmapr, see the GitHub repository.\nBelow is an example of how you would use urbnmapr to get an sf dataframe of all the states or counties in the US.\n\nlibrary(urbnmapr)\n\n# Get state data\nstates <- get_urbn_map(\"states\", sf = TRUE)\n\n# Can also get county data\ncounties <- get_urbn_map(\"counties\", sf = TRUE)\n\n\n\nOther Census geographies\nUse the library(tigris) package, which allows you to easily download TIGER and other cartographic boundaries from the US Census Bureau. In order to automatically load in the boundaries as sf objects, run once per R session.\nlibrary(tigris) has all the standard census geographies, including census tracts, counties, CBSAs, ZCTAs, congressional districts, tribal areas, and more. It also includes other elements such as water, roads, and military bases.\nBy default, libraray(tigris) will download large very large and detailed TIGER line boundary files. For thematic mapping, the smaller cartographic boundary files are a better choice, as they are clipped to the shoreline, generalized, and therefore usually smaller in size without losing too much accuracy. To load cartographic boundaries, use the cb = TRUE argument. If you are doing detailed geospatial analysis and need the most detailed shapefiles, then you should use the detailed TIGER line boundary files and set cb = FALSE.\nBelow is an example of how you would use library(tigris) to get a sf dataframe of all Census tracts in DC for 2019.\n\nlibrary(tigris)\n\n# Only need to set once per script\noptions(tigris_class = \"sf\")\n\ndc_tracts <- tracts(\n state = \"DC\",\n cb = TRUE,\n year = 2019\n)\n\nUnlike library(urbnmapr), different functions are used to get geographic data for different geographic levels. For instance, the blocks() function will load census block group data, and the tracts() function will load tract data. Other functions include block_groups(), zctas() , and core_based_statistical_areas(). For the full list of supported geographies and functions, see the package vignette.\nFor folks interested in pulling in Census demographic information along with Census geographies, we recommend checking out the sister package to library(tigris): library(tidycensus). That package allows you to download in Census variables and Census geographic data simultaneously.\n\n\nCountries\nWe recommend using the library(rnaturalearth) package, which is similar to library(tigris) but allows you to download and use boundaries beyond the US. Instead of setting class to sf one time per session as we did with library(tigris), you must set the returnclass = \"sf\" argument each time you use a function from the package. 
Below is an example of downloading in an sf dataframe of all the countries in the world.\n\nlibrary(rnaturalearth)\n\nworld <- ne_countries(returnclass = \"sf\")\n\nggplot() +\n geom_sf(data = world, mapping = aes())\n\n\n\nYour own files\n\nShapefiles/GeoJSONS\nShapefiles and GeoJSONs are 2 common spatial file formats you will found out in the wild. library(sf) has a function called st_read which allows you to easily read in these files as sf dataframes. The only required argument is dsn or data source name. This is the filepath of the .shp file or the .geojson file on your local computer. For geojsons, dsn can also be a URL.\nBelow is an example of reading in a shapefile of fire stations in DC which is stored in mapping/data/shapefiles/. Note that shapefiles are actually stored as 6+ different files inside a folder. You need to provide the filepath to the file ending in .shp.\n\nlibrary(sf)\n\n# Print out all files in the directory\nlist.files(\"mapping/data/shapefiles\")\n\n[1] \"Fire_Stations.cpg\" \"Fire_Stations.dbf\" \"Fire_Stations.prj\"\n[4] \"Fire_Stations.shp\" \"Fire_Stations.shx\" \"Fire_Stations.xml\"\n\n# Read in .shp file\ndc_firestations <- st_read(\n dsn = \"mapping/data/shapefiles/Fire_Stations.shp\",\n quiet = TRUE\n)\n\nAnd now dc_firestations is an sf dataframe you can use for all your mapping needs! st_read supports reading in a wide variety of other spatial file formats, including geodatabases, KML files, and over 200 others. For an incomplete list, please see the this sf vignette.\n\n\nCSVs or dataframes with lat/lons\nIf you have a CSV with geographic information stored in columns, you will need to read in the CSV as a regular R dataframe and then convert to an sf dataframe. library(sf) contains the st_as_sf() function for converting regular R dataframes into an sf dataframe. The two arguments you must specify for this function are:\n\ncoords: A length 2 vector with the names of the columns corresponding to longitude and latitude (in that order!). For example, c(\"lon\", \"lat\").\ncrs: The CRS (coordinate references system) for your longitude/latitude coordinates. Remember you need to specify both the\nauthority and the SRID code, for example (“EPSG:4326”). For more information on finding and setting CRS codes, please see the CRS section.\n\nBelow is an example of reading in data from a CSV and converting it to an sf dataframe.\n\nlibrary(sf)\n\n# Read in dataset of state capitals which is stored as a csv\nstate_capitals <- read_csv(\"mapping/data/state-capitals.csv\")\n\nstate_capitals <- state_capitals %>%\n # Specify names of the lon/lat columns in the CSV to use to make geometry col\n st_as_sf(\n coords = c(\"longitude\", \"latitude\"),\n crs = 4326\n )\n\nOne common mistake is that before converting to an sf dataframe, you must drop any rows that have NA values for latitude or longitude. If your data contains NA values, then the st_as_sf() function will throw an error."
+ "text": "Importing spatial data\nGetting an sf dataframe is always the first step in the geospatial workflow. Here’s how to import spatial data for…\n\nStates and counties\nWe highly recommend using the library(urbnmapr) package, which was created by folks here at Urban to easily create state and county level maps. The get_urbn_map() function in the package allows you to read in spatial data on states and counties, with options to include territories. Importantly, it will also display AL and HI as insets on the map in accordance with the Urban Institute Data Visualization Style Guide. For information on how to install urbnmapr, see the GitHub repository.\nBelow is an example of how you would use urbnmapr to get an sf dataframe of all the states or counties in the US.\n\nlibrary(urbnmapr)\n\n# Get state data\nstates <- get_urbn_map(\"states\", sf = TRUE)\n\n# Can also get county data\ncounties <- get_urbn_map(\"counties\", sf = TRUE)\n\n\n\nOther Census geographies\nUse the library(tigris) package, which allows you to easily download TIGER and other cartographic boundaries from the US Census Bureau. In order to automatically load in the boundaries as sf objects, run once per R session.\nlibrary(tigris) has all the standard census geographies, including census tracts, counties, CBSAs, ZCTAs, congressional districts, tribal areas, and more. It also includes other elements such as water, roads, and military bases.\nBy default, libraray(tigris) will download large very large and detailed TIGER line boundary files. For thematic mapping, the smaller cartographic boundary files are a better choice, as they are clipped to the shoreline, generalized, and therefore usually smaller in size without losing too much accuracy. To load cartographic boundaries, use the cb = TRUE argument. If you are doing detailed geospatial analysis and need the most detailed shapefiles, then you should use the detailed TIGER line boundary files and set cb = FALSE.\nBelow is an example of how you would use library(tigris) to get a sf dataframe of all Census tracts in DC for 2019.\n\nlibrary(tigris)\n\n# Only need to set once per script\noptions(tigris_class = \"sf\")\n\ndc_tracts <- tracts(\n state = \"DC\",\n cb = TRUE,\n year = 2019\n)\n\nUnlike library(urbnmapr), different functions are used to get geographic data for different geographic levels. For instance, the blocks() function will load census block group data, and the tracts() function will load tract data. Other functions include block_groups(), zctas() , and core_based_statistical_areas(). For the full list of supported geographies and functions, see the package vignette.\nFor folks interested in pulling in Census demographic information along with Census geographies, we recommend checking out the sister package to library(tigris): library(tidycensus). That package allows you to download in Census variables and Census geographic data simultaneously.\n\n\nCountries\nWe recommend using the library(rnaturalearth) package, which is similar to library(tigris) but allows you to download and use boundaries beyond the US. Instead of setting class to sf one time per session as we did with library(tigris), you must set the returnclass = \"sf\" argument each time you use a function from the package. 
Below is an example of downloading in an sf dataframe of all the countries in the world.\n\nlibrary(rnaturalearth)\n\nworld <- ne_countries(returnclass = \"sf\")\n\nggplot() +\n geom_sf(data = world, mapping = aes())\n\n\n\nYour own files\n\nShapefiles/GeoJSONS\nShapefiles and GeoJSONs are 2 common spatial file formats you will found out in the wild. library(sf) has a function called st_read which allows you to easily read in these files as sf dataframes. The only required argument is dsn or data source name. This is the filepath of the .shp file or the .geojson file on your local computer. For geojsons, dsn can also be a URL.\nBelow is an example of reading in a shapefile of fire stations in DC which is stored in mapping/data/shapefiles/. Note that shapefiles are actually stored as 6+ different files inside a folder. You need to provide the filepath to the file ending in .shp.\n\nlibrary(sf)\n\n# Print out all files in the directory\nlist.files(\"mapping/data/shapefiles\")\n\n[1] \"Fire_Stations.cpg\" \"Fire_Stations.dbf\" \"Fire_Stations.prj\"\n[4] \"Fire_Stations.shp\" \"Fire_Stations.shx\" \"Fire_Stations.xml\"\n\n# Read in .shp file\ndc_firestations <- st_read(\n dsn = \"mapping/data/shapefiles/Fire_Stations.shp\",\n quiet = TRUE\n)\n\nAnd now dc_firestations is an sf dataframe you can use for all your mapping needs! st_read supports reading in a wide variety of other spatial file formats, including geodatabases, KML files, and over 200 others. For an incomplete list, please see the this sf vignette.\n\n\nCSVs or dataframes with lat/lons\nIf you have a CSV with geographic information stored in columns, you will need to read in the CSV as a regular R dataframe and then convert to an sf dataframe. library(sf) contains the st_as_sf() function for converting regular R dataframes into an sf dataframe. The two arguments you must specify for this function are:\n\ncoords: A length 2 vector with the names of the columns corresponding to longitude and latitude (in that order!). For example, c(\"lon\", \"lat\").\ncrs: The CRS (coordinate references system) for your longitude/latitude coordinates. Remember you need to specify both the\nauthority and the SRID code, for example (“EPSG:4326”). For more information on finding and setting CRS codes, please see the CRS section.\n\nBelow is an example of reading in data from a CSV and converting it to an sf dataframe.\n\nlibrary(sf)\n\n# Read in dataset of state capitals which is stored as a csv\nstate_capitals <- read_csv(\"mapping/data/state-capitals.csv\")\n\nstate_capitals <- state_capitals %>%\n # Specify names of the lon/lat columns in the CSV to use to make geometry col\n st_as_sf(\n coords = c(\"longitude\", \"latitude\"),\n crs = 4326\n )\n\nOne common mistake is that before converting to an sf dataframe, you must drop any rows that have NA values for latitude or longitude. If your data contains NA values, then the st_as_sf() function will throw an error."
},
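Following up on the NA caveat that closes the entry above, here is a minimal sketch of dropping rows with missing coordinates before converting, reusing the state capitals CSV from that example (its longitude/latitude column names are assumed):

```r
library(sf)
library(readr)
library(dplyr)

state_capitals <- read_csv("mapping/data/state-capitals.csv") %>%
  # Drop rows with missing coordinates first; st_as_sf() errors on NAs
  filter(!is.na(longitude), !is.na(latitude)) %>%
  st_as_sf(
    coords = c("longitude", "latitude"),
    crs = 4326
  )
```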
{
"objectID": "mapping.html#appending-spatial-info-to-your-data",
"href": "mapping.html#appending-spatial-info-to-your-data",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Appending spatial info to your data",
- "text": "Appending spatial info to your data\nOftentimes, the data you are working with will just have state or county identifiers - like FIPS codes or state abbreviations - but will not contain any geographic information. In this case, you must do the extra work of downloading in the geographic data as an sf dataframe and then joining your non-spatial data to the spatial data. Generally this involves 3 steps:\n\nReading in your own data as a data frame\nReading in the geographic data as an sf dataframe\nUsing left_join to merge the geographic data with your own non spatial data and create a new expanded sf dataframe\n\nLet’s say we had a dataframe on CHIP enrollment by state with state abbreviations.\n\n# read the state CHIP data\nchip_by_state <- read_csv(\"mapping/data/chip-enrollment.csv\") %>%\n # clean column names so there are no random spaces/uppercase letters\n janitor::clean_names()\n\n# print to the console\nchip_by_state %>% head()\n\n# A tibble: 6 × 3\n state chip_enrollment state_abbreviation\n \n1 Alabama 150040 AL \n2 Alaska 15662 AK \n3 Arizona 88224 AZ \n4 Arkansas 120863 AR \n5 California 2022213 CA \n6 Colorado 167227 CO \n\n\nIn order to convert this to an sf dataframe, we need to read in the spatial boundaries for each state and append it to our dataframe. Here is how we do that with get_urbn_map() and left_join() .\n\nlibrary(urbnmapr)\n\n# read in state geographic data from urbnmapr\nstates <- get_urbn_map(map = \"states\", sf = TRUE)\n\n# left join state geographies to chip data\nchip_with_geographies <- states %>%\n left_join(\n chip_by_state,\n # Specify join column, which are slightly differently named in states and chip\n # respectively\n by = c(\"state_abbv\" = \"state_abbreviation\")\n )\n\nchip_with_geographies %>%\n select(state_fips, state_abbv, chip_enrollment)\n\nSimple feature collection with 51 features and 3 fields\nGeometry type: MULTIPOLYGON\nDimension: XY\nBounding box: xmin: -2600000 ymin: -2363000 xmax: 2516374 ymax: 732352.2\nProjected CRS: NAD27 / US National Atlas Equal Area\nFirst 10 features:\n state_fips state_abbv chip_enrollment geometry\n1 01 AL 150040 MULTIPOLYGON (((1150023 -15...\n2 04 AZ 88224 MULTIPOLYGON (((-1386136 -1...\n3 08 CO 167227 MULTIPOLYGON (((-786661.9 -...\n4 09 CT 25551 MULTIPOLYGON (((2156197 -83...\n5 12 FL 374884 MULTIPOLYGON (((1953691 -20...\n6 13 GA 232050 MULTIPOLYGON (((1308636 -10...\n7 16 ID 35964 MULTIPOLYGON (((-1357097 78...\n8 18 IN 114927 MULTIPOLYGON (((1042064 -71...\n9 20 KS 79319 MULTIPOLYGON (((-174904.2 -...\n10 22 LA 161565 MULTIPOLYGON (((1075669 -15..."
+ "text": "Appending spatial info to your data\nOftentimes, the data you are working with will just have state or county identifiers - like FIPS codes or state abbreviations - but will not contain any geographic information. In this case, you must do the extra work of downloading in the geographic data as an sf dataframe and then joining your non-spatial data to the spatial data. Generally this involves 3 steps:\n\nReading in your own data as a data frame\nReading in the geographic data as an sf dataframe\nUsing left_join to merge the geographic data with your own non spatial data and create a new expanded sf dataframe\n\nLet’s say we had a dataframe on CHIP enrollment by state with state abbreviations.\n\n# read the state CHIP data\nchip_by_state <- read_csv(\"mapping/data/chip-enrollment.csv\") %>%\n # clean column names so there are no random spaces/uppercase letters\n janitor::clean_names()\n\n# print to the console\nchip_by_state %>% head()\n\n# A tibble: 6 × 3\n state chip_enrollment state_abbreviation\n <chr> <dbl> <chr> \n1 Alabama 150040 AL \n2 Alaska 15662 AK \n3 Arizona 88224 AZ \n4 Arkansas 120863 AR \n5 California 2022213 CA \n6 Colorado 167227 CO \n\n\nIn order to convert this to an sf dataframe, we need to read in the spatial boundaries for each state and append it to our dataframe. Here is how we do that with get_urbn_map() and left_join() .\n\nlibrary(urbnmapr)\n\n# read in state geographic data from urbnmapr\nstates <- get_urbn_map(map = \"states\", sf = TRUE)\n\n# left join state geographies to chip data\nchip_with_geographies <- states %>%\n left_join(\n chip_by_state,\n # Specify join column, which are slightly differently named in states and chip\n # respectively\n by = c(\"state_abbv\" = \"state_abbreviation\")\n )\n\nchip_with_geographies %>%\n select(state_fips, state_abbv, chip_enrollment)\n\nSimple feature collection with 51 features and 3 fields\nGeometry type: MULTIPOLYGON\nDimension: XY\nBounding box: xmin: -2600000 ymin: -2363000 xmax: 2516374 ymax: 732352.2\nProjected CRS: NAD27 / US National Atlas Equal Area\nFirst 10 features:\n state_fips state_abbv chip_enrollment geometry\n1 01 AL 150040 MULTIPOLYGON (((1150023 -15...\n2 04 AZ 88224 MULTIPOLYGON (((-1386136 -1...\n3 08 CO 167227 MULTIPOLYGON (((-786661.9 -...\n4 09 CT 25551 MULTIPOLYGON (((2156197 -83...\n5 12 FL 374884 MULTIPOLYGON (((1953691 -20...\n6 13 GA 232050 MULTIPOLYGON (((1308636 -10...\n7 16 ID 35964 MULTIPOLYGON (((-1357097 78...\n8 18 IN 114927 MULTIPOLYGON (((1042064 -71...\n9 20 KS 79319 MULTIPOLYGON (((-174904.2 -...\n10 22 LA 161565 MULTIPOLYGON (((1075669 -15..."
},
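One caution with this pattern: left_join() will silently fill chip_enrollment with NA for any state whose abbreviation fails to match. A minimal sketch of a post-join sanity check, reusing the objects created above:

```r
library(dplyr)

# Abbreviations in the spatial data with no match in the CHIP data
# (returns character(0) if every state matched)
setdiff(states$state_abbv, chip_by_state$state_abbreviation)

# Any rows left with missing enrollment after the join
chip_with_geographies %>%
  filter(is.na(chip_enrollment))
```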
{
"objectID": "mapping.html#crs",
"href": "mapping.html#crs",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Coordinate Reference Systems",
"text": "Coordinate Reference Systems\n\nThe short version\nJust watch this video and know the following:\n\nAll spatial data has a CRS, which specifies how to identify a location on earth.\nIt’s important that all spatial datasets you are working with be in the same CRS. You can find the CRS with st_crs() and change the CRS with st_transform().\nThe Urban Institute Style Guide requires the use of the Atlas Equal Earth Projection (\"ESRI:102003\") for national maps. For state and local maps, use this handy guide to find an appropriate State Plane projection.\n\n\n\nThe long version\nCoordinate reference systems (CRS) specify the 3d shape of the earth and optionally how we project that 3d shape onto a 2d surface. They are an important part of working with spatial data as you need to ensure that all the data you are working with are in the same CRS in order for spatial operations and maps to be accurate.\nCRS can be specified either by name (ie Maryland State Plane) or Spatial Reference System IDentifier (SRID). THe SRID is a numeric identifier that uniquely identifies a coordinate reference system. Generally when referring to an SRID, you need to refer to an authority (ie the data source) and a unique ID. An example is EPSG:26985 which refers to the Maryland State plane projection from the EPSG, or ESRI:102003 which refers to the Atlas Equal Area projection from ESRI. Most CRS codes will be from the EPSG, and some from ESRI and others. A good resource for finding/validating CRS codes is epsg.io.\nSidenote - EPSG stands for the now defunct European Petroleum Survey Group. And while oil companies have generally been terrible for the earth, the one nice thing they did for the earth was to set up common standards for coordinate reference systems.\nYou might be thinking well isn’t the earth just a sphere? Why do we need all this complicated stuff? And the answer is well the earth is kind of a sphere, but it’s really more of a misshapen ellipsoid which is pudgier at the equator than at the poles. To visualize how coordinate reference systems work, imagine that the earth is a (lumpy) orange. Now peel the skin off an orange and try to flatten it. There are many ways to do it, but all will create distortions of some kind. The CRS will give us the formula we’ve used to specify the shape of the orange (usually a sphere or ellipsoid of some kind) and optionally, specify how we flattened the orange into 2d.\nBroadly, there are two kinds of Coordinate Reference Systems:\n\nGeographic coordinate systems\n\n(sometimes called unprojected coordinate systems)\nSpecifies a 3d shape for the earth\nUses a spheroid/ellipsoid to approximate shape of the earth\nUsually use decimal degree units (ie latitude/longitude) to identify locations on earth\n\n\n\n\nProjected coordinate systems\n\nSpecifies a 3d shape for the earth + a 2d mapping\n\nIs a geographic coordinate system + a projection\n\ncredit: xkcd\nprojection: mathematical formula used to convert a 3d coordinate system to a 2d flat coordinate system\nMany different kinds of projections, including Equal Area, Equidistant, Conformal, etc\nAll projections distort the true shape of the earth in some way, either in terms of shape, area, or angle. Required xkcd comic\nUsually use linear units (ie feet, meters) and therefore useful for distance based spatial operations (ie creating buffers)"
},
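Since the key rule above is that all layers share one CRS, a quick equality check before any spatial operation can catch mismatches early. A minimal sketch, assuming the dc_firestations and dc_tracts sf dataframes used elsewhere in this guide:

```r
library(sf)

# Check whether two layers share a CRS before combining them
st_crs(dc_firestations) == st_crs(dc_tracts)

# If FALSE, reproject one layer to match the other
dc_firestations <- st_transform(dc_firestations, crs = st_crs(dc_tracts))
```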
{
"objectID": "mapping.html#finding-the-crs",
"href": "mapping.html#finding-the-crs",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Finding the CRS",
- "text": "Finding the CRS\nIf you are lucky, your data will have embedded CRS data that will be automatically detected when the file is read in. This is usually the case for GeoJSONS (.geojson) and shapefiles (.shp). When you use st_read() on these files, you should see the CRS displayed in the metadata:\n\nYou can also the st_crs() function to find the CRS. The CRS code is located at the end in ID[authority, SRID].\n\nst_crs(dc_firestations)\n\nCoordinate Reference System:\n User input: WGS 84 \n wkt:\nGEOGCRS[\"WGS 84\",\n DATUM[\"World Geodetic System 1984\",\n ELLIPSOID[\"WGS 84\",6378137,298.257223563,\n LENGTHUNIT[\"metre\",1]]],\n PRIMEM[\"Greenwich\",0,\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n CS[ellipsoidal,2],\n AXIS[\"latitude\",north,\n ORDER[1],\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n AXIS[\"longitude\",east,\n ORDER[2],\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n ID[\"EPSG\",4326]]\n\n\nSometimes, the CRS will be blank or NA as the dataset did not specify the CRS. In that case you MUST find and set the CRS for your data before proceeding with analysis. Below are some good rules of thumb for finding out what the CRS for your data is:\n\nFor geojsons, the CRS should always be EPSG:4326 (or WGS 84). The official geojson specification states that this is the only valid CRS for geojsons, but in the wild, this may not be true 100% of the time.\nFor shapefiles, there should be a file that ends in .proj in the same directory as the .shp file. This file contains the projection information for that file and should be used automatically when reading in shapefiles.\nFor CSV’s with latitude/longitude columns, the CRS is usually EPSG:4326 (or WGS 84).\nLook at the metadata and any accompanying documentation to see if the coordinate reference system for the data is specified\n\nIf none of the above rules of thumb apply to you, check out the crsuggest R package.\nOnce you’ve identified the appropriate CRS, you can set the CRS for your data with st_crs():\n\n# If you are certain that your data contains coordinates in the ESRI Atlas Equal Earth projections\nst_crs(some_sf_dataframe) <- st_crs(\"ESRI:102003\")"
+ "text": "Finding the CRS\nIf you are lucky, your data will have embedded CRS data that will be automatically detected when the file is read in. This is usually the case for GeoJSONS (.geojson) and shapefiles (.shp). When you use st_read() on these files, you should see the CRS displayed in the metadata:\n\nYou can also the st_crs() function to find the CRS. The CRS code is located at the end in ID[authority, SRID].\n\nst_crs(dc_firestations)\n\nCoordinate Reference System:\n User input: WGS 84 \n wkt:\nGEOGCRS[\"WGS 84\",\n DATUM[\"World Geodetic System 1984\",\n ELLIPSOID[\"WGS 84\",6378137,298.257223563,\n LENGTHUNIT[\"metre\",1]]],\n PRIMEM[\"Greenwich\",0,\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n CS[ellipsoidal,2],\n AXIS[\"latitude\",north,\n ORDER[1],\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n AXIS[\"longitude\",east,\n ORDER[2],\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n ID[\"EPSG\",4326]]\n\n\nSometimes, the CRS will be blank or NA as the dataset did not specify the CRS. In that case you MUST find and set the CRS for your data before proceeding with analysis. Below are some good rules of thumb for finding out what the CRS for your data is:\n\nFor geojsons, the CRS should always be EPSG:4326 (or WGS 84). The official geojson specification states that this is the only valid CRS for geojsons, but in the wild, this may not be true 100% of the time.\nFor shapefiles, there should be a file that ends in .proj in the same directory as the .shp file. This file contains the projection information for that file and should be used automatically when reading in shapefiles.\nFor CSV’s with latitude/longitude columns, the CRS is usually EPSG:4326 (or WGS 84).\nLook at the metadata and any accompanying documentation to see if the coordinate reference system for the data is specified\n\nIf none of the above rules of thumb apply to you, check out the crsuggest R package.\nOnce you’ve identified the appropriate CRS, you can set the CRS for your data with st_crs():\n\n# If you are certain that your data contains coordinates in the ESRI Atlas Equal Earth projections\nst_crs(some_sf_dataframe) <- st_crs(\"ESRI:102003\")"
},
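A closely related pitfall: st_crs()<- only declares what the coordinates already are, while st_transform() actually converts them. A short sketch of the distinction, using the hypothetical some_sf_dataframe from the entry above:

```r
library(sf)

# Assignment relabels the data without touching the coordinates;
# use it only when the CRS is missing and you know the true one
st_crs(some_sf_dataframe) <- st_crs("EPSG:4326")

# st_transform() mathematically reprojects coordinates into a new CRS
some_sf_dataframe <- st_transform(some_sf_dataframe, crs = "ESRI:102003")
```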
{
"objectID": "mapping.html#transforming-the-crs",
"href": "mapping.html#transforming-the-crs",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Transforming the CRS",
- "text": "Transforming the CRS\nOften you will need to change the CRS for your sf dataframe so that all datasets you are using have the same CRS, or to use a projected CRS for performing more accurate spatial operations. You can do this with st_transform:\n\n# Transforming CRS from WGS 84 to Urban required Equal Earth Projection\nstate_capitals <- state_capitals %>% st_transform(\"ESRI:102003\")\n\nst_transform() also allows you to just use the CRS of another sf dataframe when transforming.\n\n# transform CRS of chip_with_geographies to be the same as CRS of dc_firestations\nchip_with_geographies <- chip_with_geographies %>%\n st_transform(crs = st_crs(state_capitals))\n\nIf you are working with local data, you should use an appropriate state plane projection instead of the Atlas Equal Earth projection which is meant for national maps. library(crsuggest) can simplify the process of picking an appropriate state plane CRS.\n\nlibrary(crsuggest)\n\nsuggest_crs(dc_firestations) %>%\n # Use the value in the \"crs_code\" column to transform CRS's\n head(4)\n\n# A tibble: 4 × 6\n crs_code crs_name crs_type crs_gcs crs_units crs_proj4\n \n1 6488 NAD83(2011) / Maryland (ftUS) project… 6318 us-ft +proj=lc…\n2 6487 NAD83(2011) / Maryland project… 6318 m +proj=lc…\n3 3582 NAD83(NSRS2007) / Maryland (ftU… project… 4759 us-ft +proj=lc…\n4 3559 NAD83(NSRS2007) / Maryland project… 4759 m +proj=lc…"
+ "text": "Transforming the CRS\nOften you will need to change the CRS for your sf dataframe so that all datasets you are using have the same CRS, or to use a projected CRS for performing more accurate spatial operations. You can do this with st_transform:\n\n# Transforming CRS from WGS 84 to Urban required Equal Earth Projection\nstate_capitals <- state_capitals %>% st_transform(\"ESRI:102003\")\n\nst_transform() also allows you to just use the CRS of another sf dataframe when transforming.\n\n# transform CRS of chip_with_geographies to be the same as CRS of dc_firestations\nchip_with_geographies <- chip_with_geographies %>%\n st_transform(crs = st_crs(state_capitals))\n\nIf you are working with local data, you should use an appropriate state plane projection instead of the Atlas Equal Earth projection which is meant for national maps. library(crsuggest) can simplify the process of picking an appropriate state plane CRS.\n\nlibrary(crsuggest)\n\nsuggest_crs(dc_firestations) %>%\n # Use the value in the \"crs_code\" column to transform CRS's\n head(4)\n\n# A tibble: 4 × 6\n crs_code crs_name crs_type crs_gcs crs_units crs_proj4\n <chr> <chr> <chr> <dbl> <chr> <chr> \n1 6488 NAD83(2011) / Maryland (ftUS) project… 6318 us-ft +proj=lc…\n2 6487 NAD83(2011) / Maryland project… 6318 m +proj=lc…\n3 3582 NAD83(NSRS2007) / Maryland (ftU… project… 4759 us-ft +proj=lc…\n4 3559 NAD83(NSRS2007) / Maryland project… 4759 m +proj=lc…"
},
{
"objectID": "mapping.html#the-basics",
"href": "mapping.html#the-basics",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "The basics",
- "text": "The basics\n\nlibrary(ggplot2)\nMost mapping in R fits the same theoretical framework as plotting in R using library(ggplot2). To learn more about ggplot2, visit the Data Viz page or read the official ggplot book.\nThe key function for mapping is the special geom_sf() function which works with sf dataframes. This function magically detects whether you have point or polygon spatial data and displays the results on a map.\n\n\nA simple map\nTo make a simple map, add geom_sf() to a ggplot() and set data = an_sf_dataframe. Below is code for making a map of all 50 states using library(urbnmapr):\n\nlibrary(urbnmapr)\n\nstates <- get_urbn_map(\"states\", sf = TRUE)\n\nggplot() +\n geom_sf(\n data = states,\n mapping = aes()\n )"
+ "text": "The basics\n\nlibrary(ggplot2)\nMost mapping in R fits the same theoretical framework as plotting in R using library(ggplot2). To learn more about ggplot2, visit the Data Viz page or read the official ggplot book.\nThe key function for mapping is the special geom_sf() function which works with sf dataframes. This function magically detects whether you have point or polygon spatial data and displays the results on a map.\n\n\nA simple map\nTo make a simple map, add geom_sf() to a ggplot() and set data = an_sf_dataframe. Below is code for making a map of all 50 states using library(urbnmapr):\n\nlibrary(urbnmapr)\n\nstates <- get_urbn_map(\"states\", sf = TRUE)\n\nggplot() +\n geom_sf(\n data = states,\n mapping = aes()\n )"
},
{
"objectID": "mapping.html#styling",
"href": "mapping.html#styling",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Styling",
- "text": "Styling\n\nlibrary(urbnthemes)\nlibrary(urbnthemes) automatically styles maps in accordance with the Urban Institute Data Visualization Style Guide. By using library(urbnthemes), you can create publication ready maps you can immediately drop in to Urban research briefs or blog posts.\nTo install urbnthemes, visit the package’s GitHub repository and follow the instructions. There are 2 ways to use the urbnthemes functions:\n\nlibrary(urbnthemes)\n\n# You can either run this once per script to automatically style all maps with\n# the Urban theme\nset_urbn_defaults(style = \"map\")\n\n# Or you can add `+ theme_urbn_map()` to the end of every map you make\nggplot() +\n geom_sf(states, mapping = aes()) +\n theme_urbn_map()\n\n\n\n\n\n\nLayering\nYou can layer multiple points/lines/polygons on top of each other using the + operator from library(ggplot2). The shapes will appear from bottom to top (ie the last mapped object will show up on top). It is important that all layers are in the same CRS (coordinate reference system).\n\nstate_capitals <- state_capitals %>%\n # This will change CRS to ESRI:102003 and shift the AK and HI state capitals\n # point locations to the appropriate locations on the inset maps.\n tigris::shift_geometry() %>%\n # For now filter out AL and HI as their state capitals will be slightly off.\n filter(!state %in% c(\"Alaska\", \"Hawaii\"))\n\nggplot() +\n geom_sf(\n data = states,\n mapping = aes()\n ) +\n # Note we change the data argument\n geom_sf(\n data = state_capitals,\n mapping = aes(),\n # urbnthemes library has urbn color palettes built in.\n color = palette_urbn_main[\"yellow\"],\n size = 2.0\n ) +\n theme_urbn_map()\n\n\n\n\n\n\nFill and Outline Colors\nThe same commands used to change colors, opacity, lines, size, etc. in charts can be used for maps too. To change the colors of the map , just use the fill = and color = parameters in geom_sf(). fill will change the fill color of polygons; color will change the color of polygon outlines, lines, and points.\nGenerally, maps that show the magnitude of a variable use the blue sequential ramp and maps that display positives and negatives use the diverging color ramp.library(urbnthemes) contains inbuilt. helper variables (like palette_urbn_main) for accessing color palettes from the Urban Data Viz Style guide. If for example you want states to be Urban’s magenta color:\n\nggplot() +\n geom_sf(states,\n mapping = aes(),\n # Adjust polygon fill color\n fill = palette_urbn_main[\"magenta\"],\n # Adjust polygon outline color\n color = \"white\"\n ) +\n theme_urbn_map()\n\n\n\n\n\n\nAdding text\nYou can also add text, like state abbreviations, directly to your map using geom_sf_text and the helper function get_urbn_labels().\n\nlibrary(urbnmapr)\n\nggplot() +\n geom_sf(states,\n mapping = aes(),\n color = \"white\"\n ) +\n theme_urbn_map() +\n # Generates dataframe of state abbv and appropriate location to plot them\n geom_sf_text(\n data = get_urbn_labels(\n map = \"states\",\n sf = TRUE\n ),\n aes(label = state_abbv),\n size = 3\n )\n\n\n\n\nThere’s also geom_sf_label() if you want labels with a border."
+ "text": "Styling\n\nlibrary(urbnthemes)\nlibrary(urbnthemes) automatically styles maps in accordance with the Urban Institute Data Visualization Style Guide. By using library(urbnthemes), you can create publication ready maps you can immediately drop in to Urban research briefs or blog posts.\nTo install urbnthemes, visit the package’s GitHub repository and follow the instructions. There are 2 ways to use the urbnthemes functions:\n\nlibrary(urbnthemes)\n\n# You can either run this once per script to automatically style all maps with\n# the Urban theme\nset_urbn_defaults(style = \"map\")\n\n# Or you can add `+ theme_urbn_map()` to the end of every map you make\nggplot() +\n geom_sf(states, mapping = aes()) +\n theme_urbn_map()\n\n\n\n\n\n\nLayering\nYou can layer multiple points/lines/polygons on top of each other using the + operator from library(ggplot2). The shapes will appear from bottom to top (ie the last mapped object will show up on top). It is important that all layers are in the same CRS (coordinate reference system).\n\nstate_capitals <- state_capitals %>%\n # This will change CRS to ESRI:102003 and shift the AK and HI state capitals\n # point locations to the appropriate locations on the inset maps.\n tigris::shift_geometry() %>%\n # For now filter out AL and HI as their state capitals will be slightly off.\n filter(!state %in% c(\"Alaska\", \"Hawaii\"))\n\nggplot() +\n geom_sf(\n data = states,\n mapping = aes()\n ) +\n # Note we change the data argument\n geom_sf(\n data = state_capitals,\n mapping = aes(),\n # urbnthemes library has urbn color palettes built in.\n color = palette_urbn_main[\"yellow\"],\n size = 2.0\n ) +\n theme_urbn_map()\n\n\n\n\n\n\nFill and Outline Colors\nThe same commands used to change colors, opacity, lines, size, etc. in charts can be used for maps too. To change the colors of the map , just use the fill = and color = parameters in geom_sf(). fill will change the fill color of polygons; color will change the color of polygon outlines, lines, and points.\nGenerally, maps that show the magnitude of a variable use the blue sequential ramp and maps that display positives and negatives use the diverging color ramp.library(urbnthemes) contains inbuilt. helper variables (like palette_urbn_main) for accessing color palettes from the Urban Data Viz Style guide. If for example you want states to be Urban’s magenta color:\n\nggplot() +\n geom_sf(states,\n mapping = aes(),\n # Adjust polygon fill color\n fill = palette_urbn_main[\"magenta\"],\n # Adjust polygon outline color\n color = \"white\"\n ) +\n theme_urbn_map()\n\n\n\n\n\n\nAdding text\nYou can also add text, like state abbreviations, directly to your map using geom_sf_text and the helper function get_urbn_labels().\n\nlibrary(urbnmapr)\n\nggplot() +\n geom_sf(states,\n mapping = aes(),\n color = \"white\"\n ) +\n theme_urbn_map() +\n # Generates dataframe of state abbv and appropriate location to plot them\n geom_sf_text(\n data = get_urbn_labels(\n map = \"states\",\n sf = TRUE\n ),\n aes(label = state_abbv),\n size = 3\n )\n\n\n\n\nThere’s also geom_sf_label() if you want labels with a border."
},
{
"objectID": "mapping.html#choropleth-maps",
"href": "mapping.html#choropleth-maps",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Choropleth Maps",
- "text": "Choropleth Maps\nChoropleth maps display geographic areas with shades, colors, or patterns in proportion to a variable or variables. Choropleth maps can represent massive geographies like the entire world and small geographies like Census Tracts. To make a choropleth map, you need to set geom_sf(aes(fill = some_variable_name)). Below are examples\n\nContinuous color scale\n\n# Map of CHIP enrollment percentage by state\nchip_with_geographies_map <- chip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct variable\n fill = chip_pct\n ))\n\n\n# Below add-ons to the map are optional, but make the map look prettier.\nchip_with_geographies_map +\n # scale_fill_gradientn adds colors with more interpolation and reverses color scale\n scale_fill_gradientn(\n # Convert legend from decimal to percentages\n labels = scales::percent_format(),\n # Make legend title more readable\n name = \"CHIP Enrollment %\",\n # Manually add 0 to lower limit to include it in legend. NA=use maximum value in data\n limits = c(0, NA),\n # Set number of breaks on legend = 3\n n.breaks = 3\n )\n\n\n\n\n\n\nDiscrete color scale\nThe quick and dirty way is with scale_fill_steps(), which creates discretized bins for continuous variables:\n\nchip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct variable\n fill = chip_pct\n )) +\n scale_fill_steps(\n # Convert legend from decimal to percentages\n labels = scales::percent_format(),\n # Make legend title more readable\n name = \"CHIP Enrollment %\",\n # Show top and bottom limits on legend\n show.limits = TRUE,\n # Roughly set number of bins. Won't be exact as R uses algorithms under the\n # hood for pretty looking breaks.\n n.breaks = 4\n )\n\n\n\n\nOften you will want to manually generate the bins yourself to give you more fine grained control over the exact legend text. (ie 1% - 1.8%, 1.8 - 2.5%, etc). Below is an example of discretizing the continuous chip_pct variable yourself using cut_interval() and a helper function to get nice looking interval labels:\n\n# Helper function to clean up R generated intervals into nice looking interval labels\nformat_interval <- function(interval_text) {\n text <- interval_text %>%\n # Remove open and close brackets which is R generated math notation\n str_remove_all(\"\\\\(\") %>%\n str_remove_all(\"\\\\)\") %>%\n str_remove_all(\"\\\\[\") %>%\n str_remove_all(\"\\\\]\") %>%\n str_replace_all(\",\", \" — \")\n\n # Convert decimal ranges to percent ranges\n text <- text %>%\n str_split(\" — \") %>%\n map(~ as.numeric(.x) %>%\n scales::percent() %>%\n paste0(collapse = \" — \")) %>%\n unlist() %>%\n # By default character vectors are plotted in alphabetical order. We want\n # factors in reverse alphabetical order to get correct colors in ggplot\n fct_rev()\n\n return(text)\n}\n\nchip_with_geographies <- chip_with_geographies %>%\n # cut_interval into n groups with equal range. Set boundary so 0 is included in the bins\n mutate(chip_pct_interval = cut_interval(chip_pct, n = 5)) %>%\n # Generate nice looking interval labels\n mutate(chip_pct_interval = format_interval(chip_pct_interval))\n\nAnd now we can map the discretized chip_pct_interval variable using geom_sf():\n\nchip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct variable\n fill = chip_pct_interval\n )) +\n # Default is to use main urban palette, which assumes unrelated groups. 
We\n # adjust colors manually to be on Urban cyan palette\n scale_fill_manual(\n values = palette_urbn_cyan[c(8, 7, 5, 3, 1)],\n name = \"CHIP Enrollment %\"\n )\n\n\n\n\nIn addition to cut_interval there are similar functions for creating intervals/bins with slightly different rules. When creating bins, be careful as changing the number of bins can drastically change how the map looks."
+ "text": "Choropleth Maps\nChoropleth maps display geographic areas with shades, colors, or patterns in proportion to a variable or variables. Choropleth maps can represent massive geographies like the entire world and small geographies like Census Tracts. To make a choropleth map, you need to set geom_sf(aes(fill = some_variable_name)). Below are examples\n\nContinuous color scale\n\n# Map of CHIP enrollment percentage by state\nchip_with_geographies_map <- chip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct variable\n fill = chip_pct\n ))\n\n\n# Below add-ons to the map are optional, but make the map look prettier.\nchip_with_geographies_map +\n # scale_fill_gradientn adds colors with more interpolation and reverses color scale\n scale_fill_gradientn(\n # Convert legend from decimal to percentages\n labels = scales::percent_format(),\n # Make legend title more readable\n name = \"CHIP Enrollment %\",\n # Manually add 0 to lower limit to include it in legend. NA=use maximum value in data\n limits = c(0, NA),\n # Set number of breaks on legend = 3\n n.breaks = 3\n )\n\n\n\n\n\n\nDiscrete color scale\nThe quick and dirty way is with scale_fill_steps(), which creates discretized bins for continuous variables:\n\nchip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct variable\n fill = chip_pct\n )) +\n scale_fill_steps(\n # Convert legend from decimal to percentages\n labels = scales::percent_format(),\n # Make legend title more readable\n name = \"CHIP Enrollment %\",\n # Show top and bottom limits on legend\n show.limits = TRUE,\n # Roughly set number of bins. Won't be exact as R uses algorithms under the\n # hood for pretty looking breaks.\n n.breaks = 4\n )\n\n\n\n\nOften you will want to manually generate the bins yourself to give you more fine grained control over the exact legend text. (ie 1% - 1.8%, 1.8 - 2.5%, etc). Below is an example of discretizing the continuous chip_pct variable yourself using cut_interval() and a helper function to get nice looking interval labels:\n\n# Helper function to clean up R generated intervals into nice looking interval labels\nformat_interval <- function(interval_text) {\n text <- interval_text %>%\n # Remove open and close brackets which is R generated math notation\n str_remove_all(\"\\\\(\") %>%\n str_remove_all(\"\\\\)\") %>%\n str_remove_all(\"\\\\[\") %>%\n str_remove_all(\"\\\\]\") %>%\n str_replace_all(\",\", \" — \")\n\n # Convert decimal ranges to percent ranges\n text <- text %>%\n str_split(\" — \") %>%\n map(~ as.numeric(.x) %>%\n scales::percent() %>%\n paste0(collapse = \" — \")) %>%\n unlist() %>%\n # By default character vectors are plotted in alphabetical order. We want\n # factors in reverse alphabetical order to get correct colors in ggplot\n fct_rev()\n\n return(text)\n}\n\nchip_with_geographies <- chip_with_geographies %>%\n # cut_interval into n groups with equal range. Set boundary so 0 is included in the bins\n mutate(chip_pct_interval = cut_interval(chip_pct, n = 5)) %>%\n # Generate nice looking interval labels\n mutate(chip_pct_interval = format_interval(chip_pct_interval))\n\nAnd now we can map the discretized chip_pct_interval variable using geom_sf():\n\nchip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct variable\n fill = chip_pct_interval\n )) +\n # Default is to use main urban palette, which assumes unrelated groups. 
We\n # adjust colors manually to be on Urban cyan palette\n scale_fill_manual(\n values = palette_urbn_cyan[c(8, 7, 5, 3, 1)],\n name = \"CHIP Enrollment %\"\n )\n\n\n\n\nIn addition to cut_interval there are similar functions for creating intervals/bins with slightly different rules. When creating bins, be careful as changing the number of bins can drastically change how the map looks."
},
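The maps above fill by chip_pct, which is created earlier in the underlying guide; roughly, it is CHIP enrollment as a share of state population. A minimal sketch of how such a column could be computed, assuming a hypothetical state_population column has already been joined on:

```r
library(dplyr)

# Hypothetical: assumes chip_with_geographies already contains a
# state_population column joined from some population source
chip_with_geographies <- chip_with_geographies %>%
  mutate(chip_pct = chip_enrollment / state_population)
```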
{
"objectID": "mapping.html#bubble-maps",
"href": "mapping.html#bubble-maps",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Bubble Maps",
- "text": "Bubble Maps\nThis is just a layered map with one polygon layer and one point layer, where the points are sized in accordance with a variable in your data.\n\nset_urbn_defaults(style = \"map\")\n\n# Get sf dataframe of DC tracts\nlibrary(tigris)\ndc_tracts <- tracts(\n state = \"DC\",\n year = 2019,\n progress_bar = FALSE\n)\n\n# Add bubbles for firestations\nggplot() +\n geom_sf(data = dc_tracts, fill = palette_urbn_main[\"gray\"]) +\n geom_sf(\n data = dc_firestations,\n # Size bubbles by number of trucks at each station\n aes(size = TRUCK),\n color = palette_urbn_main[\"yellow\"],\n # Adjust transparency for readability\n alpha = 0.8\n )"
+ "text": "Bubble Maps\nThis is just a layered map with one polygon layer and one point layer, where the points are sized in accordance with a variable in your data.\n\nset_urbn_defaults(style = \"map\")\n\n# Get sf dataframe of DC tracts\nlibrary(tigris)\ndc_tracts <- tracts(\n state = \"DC\",\n year = 2019,\n progress_bar = FALSE\n)\n\n# Add bubbles for firestations\nggplot() +\n geom_sf(data = dc_tracts, fill = palette_urbn_main[\"gray\"]) +\n geom_sf(\n data = dc_firestations,\n # Size bubbles by number of trucks at each station\n aes(size = TRUCK),\n color = palette_urbn_main[\"yellow\"],\n # Adjust transparency for readability\n alpha = 0.8\n )"
},
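By default, ggplot2 picks the bubble size range for you; if the smallest bubbles are hard to see, scale_size_continuous() lets you set the range explicitly. A minimal sketch of redrawing the map above with an assumed range:

```r
library(ggplot2)

# Redraw the bubble map above with an explicit size range
# (the range values are assumptions; tune for readability)
last_plot() +
  scale_size_continuous(
    name = "Number of trucks",
    range = c(1, 8)
  )
```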
{
"objectID": "mapping.html#dot-density-maps",
"href": "mapping.html#dot-density-maps",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Dot-density Maps",
- "text": "Dot-density Maps\nThese maps scatter dots within a geographic area. Typically each dot represents a unit (like 100 people, or 1000 houses). To create this kind of map, you need to start with an sf dataframe that is of geometry type POLYGON or MULTIPOLYGON and then sample points within the polygon.\nThe below code generates a dot-density map representing people of different races within Washington DC tracts The code may look a little complicated, but the key workhorse function is st_sample() which samples points within each polygon to use in the dot density map:\n\nlibrary(tidycensus)\n\n# Get counts by race of DC tracts\ndc_pop <- get_acs(\n geography = \"tract\",\n state = \"DC\",\n year = 2019,\n variables = c(\n Hispanic = \"DP05_0071\",\n White = \"DP05_0077\",\n Black = \"DP05_0078\",\n Asian = \"DP05_0080\"\n ),\n geometry = TRUE,\n progress_bar = FALSE\n)\n\n# Get unique groups (ie races)\ngroups <- unique(dc_pop$variable)\n\n# For each unique group (ie race), generate sampled points\ndc_race_dots <- map_dfr(groups, ~ {\n dc_pop %>%\n # .x = the group used in the loop\n filter(variable == .x) %>%\n # Use the projected MD state plane for accuracy\n st_transform(crs = \"EPSG:6487\") %>%\n # Have every dot represent 100 people\n mutate(est100 = as.integer(estimate / 100)) %>%\n st_sample(size = .$est100, exact = TRUE) %>%\n st_sf() %>%\n # Add group (ie race) as a column so we can use it when plotting\n mutate(group = .x)\n})\n\n\nggplot() +\n # Plot tracts, then dots on top of tracts\n geom_sf(\n data = dc_pop,\n # Make interior of tracts transparent and boundaries black\n fill = \"transparent\",\n color = \"black\"\n ) +\n geom_sf(\n data = dc_race_dots,\n # Color in dots by racial group\n aes(color = group),\n # Adjust transparency and size to be more readable\n alpha = 0.5,\n size = 1.1,\n stroke = FALSE\n )"
+ "text": "Dot-density Maps\nThese maps scatter dots within a geographic area. Typically each dot represents a unit (like 100 people, or 1000 houses). To create this kind of map, you need to start with an sf dataframe that is of geometry type POLYGON or MULTIPOLYGON and then sample points within the polygon.\nThe below code generates a dot-density map representing people of different races within Washington DC tracts The code may look a little complicated, but the key workhorse function is st_sample() which samples points within each polygon to use in the dot density map:\n\nlibrary(tidycensus)\n\n# Get counts by race of DC tracts\ndc_pop <- get_acs(\n geography = \"tract\",\n state = \"DC\",\n year = 2019,\n variables = c(\n Hispanic = \"DP05_0071\",\n White = \"DP05_0077\",\n Black = \"DP05_0078\",\n Asian = \"DP05_0080\"\n ),\n geometry = TRUE,\n progress_bar = FALSE\n)\n\n# Get unique groups (ie races)\ngroups <- unique(dc_pop$variable)\n\n# For each unique group (ie race), generate sampled points\ndc_race_dots <- map_dfr(groups, ~ {\n dc_pop %>%\n # .x = the group used in the loop\n filter(variable == .x) %>%\n # Use the projected MD state plane for accuracy\n st_transform(crs = \"EPSG:6487\") %>%\n # Have every dot represent 100 people\n mutate(est100 = as.integer(estimate / 100)) %>%\n st_sample(size = .$est100, exact = TRUE) %>%\n st_sf() %>%\n # Add group (ie race) as a column so we can use it when plotting\n mutate(group = .x)\n})\n\n\nggplot() +\n # Plot tracts, then dots on top of tracts\n geom_sf(\n data = dc_pop,\n # Make interior of tracts transparent and boundaries black\n fill = \"transparent\",\n color = \"black\"\n ) +\n geom_sf(\n data = dc_race_dots,\n # Color in dots by racial group\n aes(color = group),\n # Adjust transparency and size to be more readable\n alpha = 0.5,\n size = 1.1,\n stroke = FALSE\n )"
},
{
"objectID": "mapping.html#geofacets",
"href": "mapping.html#geofacets",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Geofacets",
- "text": "Geofacets\nGeofaceting arranges sub-geography-specific plots into a grid that resembles a larger geography (usually the US). This can be a useful alternative to choropleth maps, which tend to overemphasize low-population density areas with large areas. To make geofacetted charts, use the facet_geo() function from the geofacet library, which can be thought of as equivalent to ggplot2’s facet_wrap(). For this example, we’ll use the built-in state_ranks data.\n\nlibrary(geofacet)\n\nhead(state_ranks %>% as_tibble())\n\n# A tibble: 6 × 4\n state name variable rank\n \n1 AK Alaska education 28\n2 AK Alaska employment 50\n3 AK Alaska health 25\n4 AK Alaska wealth 5\n5 AK Alaska sleep 27\n6 AK Alaska insured 50\n\n\n\nset_urbn_defaults(style = \"print\")\n\nstate_ranks %>%\n filter(variable %in% c(\"education\", \"employment\")) %>%\n ggplot(aes(x = rank, y = variable)) +\n geom_col() +\n facet_geo(\n facets = \"state\",\n # Use custom urban geofacet grid which is built into urbnthemes\n # For now we need to rename a few columns as urbnthemes has to be\n # updated\n grid = urbnthemes::urbn_geofacet %>%\n rename(\n code = state_code,\n name = state_name\n )\n )\n\n\n\n\nInteractive geofacets of the United States have been used in Urban Features like A Matter of Time which included geofaceted line charts showing trends in incarceration by state. Static geofacets of the United States were included in Barriers to Accessing Homeownership Down Payment, Credit, and Affordability by the Housing Finance Policy Center.\n\nTile grid map\nYou can select predefined grids, or create your own at https://hafen.github.io/grid-designer/\n\n# create a grid with all of the US states and territories \nmygrid <- data.frame(\n code = c(\"ME\", \"AK\", \"WI\", \"VT\", \"NH\", \"IL\", \"ID\", \"WA\", \"MN\", \"MT\", \"ND\", \"MI\", \"NY\", \"MA\", \"IA\", \"IN\", \"CT\", \"RI\", \"NJ\", \"PA\", \"OH\", \"SD\", \"WY\", \"NV\", \"OR\", \"CA\", \"NE\", \"DE\", \"MD\", \"VA\", \"WV\", \"KY\", \"MO\", \"CO\", \"UT\", \"AZ\", \"KS\", \"AR\", \"DC\", \"SC\", \"NC\", \"TN\", \"NM\", \"LA\", \"AL\", \"GA\", \"MS\", \"OK\", \"HI\", \"FL\", \"TX\"),\n row = c(1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8),\n col = c(12, 2, 7, 11, 12, 7, 3, 2, 6, 4, 5, 8, 10, 11, 6, 7, 11, 12, 10, 9, 8, 5, 4, 3, 2, 2, 5, 11, 10, 9, 8, 7, 6, 4, 3, 3, 5, 6, 10, 9, 8, 7, 4, 6, 8, 9, 7, 5, 2, 10, 5),\n stringsAsFactors = FALSE\n)\n\n## Combine data into geo_grid for tiling:\ngeo_grid_data <- mygrid %>% \n left_join(chip_with_geographies, by=c(\"code\" = \"state_abbv\")) \n\n## plot tile grid\ngeo_grid_data %>% \n ggplot(aes(x = col, y = row, fill = chip_pct_interval)) +\n scale_fill_manual(values = palette_urbn_cyan[c(8, 7, 5, 3, 1)], \n name = \"CHIP Enrollment %\") +\n geom_tile(color = \"white\", linewidth = 1) +\n geom_text(aes(label = code), color=\"white\", size = 4) +\n scale_y_reverse() +\n coord_equal() +\n labs(fill=NULL)"
+ "text": "Geofacets\nGeofaceting arranges sub-geography-specific plots into a grid that resembles a larger geography (usually the US). This can be a useful alternative to choropleth maps, which tend to overemphasize low-population density areas with large areas. To make geofacetted charts, use the facet_geo() function from the geofacet library, which can be thought of as equivalent to ggplot2’s facet_wrap(). For this example, we’ll use the built-in state_ranks data.\n\nlibrary(geofacet)\n\nhead(state_ranks %>% as_tibble())\n\n# A tibble: 6 × 4\n state name variable rank\n <chr> <chr> <chr> <dbl>\n1 AK Alaska education 28\n2 AK Alaska employment 50\n3 AK Alaska health 25\n4 AK Alaska wealth 5\n5 AK Alaska sleep 27\n6 AK Alaska insured 50\n\n\n\nset_urbn_defaults(style = \"print\")\n\nstate_ranks %>%\n filter(variable %in% c(\"education\", \"employment\")) %>%\n ggplot(aes(x = rank, y = variable)) +\n geom_col() +\n facet_geo(\n facets = \"state\",\n # Use custom urban geofacet grid which is built into urbnthemes\n # For now we need to rename a few columns as urbnthemes has to be\n # updated\n grid = urbnthemes::urbn_geofacet %>%\n rename(\n code = state_code,\n name = state_name\n )\n )\n\n\n\n\nInteractive geofacets of the United States have been used in Urban Features like A Matter of Time which included geofaceted line charts showing trends in incarceration by state. Static geofacets of the United States were included in Barriers to Accessing Homeownership Down Payment, Credit, and Affordability by the Housing Finance Policy Center.\n\nTile grid map\nYou can select predefined grids, or create your own at https://hafen.github.io/grid-designer/\n\n# create a grid with all of the US states and territories \nmygrid <- data.frame(\n code = c(\"ME\", \"AK\", \"WI\", \"VT\", \"NH\", \"IL\", \"ID\", \"WA\", \"MN\", \"MT\", \"ND\", \"MI\", \"NY\", \"MA\", \"IA\", \"IN\", \"CT\", \"RI\", \"NJ\", \"PA\", \"OH\", \"SD\", \"WY\", \"NV\", \"OR\", \"CA\", \"NE\", \"DE\", \"MD\", \"VA\", \"WV\", \"KY\", \"MO\", \"CO\", \"UT\", \"AZ\", \"KS\", \"AR\", \"DC\", \"SC\", \"NC\", \"TN\", \"NM\", \"LA\", \"AL\", \"GA\", \"MS\", \"OK\", \"HI\", \"FL\", \"TX\"),\n row = c(1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8),\n col = c(12, 2, 7, 11, 12, 7, 3, 2, 6, 4, 5, 8, 10, 11, 6, 7, 11, 12, 10, 9, 8, 5, 4, 3, 2, 2, 5, 11, 10, 9, 8, 7, 6, 4, 3, 3, 5, 6, 10, 9, 8, 7, 4, 6, 8, 9, 7, 5, 2, 10, 5),\n stringsAsFactors = FALSE\n)\n\n## Combine data into geo_grid for tiling:\ngeo_grid_data <- mygrid %>% \n left_join(chip_with_geographies, by=c(\"code\" = \"state_abbv\")) \n\n## plot tile grid\ngeo_grid_data %>% \n ggplot(aes(x = col, y = row, fill = chip_pct_interval)) +\n scale_fill_manual(values = palette_urbn_cyan[c(8, 7, 5, 3, 1)], \n name = \"CHIP Enrollment %\") +\n geom_tile(color = \"white\", linewidth = 1) +\n geom_text(aes(label = code), color=\"white\", size = 4) +\n scale_y_reverse() +\n coord_equal() +\n labs(fill=NULL)"
},
{
"objectID": "mapping.html#cartograms",
"href": "mapping.html#cartograms",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Cartograms",
- "text": "Cartograms\nCartograms are a modified form of a choropleth map with intentionally distorted sizes that map to a variable in your data. Below we create a cartogram with library(cartogram) where the state sizes are proportional to the population.\n\nlibrary(cartogram)\n\nset_urbn_defaults(style = \"map\")\n\nchip_with_geographies_weighted <- chip_with_geographies %>%\n # Note column name needs to be in quotes for this package\n cartogram_cont(weight = \"population\")\n\nggplot() +\n geom_sf(\n data = chip_with_geographies_weighted,\n # Color in states by chip percentages\n aes(fill = chip_pct)\n )"
+ "text": "Cartograms\nCartograms are a modified form of a choropleth map with intentionally distorted sizes that map to a variable in your data. Below we create a cartogram with library(cartogram) where the state sizes are proportional to the population.\n\nlibrary(cartogram)\n\nset_urbn_defaults(style = \"map\")\n\nchip_with_geographies_weighted <- chip_with_geographies %>%\n # Note column name needs to be in quotes for this package\n cartogram_cont(weight = \"population\")\n\nggplot() +\n geom_sf(\n data = chip_with_geographies_weighted,\n # Color in states by chip percentages\n aes(fill = chip_pct)\n )"
},
{
"objectID": "mapping.html#interactive-maps",
"href": "mapping.html#interactive-maps",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Interactive Maps",
- "text": "Interactive Maps\nInteractive maps can be a great exploratory tool to explore and understand your data. And luckily there are a lot of new R packages that make it really easy to create them. Interactive maps are powerful but we do not recommend them for official use in Urban publications as getting them in Urban styles and appropriate basemaps can be tricky (reach out to anarayanan@urban.org if you really want to include them).\n\nlibrary(mapview)\nlibrary(mapview) is probably the most user friendly of the interactive mapping R libraries. All you have to do to create an interactive map is:\n\nlibrary(mapview)\n\n\nchip_with_geographies_for_interactive_mapping <- chip_with_geographies %>%\n # Filter out AL and HI bc they would appear in Mexico. If you want AL, HI and\n # in the correct place in interactive maps, make sure to use tigris::states()\n filter(!state_abbv %in% c(\"AK\", \"HI\"))\n\nmapview(chip_with_geographies_for_interactive_mapping)\n\n\n\n\n\n\nWhen you click on an object, you get a popup table of all it’s attributes. And when you hover over an object, you get a popup with an object id.\nEach of the above behaviors can be changed if desired. As you’ll see in the below section, the syntax for library(mapview) is significantly different from library(ggplot2) so be careful!\n\nColoring in points/polygons\nIn order to create a choropleth map where we color in the points/polygons by a variable, we need to feed in a column name in quotes to thezcol argument inside the mapview() function:\n\n# Create interactive state map colored in by chip enrollment\nmapview(chip_with_geographies_for_interactive_mapping, zcol = \"chip_enrollment\")\n\n\n\n\n\n\nIf you want more granular control over the color palette for the legend can also feed in a vector of color hex codes to col.regions along with a column name to zcol. This will create a continuous color range along the provided colors. Be careful though as the color interpolation is not perfect.\n\n# library(RColorBrewer)\nmapview(chip_with_geographies_for_interactive_mapping,\n col.regions = c(\n palette_urbn_green[6],\n \"white\",\n palette_urbn_cyan[6]\n ),\n zcol = \"chip_enrollment\"\n)\n\n\n\n\n\n\nIf you want to color in all points/polygons as the same color, just feed in a single color hex code to the col.regions argument:\n\nmapview(chip_with_geographies_for_interactive_mapping,\n col.regions = palette_urbn_green[5]\n)\n\n\n\n\n\n\n\n\nAdding layers\nYou can add multiple sf objects on the same map by using the + operator. This is very useful when comparing 2 or more spatial datasets.\n\nmapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) +\n mapview(state_capitals, col.regions = palette_urbn_cyan[5])\n\n\n\n\n\n\nYou can even create slider maps by using the | operator!\n\nmapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) |\n mapview(state_capitals, col.regions = palette_urbn_cyan[5])\n\n\n\n\n\n\n\n\n\nMore details\nTo learn more about more advanced options with mapview maps, check out the documentation page and the reference manual.\nThere are also other interactive map making packages in R like leaflet (which mapview is a more user friendly wrapper of), tmap, and mapdeck. To learn about these other packages, this book chapter is a good starting point."
+ "text": "Interactive Maps\nInteractive maps can be a great exploratory tool to explore and understand your data. And luckily there are a lot of new R packages that make it really easy to create them. Interactive maps are powerful but we do not recommend them for official use in Urban publications as getting them in Urban styles and appropriate basemaps can be tricky (reach out to anarayanan@urban.org if you really want to include them).\n\nlibrary(mapview)\nlibrary(mapview) is probably the most user friendly of the interactive mapping R libraries. All you have to do to create an interactive map is:\n\nlibrary(mapview)\n\n\nchip_with_geographies_for_interactive_mapping <- chip_with_geographies %>%\n # Filter out AL and HI bc they would appear in Mexico. If you want AL, HI and\n # in the correct place in interactive maps, make sure to use tigris::states()\n filter(!state_abbv %in% c(\"AK\", \"HI\"))\n\nmapview(chip_with_geographies_for_interactive_mapping)\n\n\n\n\n\n\nWhen you click on an object, you get a popup table of all it’s attributes. And when you hover over an object, you get a popup with an object id.\nEach of the above behaviors can be changed if desired. As you’ll see in the below section, the syntax for library(mapview) is significantly different from library(ggplot2) so be careful!\n\nColoring in points/polygons\nIn order to create a choropleth map where we color in the points/polygons by a variable, we need to feed in a column name in quotes to thezcol argument inside the mapview() function:\n\n# Create interactive state map colored in by chip enrollment\nmapview(chip_with_geographies_for_interactive_mapping, zcol = \"chip_enrollment\")\n\n\n\n\n\n\nIf you want more granular control over the color palette for the legend can also feed in a vector of color hex codes to col.regions along with a column name to zcol. This will create a continuous color range along the provided colors. Be careful though as the color interpolation is not perfect.\n\n# library(RColorBrewer)\nmapview(chip_with_geographies_for_interactive_mapping,\n col.regions = c(\n palette_urbn_green[6],\n \"white\",\n palette_urbn_cyan[6]\n ),\n zcol = \"chip_enrollment\"\n)\n\n\n\n\n\n\nIf you want to color in all points/polygons as the same color, just feed in a single color hex code to the col.regions argument:\n\nmapview(chip_with_geographies_for_interactive_mapping,\n col.regions = palette_urbn_green[5]\n)\n\n\n\n\n\n\n\n\nAdding layers\nYou can add multiple sf objects on the same map by using the + operator. This is very useful when comparing 2 or more spatial datasets.\n\nmapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) +\n mapview(state_capitals, col.regions = palette_urbn_cyan[5])\n\n\n\n\n\n\nYou can even create slider maps by using the | operator!\n\nmapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) |\n mapview(state_capitals, col.regions = palette_urbn_cyan[5])\n\n\n\n\n\n\n\n\n\nMore details\nTo learn more about more advanced options with mapview maps, check out the documentation page and the reference manual.\nThere are also other interactive map making packages in R like leaflet (which mapview is a more user friendly wrapper of), tmap, and mapdeck. To learn about these other packages, this book chapter is a good starting point."
},
{
"objectID": "mapping.html#cropping",
"href": "mapping.html#cropping",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Cropping",
- "text": "Cropping\nCropping (or clipping) is geographically filtering an sf dataframe to just the area we are interested in. Say we wanted to look at the roads around Fire Station 24 in DC.\n\nlibrary(tigris)\nlibrary(units)\n\ndc_firestations <- dc_firestations %>%\n st_transform(\"EPSG:6487\")\n\n\n# Draw 500 meter circle around one fire station\nfire_station_24_buffered <- dc_firestations %>%\n filter(NAME == \"Engine 24 Station\") %>%\n st_buffer(set_units(500, \"meter\"))\n\n# Get listing of all roads in DC\ndc_roads <- roads(\n state = \"DC\",\n county = \"District of Columbia\",\n class = \"sf\",\n progress_bar = FALSE\n) %>%\n st_transform(\"EPSG:6487\")\n\n# View roads on top of fire_station\nggplot() +\n # Order matters! We need to plot fire_stations first, and then roads on top\n # to see overlapping firestations\n geom_sf(\n data = fire_station_24_buffered,\n fill = palette_urbn_cyan[1],\n color = palette_urbn_cyan[7]\n ) +\n geom_sf(\n data = dc_roads,\n color = palette_urbn_gray[7]\n ) +\n theme_urbn_map()\n\n\n\n\nWe can clip the larger roads dataframe to just roads that overlap with the circle around the fire station with st_intersection().\n\n# Use st_intersection() to crop the roads data to just roads within the\n# fire_station radius\ndc_roads_around_fire_station_24_buffered <- fire_station_24_buffered %>%\n st_intersection(dc_roads)\n\nggplot() +\n geom_sf(\n data = fire_station_24_buffered,\n fill = palette_urbn_cyan[1],\n color = palette_urbn_cyan[7]\n ) +\n geom_sf(\n data = dc_roads_around_fire_station_24_buffered,\n color = palette_urbn_gray[7]\n ) +\n theme_urbn_map()\n\n\n\n\nMore Coming Soon!"
+ "text": "Cropping\nCropping (or clipping) is geographically filtering an sf dataframe to just the area we are interested in. Say we wanted to look at the roads around Fire Station 24 in DC.\n\nlibrary(tigris)\nlibrary(units)\n\ndc_firestations <- dc_firestations %>%\n st_transform(\"EPSG:6487\")\n\n\n# Draw 500 meter circle around one fire station\nfire_station_24_buffered <- dc_firestations %>%\n filter(NAME == \"Engine 24 Station\") %>%\n st_buffer(set_units(500, \"meter\"))\n\n# Get listing of all roads in DC\ndc_roads <- roads(\n state = \"DC\",\n county = \"District of Columbia\",\n class = \"sf\",\n progress_bar = FALSE\n) %>%\n st_transform(\"EPSG:6487\")\n\n# View roads on top of fire_station\nggplot() +\n # Order matters! We need to plot fire_stations first, and then roads on top\n # to see overlapping firestations\n geom_sf(\n data = fire_station_24_buffered,\n fill = palette_urbn_cyan[1],\n color = palette_urbn_cyan[7]\n ) +\n geom_sf(\n data = dc_roads,\n color = palette_urbn_gray[7]\n ) +\n theme_urbn_map()\n\n\n\n\nWe can clip the larger roads dataframe to just roads that overlap with the circle around the fire station with st_intersection().\n\n# Use st_intersection() to crop the roads data to just roads within the\n# fire_station radius\ndc_roads_around_fire_station_24_buffered <- fire_station_24_buffered %>%\n st_intersection(dc_roads)\n\nggplot() +\n geom_sf(\n data = fire_station_24_buffered,\n fill = palette_urbn_cyan[1],\n color = palette_urbn_cyan[7]\n ) +\n geom_sf(\n data = dc_roads_around_fire_station_24_buffered,\n color = palette_urbn_gray[7]\n ) +\n theme_urbn_map()\n\n\n\n\nMore Coming Soon!"
},
{
"objectID": "mapping.html#calculating-distance",
"href": "mapping.html#calculating-distance",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Calculating Distance",
"text": "Calculating Distance"
},
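+ The Calculating Distance section above is still a stub. As a minimal, hedged sketch: sf's st_distance() returns a pairwise distance matrix between two sf objects. Reusing the projected dc_firestations object from the cropping section (an assumption), it looks like this:
+
+ library(sf)
+
+ # Pairwise distances (in meters, since EPSG:6487 is a meter-based projection)
+ # between the first five fire stations
+ st_distance(dc_firestations[1:5, ], dc_firestations[1:5, ])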
{
"objectID": "mapping.html#spatial-joins",
"href": "mapping.html#spatial-joins",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Spatial Joins",
"text": "Spatial Joins\n\nPoint to Polygon\n\n\nPolygon to Polygon"
},
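+ The Spatial Joins section above is still a stub. As a minimal, hedged sketch of a point-to-polygon join, sf's st_join() attaches the attributes of the polygon each point falls in (reusing dc_firestations and dc_tracts from earlier sections, an assumption; the two layers must share a CRS):
+
+ library(sf)
+ library(dplyr)
+
+ # Point-to-polygon: attach tract attributes to each fire station
+ firestations_with_tracts <- dc_firestations %>%
+   st_transform(st_crs(dc_tracts)) %>%
+   st_join(dc_tracts, join = st_within)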
{
"objectID": "mapping.html#aggregating",
"href": "mapping.html#aggregating",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Aggregating",
"text": "Aggregating"
},
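+ The Aggregating section above is also a stub. A common pattern is aggregating joined points up to their polygons; here is a hedged sketch, assuming the firestations_with_tracts object from the spatial join sketch above:
+
+ library(sf)
+ library(dplyr)
+
+ # Count fire stations per tract; drop geometry first so this is a plain
+ # data frame aggregation
+ stations_per_tract <- firestations_with_tracts %>%
+   st_drop_geometry() %>%
+   count(GEOID, name = "n_stations")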
{
"objectID": "mapping.html#drivetransit-times",
"href": "mapping.html#drivetransit-times",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Drive/Transit times",
"text": "Drive/Transit times"
},
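+ The Drive/Transit times section above is a stub as well. One option (an assumption on our part, not a documented Urban workflow) is the osrm package, which queries an OSRM routing server for travel-time matrices:
+
+ library(osrm)
+ library(sf)
+
+ # Drive-time matrix (in minutes) between the first three fire stations,
+ # using the public demo OSRM server (suitable for small tests only)
+ times <- osrmTable(loc = st_transform(dc_firestations[1:3, ], 4326))
+ times$durations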
{
"objectID": "mapping.html#geocoding",
"href": "mapping.html#geocoding",
- "title": "R@URBAN",
+ "title": "Introduction",
"section": "Geocoding",
- "text": "Geocoding\nGeocoding is the process of turning text (usually addresses) into geographic coordinates (usually latitudes/longitudes) for use in mapping. For Urban researchers, we highly recommend using the Urban geocoder as it is fast, accurate, designed to work with sensitive/confidential data and most importantly free to use for Urban researchers! To learn about how we set up and chose the geocoder for the Urban Institute, you can read our Data@Urban blog.\n\nCleaning Addresses\nThe single most important factor in getting accurate geocoded data is having cleaned, well structured address data. This can prove difficult as address data out in the wild is often messy and unstandardized. While the rules for cleaning addresses are very data specific, below are some examples of clean addresses you should aim for in your data cleaning process:\n\n\n\n\n\n\n \n \n \n f_address\n Type of address\n \n \n \n 123 Troy Drive, Pillowtown, CO, 92432\nresidnetial address\n 789 Abed Avenue, Apt 666, Blankesburg, CO, 92489\nresidential apartment address\n Shirley Boulevard and Britta Drive, Blanketsburg, CO, 92489\nstreet intersection\n Pillowtown, CO\ncity\n 92489, CO\nZip Code\n \n \n \n\n\n\n\nAll that being said, our geocoder is pretty tolerant of different address formats, typos/spelling errors and missing states, zip codes, etc. So don’t spend too much time cleaning every address in the data. Also note that while our geocoder is able to geocode cities and zip codes, it will return the lat/lon of the center of the city/zip code, which may not be what you want.\n\n\nInstructions\nTo use the Urban geocoder, you will need to:\n\nGenerate a CSV with a column named f_address which contains the addresses in single line format (ie 123 Abed Avenue, Blanketsburg, CO, 94328). This means that if you have the addresses split across multiple columns (ie Address, City, State, Zip columns), you will need to concatenate them into one column. Also see our Address cleaning section above.\nGo to the Urban geocoder and answer the initial questions. This will tell you whether your data is non-confidential or confidential data, and allow you to upload your CSV for geocoding.\nWait for an email telling you your results are ready. If your data is non-confidential, this email will contain a link to your geocoded results. This link expires in 24 hours, so make sure to download your data before then. If you data is confidential, the email will contain a link to the location on the Y Drive where your confidential geocoded data is stored. You can specify this output folder when submitting the CSV in step 1.\n\n\n\nGeocoder outputs\n\nThe geocoded file will be your original data, plus a few more columns (including latitude and longitude). each of the new columns that have been appended to your original data. It’s very important that you take a look at the Addr_type column in the CSV before doing further analysis to check the accuracy of the geocoding process.\n\n\n\n\n\n\n\n\nColumn\nDescription\n\n\n\n\nMatch_addr\nThe actual address that the inputted address was matched to. This is the address that the geocoder used to get Latitudes / Longitudes. 
If there are potentially many typos or non standard address formats in your data file, you will want to take a close look at this column to confirm that the matched address correctly handled typos and badly formatted addresses.\n\n\nLongitude\nThe WGS 84 datum Longitude (EPSG code 4326)\n\n\nLatitude\nThe WGS 84 datum Latitude (EPSG code 4326)\n\n\nAddr_type\nThe match level for a geocode request. This should be used as an indicator of the precision of geocode results. Generally, Subaddress, PointAddress, StreetAddress, and StreetInt represent accurate matches. The list below contains all possible values for this field. Green values represent High accuracy matches, yellow represents Medium accuracy matches and red represents Low accuracy/inaccurate matches. If you have many yellow and red values in your data, you should manually check the results before proceeding with analysis. All possible values:\n\nSubaddress: A subset of a PointAddress that represents a house or building subaddress location, such as an apartment unit, floor, or individual building within a complex. The UnitName, UnitType, LevelName, LevelType, BldgName, and BldgType field values help to distinguish subaddresses which may be associated with the same PointAddress. Reference data consists of point features with associated house number, street name, and subaddress elements, along with administrative divisions and optional postal code; for example, 3836 Emerald Ave, Suite C, La Verne, CA, 91750.\n\nPointAddress: A street address based on points that represent house and building locations. Typically, this is the most spatially accurate match level. Reference data contains address points with associated house numbers and street names, along with administrative divisions and optional postal code. The X / Y (Longitude/Latitude) and geometry output values for a PointAddress match represent the street entry location for the address; this is the location used for routing operations. The DisplayX and DisplayY values represent the rooftop, or actual, location of the address. Example: 380 New York St, Redlands, CA, 92373.\n\nStreetAddress — A street address that differs from PointAddress because the house number is interpolated from a range of numbers. Reference data contains street center lines with house number ranges, along with administrative divisions and optional postal code information, for example, 647 Haight St, San Francisco, CA, 94117.\n\nStreetInt: A street address consisting of a street intersection along with city and optional state and postal code information. This is derived from StreetAddress reference data, for example, Redlands Blvd & New York St, Redlands, CA, 92373.\n\nStreetName: Similar to a street address but without the house number. Reference data contains street centerlines with associated street names (no numbered address ranges), along with administrative divisions and optional postal code, for example, W Olive Ave, Redlands, CA, 92373.\n\nStreetAddressExt: An interpolated street address match that is returned when parameter matchOutOfRange=true and the input house number exceeds the house number range for the matched street segment.\n\nDistanceMarker: A street address that represents the linear distance along a street, typically in kilometers or miles, from a designated origin location. Example: Carr 682 KM 4, Barceloneta, 00617.\n\nPostalExt: A postal code with an additional extension, such as the United States Postal Service ZIP+4. 
Reference data is postal code points with extensions, for example, 90210-3841.\n\nPOI: —Points of interest. Reference data consists of administrative division place-names, businesses, landmarks, and geographic features, for example, Golden Gate Bridge.\n\nLocality: A place-name representing a populated place. The Type output field provides more detailed information about the type of populated place. Possible Type values for Locality matches include Block, Sector, Neighborhood, District, City, MetroArea, County, State or Province, Territory, Country, and Zone. Example: Bogotá, COL,\n\nPostalLoc: A combination of postal code and city name. Reference data is typically a union of postal boundaries and administrative (locality) boundaries, for example, 7132 Frauenkirchen.\n\nPostal: Postal code. Reference data is postal code points, for example, 90210 USA.\n\n\nScore\nA number from 1–100 indicating the degree to which the input tokens in a geocoding request match the address components in a candidate record. A score of 100 represents a perfect match, while lower scores represent decreasing match accuracy.\n\n\nStatus\nIndicates whether a batch geocode request results in a match, tie, or unmatched. Possible values include\n\nM - Match. The returned address matches the input address and is the highest scoring candidate.\n\nT - Tied. The returned address matches the input address but has the same score as one or more additional candidates.\n\nU - Unmatched. No addresses match the inputted address.\n\n\ngeometry\nThe WKT (Well-known text) representation of the latitudes and longitudes. This column may be useful if you’re reading the CSV into R, Python, or ArcGIS\n\n\nRegion\nThe state that Match_addr is located in\n\n\nRegionAbbr\nAbbreviated State Name. For example, CA for California\n\n\nSubregion\nThe county that the input address is located in\n\n\nMetroArea\nThe name of the Metropolitan area that Match_addr is located in. This field may be blank if the input address is not located within a metro area.\n\n\nCity\nThe city that Match_addr is located in\n\n\nNbrhd\nThe Neighborhood that Match_addr is located in. Note these are ESRI defined neighborhoods which may or may not align with other sources neighborhood definitions"
+ "text": "Geocoding\nGeocoding is the process of turning text (usually addresses) into geographic coordinates (usually latitudes/longitudes) for use in mapping. For Urban researchers, we highly recommend using the Urban geocoder as it is fast, accurate, designed to work with sensitive/confidential data and most importantly free to use for Urban researchers! To learn about how we set up and chose the geocoder for the Urban Institute, you can read our Data@Urban blog.\n\nCleaning Addresses\nThe single most important factor in getting accurate geocoded data is having cleaned, well structured address data. This can prove difficult as address data out in the wild is often messy and unstandardized. While the rules for cleaning addresses are very data specific, below are some examples of clean addresses you should aim for in your data cleaning process:\n\n\n\n\n\n\n\n\n\nf_address\nType of address\n\n\n\n\n123 Troy Drive, Pillowtown, CO, 92432\nresidnetial address\n\n\n789 Abed Avenue, Apt 666, Blankesburg, CO, 92489\nresidential apartment address\n\n\nShirley Boulevard and Britta Drive, Blanketsburg, CO, 92489\nstreet intersection\n\n\nPillowtown, CO\ncity\n\n\n92489, CO\nZip Code\n\n\n\n\n\n\n\nAll that being said, our geocoder is pretty tolerant of different address formats, typos/spelling errors and missing states, zip codes, etc. So don’t spend too much time cleaning every address in the data. Also note that while our geocoder is able to geocode cities and zip codes, it will return the lat/lon of the center of the city/zip code, which may not be what you want.\n\n\nInstructions\nTo use the Urban geocoder, you will need to:\n\nGenerate a CSV with a column named f_address which contains the addresses in single line format (ie 123 Abed Avenue, Blanketsburg, CO, 94328). This means that if you have the addresses split across multiple columns (ie Address, City, State, Zip columns), you will need to concatenate them into one column. Also see our Address cleaning section above.\nGo to the Urban geocoder and answer the initial questions. This will tell you whether your data is non-confidential or confidential data, and allow you to upload your CSV for geocoding.\nWait for an email telling you your results are ready. If your data is non-confidential, this email will contain a link to your geocoded results. This link expires in 24 hours, so make sure to download your data before then. If you data is confidential, the email will contain a link to the location on the Y Drive where your confidential geocoded data is stored. You can specify this output folder when submitting the CSV in step 1.\n\n\n\nGeocoder outputs\n\nThe geocoded file will be your original data, plus a few more columns (including latitude and longitude). each of the new columns that have been appended to your original data. It’s very important that you take a look at the Addr_type column in the CSV before doing further analysis to check the accuracy of the geocoding process.\n\n\n\n\n\n\n\n\nColumn\nDescription\n\n\n\n\nMatch_addr\nThe actual address that the inputted address was matched to. This is the address that the geocoder used to get Latitudes / Longitudes. 
If there are potentially many typos or non-standard address formats in your data file, you will want to take a close look at this column to confirm that the matched address correctly handled typos and badly formatted addresses.\n\n\nLongitude\nThe WGS 84 datum Longitude (EPSG code 4326)\n\n\nLatitude\nThe WGS 84 datum Latitude (EPSG code 4326)\n\n\nAddr_type\nThe match level for a geocode request. This should be used as an indicator of the precision of geocode results. Generally, Subaddress, PointAddress, StreetAddress, and StreetInt represent accurate matches. The list below contains all possible values for this field. Green values represent high-accuracy matches, yellow represents medium-accuracy matches, and red represents low-accuracy/inaccurate matches. If you have many yellow and red values in your data, you should manually check the results before proceeding with analysis. All possible values:\n\nSubaddress: A subset of a PointAddress that represents a house or building subaddress location, such as an apartment unit, floor, or individual building within a complex. The UnitName, UnitType, LevelName, LevelType, BldgName, and BldgType field values help to distinguish subaddresses which may be associated with the same PointAddress. Reference data consists of point features with associated house number, street name, and subaddress elements, along with administrative divisions and optional postal code; for example, 3836 Emerald Ave, Suite C, La Verne, CA, 91750.\n\nPointAddress: A street address based on points that represent house and building locations. Typically, this is the most spatially accurate match level. Reference data contains address points with associated house numbers and street names, along with administrative divisions and optional postal code. The X / Y (Longitude/Latitude) and geometry output values for a PointAddress match represent the street entry location for the address; this is the location used for routing operations. The DisplayX and DisplayY values represent the rooftop, or actual, location of the address. Example: 380 New York St, Redlands, CA, 92373.\n\nStreetAddress: A street address that differs from PointAddress because the house number is interpolated from a range of numbers. Reference data contains street centerlines with house number ranges, along with administrative divisions and optional postal code information, for example, 647 Haight St, San Francisco, CA, 94117.\n\nStreetInt: A street address consisting of a street intersection along with city and optional state and postal code information. This is derived from StreetAddress reference data, for example, Redlands Blvd & New York St, Redlands, CA, 92373.\n\nStreetName: Similar to a street address but without the house number. Reference data contains street centerlines with associated street names (no numbered address ranges), along with administrative divisions and optional postal code, for example, W Olive Ave, Redlands, CA, 92373.\n\nStreetAddressExt: An interpolated street address match that is returned when the parameter matchOutOfRange=true and the input house number exceeds the house number range for the matched street segment.\n\nDistanceMarker: A street address that represents the linear distance along a street, typically in kilometers or miles, from a designated origin location. Example: Carr 682 KM 4, Barceloneta, 00617.\n\nPostalExt: A postal code with an additional extension, such as the United States Postal Service ZIP+4. 
Reference data is postal code points with extensions, for example, 90210-3841.\n\nPOI: Points of interest. Reference data consists of administrative division place-names, businesses, landmarks, and geographic features, for example, Golden Gate Bridge.\n\nLocality: A place-name representing a populated place. The Type output field provides more detailed information about the type of populated place. Possible Type values for Locality matches include Block, Sector, Neighborhood, District, City, MetroArea, County, State or Province, Territory, Country, and Zone. Example: Bogotá, COL.\n\nPostalLoc: A combination of postal code and city name. Reference data is typically a union of postal boundaries and administrative (locality) boundaries, for example, 7132 Frauenkirchen.\n\nPostal: Postal code. Reference data is postal code points, for example, 90210 USA.\n\n\nScore\nA number from 1–100 indicating the degree to which the input tokens in a geocoding request match the address components in a candidate record. A score of 100 represents a perfect match, while lower scores represent decreasing match accuracy.\n\n\nStatus\nIndicates whether a batch geocode request results in a match, tie, or unmatched. Possible values include:\n\nM - Match. The returned address matches the input address and is the highest-scoring candidate.\n\nT - Tied. The returned address matches the input address but has the same score as one or more additional candidates.\n\nU - Unmatched. No addresses match the inputted address.\n\n\ngeometry\nThe WKT (Well-known text) representation of the latitudes and longitudes. This column may be useful if you’re reading the CSV into R, Python, or ArcGIS.\n\n\nRegion\nThe state that Match_addr is located in\n\n\nRegionAbbr\nAbbreviated state name. For example, CA for California\n\n\nSubregion\nThe county that the input address is located in\n\n\nMetroArea\nThe name of the metropolitan area that Match_addr is located in. This field may be blank if the input address is not located within a metro area.\n\n\nCity\nThe city that Match_addr is located in\n\n\nNbrhd\nThe neighborhood that Match_addr is located in. Note these are Esri-defined neighborhoods, which may or may not align with other sources’ neighborhood definitions."
},
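+ As a quick sketch of step 1 of the instructions above (the column names here are hypothetical), you can build the single-line f_address column from split address columns with tidyr::unite():
+
+ library(tidyverse)
+
+ # Hypothetical input with the address split across columns
+ addresses <- tibble(
+   address = "123 Troy Drive",
+   city    = "Pillowtown",
+   state   = "CO",
+   zip     = "92432"
+ )
+
+ # Concatenate into the single f_address column the geocoder expects
+ addresses <- addresses %>%
+   unite("f_address", address, city, state, zip, sep = ", ")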
{
- "objectID": "graphics-guide.html#sankey-plot",
- "href": "graphics-guide.html#sankey-plot",
- "title": "R@URBAN",
- "section": "Sankey Plot",
- "text": "Sankey Plot\n\nSankey plots visualize flows from one set of variables to another. This can be useful for showing outcomes from the start of a program to the end. You’ll need to install the ggsankey package to create Sankey plots in R. In this example I make a dummy data set of housing status prior to program start and at exit to show the flow of people between outcomes. A key step is to transform your data set using the make_long function from the package. This creates a data frame that specifies each of the initial nodes and how they flow into the next stage.\n\n# load ggsankey package\nremotes::install_github(\"davidsjoberg/ggsankey\")\nlibrary(ggsankey)\n\n# create a dummy dataset of housing status\ndf <- data_frame(entry_status = c(rep(\"Housed\", 7), rep(\"Unhoused\", 15), rep(\"Staying w/ Family\", 8)), \n exit_status = c(rep(\"Housed\", 15), rep(\"Unhoused\", 2), rep(\"Staying w/ Family\", 13))) %>% \n # transform the data frame into the proper format for the sankey plot\n make_long(entry_status, exit_status) %>% \n # recode the labels to be cleaner in the plot \n mutate(x = recode(x, entry_status = \"Prior Housing Status\", exit_status = \"Exit Housing Status\"),\n next_x = recode(next_x, entry_status = \"Prior Housing Status\", exit_status = \"Exit Housing Status\"))\n\n# create sankey plot\nggplot(df, aes(x = x, \n next_x = next_x, \n node = node, \n next_node = next_node,\n fill = factor(node), \n label = node)) +\n geom_sankey(flow.alpha = 0.5, node.color = 1, show.legend = FALSE) +\n # add labels to plot and style\n geom_sankey_label(size = 3.5, color = 1, fill = \"white\") +\n theme_sankey(base_size = 16)+\n labs(x = NULL)"
+ "objectID": "resources.html",
+ "href": "resources.html",
+ "title": "Free Books",
+ "section": "",
+ "text": "Free Books\n\nIntro\n\nR for Data Science by Garrett Grolemund and Hadley Wickham\n\n\n\nData Viz\n\nggplot2: Elegant Graphics for Data Analysis by Hadley Wickham\nData Visualization - A practical introduction by Kieran Healy\n\n\n\n*down\n\nR Markdown: The Definitive Guide by Yihui Xie, J. J. Allaire, and Garrett Grolemund\nblogdown: Creating Websites with R Markdown by Yihui Xie, Amber Thomas, and Alison Presmanes Hill\nbookdown: Authoring Books and Technical Documents with R Markdown by Yihui Xie\n\n\n\nStatistics\n\nLearning Statistics with R by Danielle Navarro\nIntroduction to Econometrics with R by Christoph Hanck, Martin Arnold, Alexander Gerber and Martin Schmelzer\nAn Introduction to Bayesian Thinking by Merlise Clyde et. al.\nStatistical Inference via Data Science by Chester Ismay and Albert Y. Kim\n\n\n\nMachine Learning\n\nHands-On Machine Learning with R by Bradley Boehmke & Brandon Greenwell\nFeature Engineering and Selection: A Practical Approach for Predictive Models by Max Kuhn and Kjell Johnson\n\n\n\nMapping and Geospatial Analysis\n\nGeocomputation with R by Robin Lovelace, Jakub Nowosad, Jannes Muenchow\n\n\n\nText Analysis\n\nText Mining with R A Tidy Approach by Julia Silge and David Robinson\n\n\n\nProgramming\n\nAdvanced R by Hadley Wickham\nR Packages by Hadley Wickham\nMaster Spark with R by Javier Luraschi, Kevin Kuo, and Edgar Ruiz\nFunctional programming and unit testing for data munging with R by Bruno Rodrigues\n\n\n\n\nWebsites\n\nRStudio Essentials\nRStudio Education\nR Cheat Sheets\nAndrew Heiss’ free Data Viz Course"
+ },
+ {
+ "objectID": "getting-data.html#librarytidycensus",
+ "href": "getting-data.html#librarytidycensus",
+ "title": "Introduction",
+ "section": "library(tidycensus)",
+ "text": "library(tidycensus)\nlibrary(tidycensus) by Kyle Walker (complete intro here) is the best tool for accessing some Census data sets in R from the Census Bureau API. The package returns tidy data frames and can easily pull shapefiles by adding geometry = TRUE.\nYou will need to apply for a Census API key and add it to your R session. Don’t add your API key to your script and don’t add it to a GitHub repository!\nHere is a simple example for one state with shapefiles:\n\nlibrary(tidyverse)\nlibrary(purrr)\nlibrary(tidycensus)\n\n# pull median household income and shapefiles for Census tracts in Alabama\nget_acs(geography = \"tract\", \n variables = \"B19013_001\", \n state = \"01\",\n year = 2015,\n geometry = TRUE,\n progress = FALSE)\n\nSimple feature collection with 1181 features and 5 fields (with 1 geometry empty)\nGeometry type: MULTIPOLYGON\nDimension: XY\nBounding box: xmin: -88.47323 ymin: 30.22333 xmax: -84.88908 ymax: 35.00803\nGeodetic CRS: NAD83\nFirst 10 features:\n GEOID NAME variable\n1 01003010500 Census Tract 105, Baldwin County, Alabama B19013_001\n2 01003011501 Census Tract 115.01, Baldwin County, Alabama B19013_001\n3 01009050500 Census Tract 505, Blount County, Alabama B19013_001\n4 01015981901 Census Tract 9819.01, Calhoun County, Alabama B19013_001\n5 01025957700 Census Tract 9577, Clarke County, Alabama B19013_001\n6 01025958002 Census Tract 9580.02, Clarke County, Alabama B19013_001\n7 01031011000 Census Tract 110, Coffee County, Alabama B19013_001\n8 01033020500 Census Tract 205, Colbert County, Alabama B19013_001\n9 01037961200 Census Tract 9612, Coosa County, Alabama B19013_001\n10 01039961700 Census Tract 9617, Covington County, Alabama B19013_001\n estimate moe geometry\n1 41944 8100 MULTIPOLYGON (((-87.80249 3...\n2 41417 14204 MULTIPOLYGON (((-87.71719 3...\n3 40055 8054 MULTIPOLYGON (((-86.75735 3...\n4 NA NA MULTIPOLYGON (((-86.01323 3...\n5 32708 4806 MULTIPOLYGON (((-88.1805 31...\n6 29048 14759 MULTIPOLYGON (((-87.98623 3...\n7 44732 7640 MULTIPOLYGON (((-85.92018 3...\n8 49052 6543 MULTIPOLYGON (((-87.76733 3...\n9 31957 9954 MULTIPOLYGON (((-86.46069 3...\n10 32697 6021 MULTIPOLYGON (((-86.6998 31...\n\n\nSmaller geographies like Census tracts can only be pulled state-by-state. This example demonstrates how to iterate across FIPS codes to pull Census tracts for multiple states. 
The process is as follows:\n\nPick the variables of interest\nCreate a vector of state FIPS codes for the states of interest\nCreate a custom function that works on a single state FIPS code\nIterate the function along the vector of state FIPS codes with map_df() from library(purrr)\n\nHere is an example that pulls median household income at the Census tract level for multiple states:\n\n# variables of interest\nvars <- c(\n \"B19013_001\" # median household income estimate\n)\n\n# states of interest: alabama, alaska, arizona\nstate_fips <- c(\"01\", \"02\", \"04\")\n \n# create a custom function that works for one state\nget_income <- function(state_fips) {\n \n income_data <- get_acs(geography = \"tract\", \n variables = vars, \n state = state_fips,\n year = 2015)\n \n return(income_data)\n \n}\n\n# iterate the function\nmap_df(.x = state_fips, # iterate along the vector of state fips codes\n .f = get_income) # apply get_income() to each fips_code \n\n# A tibble: 2,874 × 5\n GEOID NAME varia…¹ estim…² moe\n <chr> <chr> <chr> <dbl> <dbl>\n 1 01001020100 Census Tract 201, Autauga County, Alabama B19013… 61838 11900\n 2 01001020200 Census Tract 202, Autauga County, Alabama B19013… 32303 13538\n 3 01001020300 Census Tract 203, Autauga County, Alabama B19013… 44922 5629\n 4 01001020400 Census Tract 204, Autauga County, Alabama B19013… 54329 7003\n 5 01001020500 Census Tract 205, Autauga County, Alabama B19013… 51965 6935\n 6 01001020600 Census Tract 206, Autauga County, Alabama B19013… 63092 9585\n 7 01001020700 Census Tract 207, Autauga County, Alabama B19013… 34821 7867\n 8 01001020801 Census Tract 208.01, Autauga County, Alaba… B19013… 73728 2447\n 9 01001020802 Census Tract 208.02, Autauga County, Alaba… B19013… 60063 8602\n10 01001020900 Census Tract 209, Autauga County, Alabama B19013… 41287 7857\n# … with 2,864 more rows, and abbreviated variable names ¹variable, ²estimate\n\n\nlibrary(tidycensus) works well with library(tidyverse) and enables access to geospatial data, but it is limited to only some Census Bureau data sets. The next package has less functionality but allows for accessing any data available on the Census API."
+ },
+ {
+ "objectID": "getting-data.html#librarycensusapi",
+ "href": "getting-data.html#librarycensusapi",
+ "title": "Introduction",
+ "section": "library(censusapi)",
+ "text": "library(censusapi)\nlibrary(censusapi) by Hannah Recht (complete intro here) can access any published table that is accessible through the Census Bureau API. A full listing is available here.\nYou will need to apply for a Census API key and add it to your R session. Don’t add your API key to your script and don’t add it to a GitHub repository!\nHere is a simple example that pulls median household income and its margin of error for Census tracts in Alabama:\n\nlibrary(tidyverse)\nlibrary(purrr)\nlibrary(censusapi)\nvars <- c(\n \"B19013_001E\", # median household income estimate\n \"B19013_001M\" # median household income margin of error\n)\n\ngetCensus(name = \"acs/acs5\",\n key = Sys.getenv(\"CENSUS_API_KEY\"),\n vars = vars, \n region = \"tract:*\",\n regionin = \"state:01\",\n vintage = 2015) %>%\n as_tibble()\n\n# A tibble: 1,181 × 5\n state county tract B19013_001E B19013_001M\n <chr> <chr> <chr> <dbl> <dbl>\n 1 01 103 005109 29644 4098\n 2 01 103 005106 35864 3443\n 3 01 103 005107 66739 5468\n 4 01 103 005108 64632 9804\n 5 01 103 005701 46306 7926\n 6 01 103 005702 47769 12939\n 7 01 105 686800 30662 7299\n 8 01 009 050102 43325 9484\n 9 01 009 050300 37548 9655\n10 01 009 050700 46452 5167\n# … with 1,171 more rows\n\n\nSmaller geographies like Census tracts can only be pulled state-by-state. This example demonstrates how to iterate across FIPS codes to pull Census tracts for multiple states. The process is as follows:\n\nPick the variables of interest\nCreate a vector of state FIPS codes for the states of interest\nCreate a custom function that works on a single state FIPS code\nIterate the function along the vector of state FIPS codes with map_df() from library(purrr)\n\nHere is an example that pulls median household income at the Census tract level for multiple states:\n\n# variables of interest\nvars <- c(\n \"B19013_001E\", # median household income estimate\n \"B19013_001M\" # median household income margin of error\n)\n\n# states of interest: alabama, alaska, arizona\nstate_fips <- c(\"01\", \"02\", \"04\")\n \n# create a custom function that works for one state\nget_income <- function(state_fips) {\n \n income_data <- getCensus(name = \"acs/acs5\", \n key = Sys.getenv(\"CENSUS_API_KEY\"),\n vars = vars, \n region = \"tract:*\",\n regionin = paste0(\"state:\", state_fips),\n vintage = 2015)\n \n return(income_data)\n \n}\n\n# iterate the function\nmap_df(.x = state_fips, # iterate along the vector of state fips codes\n .f = get_income) %>% # apply get_income() to each fips_code \n as_tibble() \n\n# A tibble: 2,874 × 5\n state county tract B19013_001E B19013_001M\n <chr> <chr> <chr> <dbl> <dbl>\n 1 01 103 005109 29644 4098\n 2 01 103 005106 35864 3443\n 3 01 103 005107 66739 5468\n 4 01 103 005108 64632 9804\n 5 01 103 005701 46306 7926\n 6 01 103 005702 47769 12939\n 7 01 105 686800 30662 7299\n 8 01 009 050102 43325 9484\n 9 01 009 050300 37548 9655\n10 01 009 050700 46452 5167\n# … with 2,864 more rows"
}
]
\ No newline at end of file
diff --git a/docs/site_libs/HomeButton-0.0.1/.home-button 2.css.icloud b/docs/site_libs/HomeButton-0.0.1/.home-button 2.css.icloud
new file mode 100644
index 0000000..9f926dd
Binary files /dev/null and b/docs/site_libs/HomeButton-0.0.1/.home-button 2.css.icloud differ
diff --git a/docs/site_libs/bootstrap/.bootstrap-icons 2.css.icloud b/docs/site_libs/bootstrap/.bootstrap-icons 2.css.icloud
deleted file mode 100644
index 5df6670..0000000
Binary files a/docs/site_libs/bootstrap/.bootstrap-icons 2.css.icloud and /dev/null differ
diff --git a/docs/site_libs/bootstrap/.bootstrap-icons 2.woff.icloud b/docs/site_libs/bootstrap/.bootstrap-icons 2.woff.icloud
deleted file mode 100644
index 27db1a9..0000000
Binary files a/docs/site_libs/bootstrap/.bootstrap-icons 2.woff.icloud and /dev/null differ
diff --git a/docs/site_libs/bootstrap/bootstrap-icons.css b/docs/site_libs/bootstrap/bootstrap-icons.css
index f51d04b..94f1940 100644
--- a/docs/site_libs/bootstrap/bootstrap-icons.css
+++ b/docs/site_libs/bootstrap/bootstrap-icons.css
@@ -1,7 +1,8 @@
@font-face {
+ font-display: block;
font-family: "bootstrap-icons";
src:
-url("./bootstrap-icons.woff?524846017b983fc8ded9325d94ed40f3") format("woff");
+url("./bootstrap-icons.woff?2ab2cbbe07fcebb53bdaa7313bb290f2") format("woff");
}
.bi::before,
@@ -1702,3 +1703,316 @@ url("./bootstrap-icons.woff?524846017b983fc8ded9325d94ed40f3") format("woff");
.bi-filetype-json::before { content: "\f791"; }
.bi-filetype-pptx::before { content: "\f792"; }
.bi-filetype-xlsx::before { content: "\f793"; }
+.bi-1-circle-1::before { content: "\f794"; }
+.bi-1-circle-fill-1::before { content: "\f795"; }
+.bi-1-circle-fill::before { content: "\f796"; }
+.bi-1-circle::before { content: "\f797"; }
+.bi-1-square-fill::before { content: "\f798"; }
+.bi-1-square::before { content: "\f799"; }
+.bi-2-circle-1::before { content: "\f79a"; }
+.bi-2-circle-fill-1::before { content: "\f79b"; }
+.bi-2-circle-fill::before { content: "\f79c"; }
+.bi-2-circle::before { content: "\f79d"; }
+.bi-2-square-fill::before { content: "\f79e"; }
+.bi-2-square::before { content: "\f79f"; }
+.bi-3-circle-1::before { content: "\f7a0"; }
+.bi-3-circle-fill-1::before { content: "\f7a1"; }
+.bi-3-circle-fill::before { content: "\f7a2"; }
+.bi-3-circle::before { content: "\f7a3"; }
+.bi-3-square-fill::before { content: "\f7a4"; }
+.bi-3-square::before { content: "\f7a5"; }
+.bi-4-circle-1::before { content: "\f7a6"; }
+.bi-4-circle-fill-1::before { content: "\f7a7"; }
+.bi-4-circle-fill::before { content: "\f7a8"; }
+.bi-4-circle::before { content: "\f7a9"; }
+.bi-4-square-fill::before { content: "\f7aa"; }
+.bi-4-square::before { content: "\f7ab"; }
+.bi-5-circle-1::before { content: "\f7ac"; }
+.bi-5-circle-fill-1::before { content: "\f7ad"; }
+.bi-5-circle-fill::before { content: "\f7ae"; }
+.bi-5-circle::before { content: "\f7af"; }
+.bi-5-square-fill::before { content: "\f7b0"; }
+.bi-5-square::before { content: "\f7b1"; }
+.bi-6-circle-1::before { content: "\f7b2"; }
+.bi-6-circle-fill-1::before { content: "\f7b3"; }
+.bi-6-circle-fill::before { content: "\f7b4"; }
+.bi-6-circle::before { content: "\f7b5"; }
+.bi-6-square-fill::before { content: "\f7b6"; }
+.bi-6-square::before { content: "\f7b7"; }
+.bi-7-circle-1::before { content: "\f7b8"; }
+.bi-7-circle-fill-1::before { content: "\f7b9"; }
+.bi-7-circle-fill::before { content: "\f7ba"; }
+.bi-7-circle::before { content: "\f7bb"; }
+.bi-7-square-fill::before { content: "\f7bc"; }
+.bi-7-square::before { content: "\f7bd"; }
+.bi-8-circle-1::before { content: "\f7be"; }
+.bi-8-circle-fill-1::before { content: "\f7bf"; }
+.bi-8-circle-fill::before { content: "\f7c0"; }
+.bi-8-circle::before { content: "\f7c1"; }
+.bi-8-square-fill::before { content: "\f7c2"; }
+.bi-8-square::before { content: "\f7c3"; }
+.bi-9-circle-1::before { content: "\f7c4"; }
+.bi-9-circle-fill-1::before { content: "\f7c5"; }
+.bi-9-circle-fill::before { content: "\f7c6"; }
+.bi-9-circle::before { content: "\f7c7"; }
+.bi-9-square-fill::before { content: "\f7c8"; }
+.bi-9-square::before { content: "\f7c9"; }
+.bi-airplane-engines-fill::before { content: "\f7ca"; }
+.bi-airplane-engines::before { content: "\f7cb"; }
+.bi-airplane-fill::before { content: "\f7cc"; }
+.bi-airplane::before { content: "\f7cd"; }
+.bi-alexa::before { content: "\f7ce"; }
+.bi-alipay::before { content: "\f7cf"; }
+.bi-android::before { content: "\f7d0"; }
+.bi-android2::before { content: "\f7d1"; }
+.bi-box-fill::before { content: "\f7d2"; }
+.bi-box-seam-fill::before { content: "\f7d3"; }
+.bi-browser-chrome::before { content: "\f7d4"; }
+.bi-browser-edge::before { content: "\f7d5"; }
+.bi-browser-firefox::before { content: "\f7d6"; }
+.bi-browser-safari::before { content: "\f7d7"; }
+.bi-c-circle-1::before { content: "\f7d8"; }
+.bi-c-circle-fill-1::before { content: "\f7d9"; }
+.bi-c-circle-fill::before { content: "\f7da"; }
+.bi-c-circle::before { content: "\f7db"; }
+.bi-c-square-fill::before { content: "\f7dc"; }
+.bi-c-square::before { content: "\f7dd"; }
+.bi-capsule-pill::before { content: "\f7de"; }
+.bi-capsule::before { content: "\f7df"; }
+.bi-car-front-fill::before { content: "\f7e0"; }
+.bi-car-front::before { content: "\f7e1"; }
+.bi-cassette-fill::before { content: "\f7e2"; }
+.bi-cassette::before { content: "\f7e3"; }
+.bi-cc-circle-1::before { content: "\f7e4"; }
+.bi-cc-circle-fill-1::before { content: "\f7e5"; }
+.bi-cc-circle-fill::before { content: "\f7e6"; }
+.bi-cc-circle::before { content: "\f7e7"; }
+.bi-cc-square-fill::before { content: "\f7e8"; }
+.bi-cc-square::before { content: "\f7e9"; }
+.bi-cup-hot-fill::before { content: "\f7ea"; }
+.bi-cup-hot::before { content: "\f7eb"; }
+.bi-currency-rupee::before { content: "\f7ec"; }
+.bi-dropbox::before { content: "\f7ed"; }
+.bi-escape::before { content: "\f7ee"; }
+.bi-fast-forward-btn-fill::before { content: "\f7ef"; }
+.bi-fast-forward-btn::before { content: "\f7f0"; }
+.bi-fast-forward-circle-fill::before { content: "\f7f1"; }
+.bi-fast-forward-circle::before { content: "\f7f2"; }
+.bi-fast-forward-fill::before { content: "\f7f3"; }
+.bi-fast-forward::before { content: "\f7f4"; }
+.bi-filetype-sql::before { content: "\f7f5"; }
+.bi-fire::before { content: "\f7f6"; }
+.bi-google-play::before { content: "\f7f7"; }
+.bi-h-circle-1::before { content: "\f7f8"; }
+.bi-h-circle-fill-1::before { content: "\f7f9"; }
+.bi-h-circle-fill::before { content: "\f7fa"; }
+.bi-h-circle::before { content: "\f7fb"; }
+.bi-h-square-fill::before { content: "\f7fc"; }
+.bi-h-square::before { content: "\f7fd"; }
+.bi-indent::before { content: "\f7fe"; }
+.bi-lungs-fill::before { content: "\f7ff"; }
+.bi-lungs::before { content: "\f800"; }
+.bi-microsoft-teams::before { content: "\f801"; }
+.bi-p-circle-1::before { content: "\f802"; }
+.bi-p-circle-fill-1::before { content: "\f803"; }
+.bi-p-circle-fill::before { content: "\f804"; }
+.bi-p-circle::before { content: "\f805"; }
+.bi-p-square-fill::before { content: "\f806"; }
+.bi-p-square::before { content: "\f807"; }
+.bi-pass-fill::before { content: "\f808"; }
+.bi-pass::before { content: "\f809"; }
+.bi-prescription::before { content: "\f80a"; }
+.bi-prescription2::before { content: "\f80b"; }
+.bi-r-circle-1::before { content: "\f80c"; }
+.bi-r-circle-fill-1::before { content: "\f80d"; }
+.bi-r-circle-fill::before { content: "\f80e"; }
+.bi-r-circle::before { content: "\f80f"; }
+.bi-r-square-fill::before { content: "\f810"; }
+.bi-r-square::before { content: "\f811"; }
+.bi-repeat-1::before { content: "\f812"; }
+.bi-repeat::before { content: "\f813"; }
+.bi-rewind-btn-fill::before { content: "\f814"; }
+.bi-rewind-btn::before { content: "\f815"; }
+.bi-rewind-circle-fill::before { content: "\f816"; }
+.bi-rewind-circle::before { content: "\f817"; }
+.bi-rewind-fill::before { content: "\f818"; }
+.bi-rewind::before { content: "\f819"; }
+.bi-train-freight-front-fill::before { content: "\f81a"; }
+.bi-train-freight-front::before { content: "\f81b"; }
+.bi-train-front-fill::before { content: "\f81c"; }
+.bi-train-front::before { content: "\f81d"; }
+.bi-train-lightrail-front-fill::before { content: "\f81e"; }
+.bi-train-lightrail-front::before { content: "\f81f"; }
+.bi-truck-front-fill::before { content: "\f820"; }
+.bi-truck-front::before { content: "\f821"; }
+.bi-ubuntu::before { content: "\f822"; }
+.bi-unindent::before { content: "\f823"; }
+.bi-unity::before { content: "\f824"; }
+.bi-universal-access-circle::before { content: "\f825"; }
+.bi-universal-access::before { content: "\f826"; }
+.bi-virus::before { content: "\f827"; }
+.bi-virus2::before { content: "\f828"; }
+.bi-wechat::before { content: "\f829"; }
+.bi-yelp::before { content: "\f82a"; }
+.bi-sign-stop-fill::before { content: "\f82b"; }
+.bi-sign-stop-lights-fill::before { content: "\f82c"; }
+.bi-sign-stop-lights::before { content: "\f82d"; }
+.bi-sign-stop::before { content: "\f82e"; }
+.bi-sign-turn-left-fill::before { content: "\f82f"; }
+.bi-sign-turn-left::before { content: "\f830"; }
+.bi-sign-turn-right-fill::before { content: "\f831"; }
+.bi-sign-turn-right::before { content: "\f832"; }
+.bi-sign-turn-slight-left-fill::before { content: "\f833"; }
+.bi-sign-turn-slight-left::before { content: "\f834"; }
+.bi-sign-turn-slight-right-fill::before { content: "\f835"; }
+.bi-sign-turn-slight-right::before { content: "\f836"; }
+.bi-sign-yield-fill::before { content: "\f837"; }
+.bi-sign-yield::before { content: "\f838"; }
+.bi-ev-station-fill::before { content: "\f839"; }
+.bi-ev-station::before { content: "\f83a"; }
+.bi-fuel-pump-diesel-fill::before { content: "\f83b"; }
+.bi-fuel-pump-diesel::before { content: "\f83c"; }
+.bi-fuel-pump-fill::before { content: "\f83d"; }
+.bi-fuel-pump::before { content: "\f83e"; }
+.bi-0-circle-fill::before { content: "\f83f"; }
+.bi-0-circle::before { content: "\f840"; }
+.bi-0-square-fill::before { content: "\f841"; }
+.bi-0-square::before { content: "\f842"; }
+.bi-rocket-fill::before { content: "\f843"; }
+.bi-rocket-takeoff-fill::before { content: "\f844"; }
+.bi-rocket-takeoff::before { content: "\f845"; }
+.bi-rocket::before { content: "\f846"; }
+.bi-stripe::before { content: "\f847"; }
+.bi-subscript::before { content: "\f848"; }
+.bi-superscript::before { content: "\f849"; }
+.bi-trello::before { content: "\f84a"; }
+.bi-envelope-at-fill::before { content: "\f84b"; }
+.bi-envelope-at::before { content: "\f84c"; }
+.bi-regex::before { content: "\f84d"; }
+.bi-text-wrap::before { content: "\f84e"; }
+.bi-sign-dead-end-fill::before { content: "\f84f"; }
+.bi-sign-dead-end::before { content: "\f850"; }
+.bi-sign-do-not-enter-fill::before { content: "\f851"; }
+.bi-sign-do-not-enter::before { content: "\f852"; }
+.bi-sign-intersection-fill::before { content: "\f853"; }
+.bi-sign-intersection-side-fill::before { content: "\f854"; }
+.bi-sign-intersection-side::before { content: "\f855"; }
+.bi-sign-intersection-t-fill::before { content: "\f856"; }
+.bi-sign-intersection-t::before { content: "\f857"; }
+.bi-sign-intersection-y-fill::before { content: "\f858"; }
+.bi-sign-intersection-y::before { content: "\f859"; }
+.bi-sign-intersection::before { content: "\f85a"; }
+.bi-sign-merge-left-fill::before { content: "\f85b"; }
+.bi-sign-merge-left::before { content: "\f85c"; }
+.bi-sign-merge-right-fill::before { content: "\f85d"; }
+.bi-sign-merge-right::before { content: "\f85e"; }
+.bi-sign-no-left-turn-fill::before { content: "\f85f"; }
+.bi-sign-no-left-turn::before { content: "\f860"; }
+.bi-sign-no-parking-fill::before { content: "\f861"; }
+.bi-sign-no-parking::before { content: "\f862"; }
+.bi-sign-no-right-turn-fill::before { content: "\f863"; }
+.bi-sign-no-right-turn::before { content: "\f864"; }
+.bi-sign-railroad-fill::before { content: "\f865"; }
+.bi-sign-railroad::before { content: "\f866"; }
+.bi-building-add::before { content: "\f867"; }
+.bi-building-check::before { content: "\f868"; }
+.bi-building-dash::before { content: "\f869"; }
+.bi-building-down::before { content: "\f86a"; }
+.bi-building-exclamation::before { content: "\f86b"; }
+.bi-building-fill-add::before { content: "\f86c"; }
+.bi-building-fill-check::before { content: "\f86d"; }
+.bi-building-fill-dash::before { content: "\f86e"; }
+.bi-building-fill-down::before { content: "\f86f"; }
+.bi-building-fill-exclamation::before { content: "\f870"; }
+.bi-building-fill-gear::before { content: "\f871"; }
+.bi-building-fill-lock::before { content: "\f872"; }
+.bi-building-fill-slash::before { content: "\f873"; }
+.bi-building-fill-up::before { content: "\f874"; }
+.bi-building-fill-x::before { content: "\f875"; }
+.bi-building-fill::before { content: "\f876"; }
+.bi-building-gear::before { content: "\f877"; }
+.bi-building-lock::before { content: "\f878"; }
+.bi-building-slash::before { content: "\f879"; }
+.bi-building-up::before { content: "\f87a"; }
+.bi-building-x::before { content: "\f87b"; }
+.bi-buildings-fill::before { content: "\f87c"; }
+.bi-buildings::before { content: "\f87d"; }
+.bi-bus-front-fill::before { content: "\f87e"; }
+.bi-bus-front::before { content: "\f87f"; }
+.bi-ev-front-fill::before { content: "\f880"; }
+.bi-ev-front::before { content: "\f881"; }
+.bi-globe-americas::before { content: "\f882"; }
+.bi-globe-asia-australia::before { content: "\f883"; }
+.bi-globe-central-south-asia::before { content: "\f884"; }
+.bi-globe-europe-africa::before { content: "\f885"; }
+.bi-house-add-fill::before { content: "\f886"; }
+.bi-house-add::before { content: "\f887"; }
+.bi-house-check-fill::before { content: "\f888"; }
+.bi-house-check::before { content: "\f889"; }
+.bi-house-dash-fill::before { content: "\f88a"; }
+.bi-house-dash::before { content: "\f88b"; }
+.bi-house-down-fill::before { content: "\f88c"; }
+.bi-house-down::before { content: "\f88d"; }
+.bi-house-exclamation-fill::before { content: "\f88e"; }
+.bi-house-exclamation::before { content: "\f88f"; }
+.bi-house-gear-fill::before { content: "\f890"; }
+.bi-house-gear::before { content: "\f891"; }
+.bi-house-lock-fill::before { content: "\f892"; }
+.bi-house-lock::before { content: "\f893"; }
+.bi-house-slash-fill::before { content: "\f894"; }
+.bi-house-slash::before { content: "\f895"; }
+.bi-house-up-fill::before { content: "\f896"; }
+.bi-house-up::before { content: "\f897"; }
+.bi-house-x-fill::before { content: "\f898"; }
+.bi-house-x::before { content: "\f899"; }
+.bi-person-add::before { content: "\f89a"; }
+.bi-person-down::before { content: "\f89b"; }
+.bi-person-exclamation::before { content: "\f89c"; }
+.bi-person-fill-add::before { content: "\f89d"; }
+.bi-person-fill-check::before { content: "\f89e"; }
+.bi-person-fill-dash::before { content: "\f89f"; }
+.bi-person-fill-down::before { content: "\f8a0"; }
+.bi-person-fill-exclamation::before { content: "\f8a1"; }
+.bi-person-fill-gear::before { content: "\f8a2"; }
+.bi-person-fill-lock::before { content: "\f8a3"; }
+.bi-person-fill-slash::before { content: "\f8a4"; }
+.bi-person-fill-up::before { content: "\f8a5"; }
+.bi-person-fill-x::before { content: "\f8a6"; }
+.bi-person-gear::before { content: "\f8a7"; }
+.bi-person-lock::before { content: "\f8a8"; }
+.bi-person-slash::before { content: "\f8a9"; }
+.bi-person-up::before { content: "\f8aa"; }
+.bi-scooter::before { content: "\f8ab"; }
+.bi-taxi-front-fill::before { content: "\f8ac"; }
+.bi-taxi-front::before { content: "\f8ad"; }
+.bi-amd::before { content: "\f8ae"; }
+.bi-database-add::before { content: "\f8af"; }
+.bi-database-check::before { content: "\f8b0"; }
+.bi-database-dash::before { content: "\f8b1"; }
+.bi-database-down::before { content: "\f8b2"; }
+.bi-database-exclamation::before { content: "\f8b3"; }
+.bi-database-fill-add::before { content: "\f8b4"; }
+.bi-database-fill-check::before { content: "\f8b5"; }
+.bi-database-fill-dash::before { content: "\f8b6"; }
+.bi-database-fill-down::before { content: "\f8b7"; }
+.bi-database-fill-exclamation::before { content: "\f8b8"; }
+.bi-database-fill-gear::before { content: "\f8b9"; }
+.bi-database-fill-lock::before { content: "\f8ba"; }
+.bi-database-fill-slash::before { content: "\f8bb"; }
+.bi-database-fill-up::before { content: "\f8bc"; }
+.bi-database-fill-x::before { content: "\f8bd"; }
+.bi-database-fill::before { content: "\f8be"; }
+.bi-database-gear::before { content: "\f8bf"; }
+.bi-database-lock::before { content: "\f8c0"; }
+.bi-database-slash::before { content: "\f8c1"; }
+.bi-database-up::before { content: "\f8c2"; }
+.bi-database-x::before { content: "\f8c3"; }
+.bi-database::before { content: "\f8c4"; }
+.bi-houses-fill::before { content: "\f8c5"; }
+.bi-houses::before { content: "\f8c6"; }
+.bi-nvidia::before { content: "\f8c7"; }
+.bi-person-vcard-fill::before { content: "\f8c8"; }
+.bi-person-vcard::before { content: "\f8c9"; }
+.bi-sina-weibo::before { content: "\f8ca"; }
+.bi-tencent-qq::before { content: "\f8cb"; }
+.bi-wikipedia::before { content: "\f8cc"; }
diff --git a/docs/site_libs/bootstrap/bootstrap-icons.woff b/docs/site_libs/bootstrap/bootstrap-icons.woff
index b26ccd1..18d21d4 100644
Binary files a/docs/site_libs/bootstrap/bootstrap-icons.woff and b/docs/site_libs/bootstrap/bootstrap-icons.woff differ
diff --git a/docs/site_libs/bootstrap/bootstrap.min 2.js b/docs/site_libs/bootstrap/bootstrap.min 2.js
deleted file mode 100644
index cc0a255..0000000
--- a/docs/site_libs/bootstrap/bootstrap.min 2.js
+++ /dev/null
@@ -1,7 +0,0 @@
-/*!
- * Bootstrap v5.1.3 (https://getbootstrap.com/)
- * Copyright 2011-2021 The Bootstrap Authors (https://github.com/twbs/bootstrap/graphs/contributors)
- * Licensed under MIT (https://github.com/twbs/bootstrap/blob/main/LICENSE)
- */
-!function(t,e){"object"==typeof exports&&"undefined"!=typeof module?module.exports=e():"function"==typeof define&&define.amd?define(e):(t="undefined"!=typeof globalThis?globalThis:t||self).bootstrap=e()}(this,(function(){"use strict";const t="transitionend",e=t=>{let e=t.getAttribute("data-bs-target");if(!e||"#"===e){let i=t.getAttribute("href");if(!i||!i.includes("#")&&!i.startsWith("."))return null;i.includes("#")&&!i.startsWith("#")&&(i=`#${i.split("#")[1]}`),e=i&&"#"!==i?i.trim():null}return e},i=t=>{const i=e(t);return i&&document.querySelector(i)?i:null},n=t=>{const i=e(t);return i?document.querySelector(i):null},s=e=>{e.dispatchEvent(new Event(t))},o=t=>!(!t||"object"!=typeof t)&&(void 0!==t.jquery&&(t=t[0]),void 0!==t.nodeType),r=t=>o(t)?t.jquery?t[0]:t:"string"==typeof t&&t.length>0?document.querySelector(t):null,a=(t,e,i)=>{Object.keys(i).forEach((n=>{const s=i[n],r=e[n],a=r&&o(r)?"element":null==(l=r)?`${l}`:{}.toString.call(l).match(/\s([a-z]+)/i)[1].toLowerCase();var l;if(!new RegExp(s).test(a))throw new TypeError(`${t.toUpperCase()}: Option "${n}" provided type "${a}" but expected type "${s}".`)}))},l=t=>!(!o(t)||0===t.getClientRects().length)&&"visible"===getComputedStyle(t).getPropertyValue("visibility"),c=t=>!t||t.nodeType!==Node.ELEMENT_NODE||!!t.classList.contains("disabled")||(void 0!==t.disabled?t.disabled:t.hasAttribute("disabled")&&"false"!==t.getAttribute("disabled")),h=t=>{if(!document.documentElement.attachShadow)return null;if("function"==typeof t.getRootNode){const e=t.getRootNode();return e instanceof ShadowRoot?e:null}return t instanceof ShadowRoot?t:t.parentNode?h(t.parentNode):null},d=()=>{},u=t=>{t.offsetHeight},f=()=>{const{jQuery:t}=window;return t&&!document.body.hasAttribute("data-bs-no-jquery")?t:null},p=[],m=()=>"rtl"===document.documentElement.dir,g=t=>{var e;e=()=>{const e=f();if(e){const i=t.NAME,n=e.fn[i];e.fn[i]=t.jQueryInterface,e.fn[i].Constructor=t,e.fn[i].noConflict=()=>(e.fn[i]=n,t.jQueryInterface)}},"loading"===document.readyState?(p.length||document.addEventListener("DOMContentLoaded",(()=>{p.forEach((t=>t()))})),p.push(e)):e()},_=t=>{"function"==typeof t&&t()},b=(e,i,n=!0)=>{if(!n)return void _(e);const o=(t=>{if(!t)return 0;let{transitionDuration:e,transitionDelay:i}=window.getComputedStyle(t);const n=Number.parseFloat(e),s=Number.parseFloat(i);return n||s?(e=e.split(",")[0],i=i.split(",")[0],1e3*(Number.parseFloat(e)+Number.parseFloat(i))):0})(i)+5;let r=!1;const a=({target:n})=>{n===i&&(r=!0,i.removeEventListener(t,a),_(e))};i.addEventListener(t,a),setTimeout((()=>{r||s(i)}),o)},v=(t,e,i,n)=>{let s=t.indexOf(e);if(-1===s)return t[!i&&n?t.length-1:0];const o=t.length;return s+=i?1:-1,n&&(s=(s+o)%o),t[Math.max(0,Math.min(s,o-1))]},y=/[^.]*(?=\..*)\.|.*/,w=/\..*/,E=/::\d+$/,A={};let T=1;const O={mouseenter:"mouseover",mouseleave:"mouseout"},C=/^(mouseenter|mouseleave)/i,k=new Set(["click","dblclick","mouseup","mousedown","contextmenu","mousewheel","DOMMouseScroll","mouseover","mouseout","mousemove","selectstart","selectend","keydown","keypress","keyup","orientationchange","touchstart","touchmove","touchend","touchcancel","pointerdown","pointermove","pointerup","pointerleave","pointercancel","gesturestart","gesturechange","gestureend","focus","blur","change","reset","select","submit","focusin","focusout","load","unload","beforeunload","resize","move","DOMContentLoaded","readystatechange","error","abort","scroll"]);function L(t,e){return e&&`${e}::${T++}`||t.uidEvent||T++}function x(t){const e=L(t);return 
t.uidEvent=e,A[e]=A[e]||{},A[e]}function D(t,e,i=null){const n=Object.keys(t);for(let s=0,o=n.length;sfunction(e){if(!e.relatedTarget||e.relatedTarget!==e.delegateTarget&&!e.delegateTarget.contains(e.relatedTarget))return t.call(this,e)};n?n=t(n):i=t(i)}const[o,r,a]=S(e,i,n),l=x(t),c=l[a]||(l[a]={}),h=D(c,r,o?i:null);if(h)return void(h.oneOff=h.oneOff&&s);const d=L(r,e.replace(y,"")),u=o?function(t,e,i){return function n(s){const o=t.querySelectorAll(e);for(let{target:r}=s;r&&r!==this;r=r.parentNode)for(let a=o.length;a--;)if(o[a]===r)return s.delegateTarget=r,n.oneOff&&j.off(t,s.type,e,i),i.apply(r,[s]);return null}}(t,i,n):function(t,e){return function i(n){return n.delegateTarget=t,i.oneOff&&j.off(t,n.type,e),e.apply(t,[n])}}(t,i);u.delegationSelector=o?i:null,u.originalHandler=r,u.oneOff=s,u.uidEvent=d,c[d]=u,t.addEventListener(a,u,o)}function I(t,e,i,n,s){const o=D(e[i],n,s);o&&(t.removeEventListener(i,o,Boolean(s)),delete e[i][o.uidEvent])}function P(t){return t=t.replace(w,""),O[t]||t}const j={on(t,e,i,n){N(t,e,i,n,!1)},one(t,e,i,n){N(t,e,i,n,!0)},off(t,e,i,n){if("string"!=typeof e||!t)return;const[s,o,r]=S(e,i,n),a=r!==e,l=x(t),c=e.startsWith(".");if(void 0!==o){if(!l||!l[r])return;return void I(t,l,r,o,s?i:null)}c&&Object.keys(l).forEach((i=>{!function(t,e,i,n){const s=e[i]||{};Object.keys(s).forEach((o=>{if(o.includes(n)){const n=s[o];I(t,e,i,n.originalHandler,n.delegationSelector)}}))}(t,l,i,e.slice(1))}));const h=l[r]||{};Object.keys(h).forEach((i=>{const n=i.replace(E,"");if(!a||e.includes(n)){const e=h[i];I(t,l,r,e.originalHandler,e.delegationSelector)}}))},trigger(t,e,i){if("string"!=typeof e||!t)return null;const n=f(),s=P(e),o=e!==s,r=k.has(s);let a,l=!0,c=!0,h=!1,d=null;return o&&n&&(a=n.Event(e,i),n(t).trigger(a),l=!a.isPropagationStopped(),c=!a.isImmediatePropagationStopped(),h=a.isDefaultPrevented()),r?(d=document.createEvent("HTMLEvents"),d.initEvent(s,l,!0)):d=new CustomEvent(e,{bubbles:l,cancelable:!0}),void 0!==i&&Object.keys(i).forEach((t=>{Object.defineProperty(d,t,{get:()=>i[t]})})),h&&d.preventDefault(),c&&t.dispatchEvent(d),d.defaultPrevented&&void 0!==a&&a.preventDefault(),d}},M=new Map,H={set(t,e,i){M.has(t)||M.set(t,new Map);const n=M.get(t);n.has(e)||0===n.size?n.set(e,i):console.error(`Bootstrap doesn't allow more than one instance per element. 
Bound instance: ${Array.from(n.keys())[0]}.`)},get:(t,e)=>M.has(t)&&M.get(t).get(e)||null,remove(t,e){if(!M.has(t))return;const i=M.get(t);i.delete(e),0===i.size&&M.delete(t)}};class B{constructor(t){(t=r(t))&&(this._element=t,H.set(this._element,this.constructor.DATA_KEY,this))}dispose(){H.remove(this._element,this.constructor.DATA_KEY),j.off(this._element,this.constructor.EVENT_KEY),Object.getOwnPropertyNames(this).forEach((t=>{this[t]=null}))}_queueCallback(t,e,i=!0){b(t,e,i)}static getInstance(t){return H.get(r(t),this.DATA_KEY)}static getOrCreateInstance(t,e={}){return this.getInstance(t)||new this(t,"object"==typeof e?e:null)}static get VERSION(){return"5.1.3"}static get NAME(){throw new Error('You have to implement the static method "NAME", for each component!')}static get DATA_KEY(){return`bs.${this.NAME}`}static get EVENT_KEY(){return`.${this.DATA_KEY}`}}const R=(t,e="hide")=>{const i=`click.dismiss${t.EVENT_KEY}`,s=t.NAME;j.on(document,i,`[data-bs-dismiss="${s}"]`,(function(i){if(["A","AREA"].includes(this.tagName)&&i.preventDefault(),c(this))return;const o=n(this)||this.closest(`.${s}`);t.getOrCreateInstance(o)[e]()}))};class W extends B{static get NAME(){return"alert"}close(){if(j.trigger(this._element,"close.bs.alert").defaultPrevented)return;this._element.classList.remove("show");const t=this._element.classList.contains("fade");this._queueCallback((()=>this._destroyElement()),this._element,t)}_destroyElement(){this._element.remove(),j.trigger(this._element,"closed.bs.alert"),this.dispose()}static jQueryInterface(t){return this.each((function(){const e=W.getOrCreateInstance(this);if("string"==typeof t){if(void 0===e[t]||t.startsWith("_")||"constructor"===t)throw new TypeError(`No method named "${t}"`);e[t](this)}}))}}R(W,"close"),g(W);const $='[data-bs-toggle="button"]';class z extends B{static get NAME(){return"button"}toggle(){this._element.setAttribute("aria-pressed",this._element.classList.toggle("active"))}static jQueryInterface(t){return this.each((function(){const e=z.getOrCreateInstance(this);"toggle"===t&&e[t]()}))}}function q(t){return"true"===t||"false"!==t&&(t===Number(t).toString()?Number(t):""===t||"null"===t?null:t)}function F(t){return t.replace(/[A-Z]/g,(t=>`-${t.toLowerCase()}`))}j.on(document,"click.bs.button.data-api",$,(t=>{t.preventDefault();const e=t.target.closest($);z.getOrCreateInstance(e).toggle()})),g(z);const U={setDataAttribute(t,e,i){t.setAttribute(`data-bs-${F(e)}`,i)},removeDataAttribute(t,e){t.removeAttribute(`data-bs-${F(e)}`)},getDataAttributes(t){if(!t)return{};const e={};return Object.keys(t.dataset).filter((t=>t.startsWith("bs"))).forEach((i=>{let n=i.replace(/^bs/,"");n=n.charAt(0).toLowerCase()+n.slice(1,n.length),e[n]=q(t.dataset[i])})),e},getDataAttribute:(t,e)=>q(t.getAttribute(`data-bs-${F(e)}`)),offset(t){const e=t.getBoundingClientRect();return{top:e.top+window.pageYOffset,left:e.left+window.pageXOffset}},position:t=>({top:t.offsetTop,left:t.offsetLeft})},V={find:(t,e=document.documentElement)=>[].concat(...Element.prototype.querySelectorAll.call(e,t)),findOne:(t,e=document.documentElement)=>Element.prototype.querySelector.call(e,t),children:(t,e)=>[].concat(...t.children).filter((t=>t.matches(e))),parents(t,e){const i=[];let n=t.parentNode;for(;n&&n.nodeType===Node.ELEMENT_NODE&&3!==n.nodeType;)n.matches(e)&&i.push(n),n=n.parentNode;return i},prev(t,e){let i=t.previousElementSibling;for(;i;){if(i.matches(e))return[i];i=i.previousElementSibling}return[]},next(t,e){let 
i=t.nextElementSibling;for(;i;){if(i.matches(e))return[i];i=i.nextElementSibling}return[]},focusableChildren(t){const e=["a","button","input","textarea","select","details","[tabindex]",'[contenteditable="true"]'].map((t=>`${t}:not([tabindex^="-"])`)).join(", ");return this.find(e,t).filter((t=>!c(t)&&l(t)))}},K="carousel",X={interval:5e3,keyboard:!0,slide:!1,pause:"hover",wrap:!0,touch:!0},Y={interval:"(number|boolean)",keyboard:"boolean",slide:"(boolean|string)",pause:"(string|boolean)",wrap:"boolean",touch:"boolean"},Q="next",G="prev",Z="left",J="right",tt={ArrowLeft:J,ArrowRight:Z},et="slid.bs.carousel",it="active",nt=".active.carousel-item";class st extends B{constructor(t,e){super(t),this._items=null,this._interval=null,this._activeElement=null,this._isPaused=!1,this._isSliding=!1,this.touchTimeout=null,this.touchStartX=0,this.touchDeltaX=0,this._config=this._getConfig(e),this._indicatorsElement=V.findOne(".carousel-indicators",this._element),this._touchSupported="ontouchstart"in document.documentElement||navigator.maxTouchPoints>0,this._pointerEvent=Boolean(window.PointerEvent),this._addEventListeners()}static get Default(){return X}static get NAME(){return K}next(){this._slide(Q)}nextWhenVisible(){!document.hidden&&l(this._element)&&this.next()}prev(){this._slide(G)}pause(t){t||(this._isPaused=!0),V.findOne(".carousel-item-next, .carousel-item-prev",this._element)&&(s(this._element),this.cycle(!0)),clearInterval(this._interval),this._interval=null}cycle(t){t||(this._isPaused=!1),this._interval&&(clearInterval(this._interval),this._interval=null),this._config&&this._config.interval&&!this._isPaused&&(this._updateInterval(),this._interval=setInterval((document.visibilityState?this.nextWhenVisible:this.next).bind(this),this._config.interval))}to(t){this._activeElement=V.findOne(nt,this._element);const e=this._getItemIndex(this._activeElement);if(t>this._items.length-1||t<0)return;if(this._isSliding)return void j.one(this._element,et,(()=>this.to(t)));if(e===t)return this.pause(),void this.cycle();const i=t>e?Q:G;this._slide(i,this._items[t])}_getConfig(t){return t={...X,...U.getDataAttributes(this._element),..."object"==typeof t?t:{}},a(K,t,Y),t}_handleSwipe(){const t=Math.abs(this.touchDeltaX);if(t<=40)return;const e=t/this.touchDeltaX;this.touchDeltaX=0,e&&this._slide(e>0?J:Z)}_addEventListeners(){this._config.keyboard&&j.on(this._element,"keydown.bs.carousel",(t=>this._keydown(t))),"hover"===this._config.pause&&(j.on(this._element,"mouseenter.bs.carousel",(t=>this.pause(t))),j.on(this._element,"mouseleave.bs.carousel",(t=>this.cycle(t)))),this._config.touch&&this._touchSupported&&this._addTouchEventListeners()}_addTouchEventListeners(){const t=t=>this._pointerEvent&&("pen"===t.pointerType||"touch"===t.pointerType),e=e=>{t(e)?this.touchStartX=e.clientX:this._pointerEvent||(this.touchStartX=e.touches[0].clientX)},i=t=>{this.touchDeltaX=t.touches&&t.touches.length>1?0:t.touches[0].clientX-this.touchStartX},n=e=>{t(e)&&(this.touchDeltaX=e.clientX-this.touchStartX),this._handleSwipe(),"hover"===this._config.pause&&(this.pause(),this.touchTimeout&&clearTimeout(this.touchTimeout),this.touchTimeout=setTimeout((t=>this.cycle(t)),500+this._config.interval))};V.find(".carousel-item 
img",this._element).forEach((t=>{j.on(t,"dragstart.bs.carousel",(t=>t.preventDefault()))})),this._pointerEvent?(j.on(this._element,"pointerdown.bs.carousel",(t=>e(t))),j.on(this._element,"pointerup.bs.carousel",(t=>n(t))),this._element.classList.add("pointer-event")):(j.on(this._element,"touchstart.bs.carousel",(t=>e(t))),j.on(this._element,"touchmove.bs.carousel",(t=>i(t))),j.on(this._element,"touchend.bs.carousel",(t=>n(t))))}_keydown(t){if(/input|textarea/i.test(t.target.tagName))return;const e=tt[t.key];e&&(t.preventDefault(),this._slide(e))}_getItemIndex(t){return this._items=t&&t.parentNode?V.find(".carousel-item",t.parentNode):[],this._items.indexOf(t)}_getItemByOrder(t,e){const i=t===Q;return v(this._items,e,i,this._config.wrap)}_triggerSlideEvent(t,e){const i=this._getItemIndex(t),n=this._getItemIndex(V.findOne(nt,this._element));return j.trigger(this._element,"slide.bs.carousel",{relatedTarget:t,direction:e,from:n,to:i})}_setActiveIndicatorElement(t){if(this._indicatorsElement){const e=V.findOne(".active",this._indicatorsElement);e.classList.remove(it),e.removeAttribute("aria-current");const i=V.find("[data-bs-target]",this._indicatorsElement);for(let e=0;e{j.trigger(this._element,et,{relatedTarget:o,direction:d,from:s,to:r})};if(this._element.classList.contains("slide")){o.classList.add(h),u(o),n.classList.add(c),o.classList.add(c);const t=()=>{o.classList.remove(c,h),o.classList.add(it),n.classList.remove(it,h,c),this._isSliding=!1,setTimeout(f,0)};this._queueCallback(t,n,!0)}else n.classList.remove(it),o.classList.add(it),this._isSliding=!1,f();a&&this.cycle()}_directionToOrder(t){return[J,Z].includes(t)?m()?t===Z?G:Q:t===Z?Q:G:t}_orderToDirection(t){return[Q,G].includes(t)?m()?t===G?Z:J:t===G?J:Z:t}static carouselInterface(t,e){const i=st.getOrCreateInstance(t,e);let{_config:n}=i;"object"==typeof e&&(n={...n,...e});const s="string"==typeof e?e:n.slide;if("number"==typeof e)i.to(e);else if("string"==typeof s){if(void 0===i[s])throw new TypeError(`No method named "${s}"`);i[s]()}else n.interval&&n.ride&&(i.pause(),i.cycle())}static jQueryInterface(t){return this.each((function(){st.carouselInterface(this,t)}))}static dataApiClickHandler(t){const e=n(this);if(!e||!e.classList.contains("carousel"))return;const i={...U.getDataAttributes(e),...U.getDataAttributes(this)},s=this.getAttribute("data-bs-slide-to");s&&(i.interval=!1),st.carouselInterface(e,i),s&&st.getInstance(e).to(s),t.preventDefault()}}j.on(document,"click.bs.carousel.data-api","[data-bs-slide], [data-bs-slide-to]",st.dataApiClickHandler),j.on(window,"load.bs.carousel.data-api",(()=>{const t=V.find('[data-bs-ride="carousel"]');for(let e=0,i=t.length;et===this._element));null!==s&&o.length&&(this._selector=s,this._triggerArray.push(e))}this._initializeChildren(),this._config.parent||this._addAriaAndCollapsedClass(this._triggerArray,this._isShown()),this._config.toggle&&this.toggle()}static get Default(){return rt}static get NAME(){return ot}toggle(){this._isShown()?this.hide():this.show()}show(){if(this._isTransitioning||this._isShown())return;let t,e=[];if(this._config.parent){const t=V.find(ut,this._config.parent);e=V.find(".collapse.show, .collapse.collapsing",this._config.parent).filter((e=>!t.includes(e)))}const i=V.findOne(this._selector);if(e.length){const n=e.find((t=>i!==t));if(t=n?pt.getInstance(n):null,t&&t._isTransitioning)return}if(j.trigger(this._element,"show.bs.collapse").defaultPrevented)return;e.forEach((e=>{i!==e&&pt.getOrCreateInstance(e,{toggle:!1}).hide(),t||H.set(e,"bs.collapse",null)}));const 
n=this._getDimension();this._element.classList.remove(ct),this._element.classList.add(ht),this._element.style[n]=0,this._addAriaAndCollapsedClass(this._triggerArray,!0),this._isTransitioning=!0;const s=`scroll${n[0].toUpperCase()+n.slice(1)}`;this._queueCallback((()=>{this._isTransitioning=!1,this._element.classList.remove(ht),this._element.classList.add(ct,lt),this._element.style[n]="",j.trigger(this._element,"shown.bs.collapse")}),this._element,!0),this._element.style[n]=`${this._element[s]}px`}hide(){if(this._isTransitioning||!this._isShown())return;if(j.trigger(this._element,"hide.bs.collapse").defaultPrevented)return;const t=this._getDimension();this._element.style[t]=`${this._element.getBoundingClientRect()[t]}px`,u(this._element),this._element.classList.add(ht),this._element.classList.remove(ct,lt);const e=this._triggerArray.length;for(let t=0;t{this._isTransitioning=!1,this._element.classList.remove(ht),this._element.classList.add(ct),j.trigger(this._element,"hidden.bs.collapse")}),this._element,!0)}_isShown(t=this._element){return t.classList.contains(lt)}_getConfig(t){return(t={...rt,...U.getDataAttributes(this._element),...t}).toggle=Boolean(t.toggle),t.parent=r(t.parent),a(ot,t,at),t}_getDimension(){return this._element.classList.contains("collapse-horizontal")?"width":"height"}_initializeChildren(){if(!this._config.parent)return;const t=V.find(ut,this._config.parent);V.find(ft,this._config.parent).filter((e=>!t.includes(e))).forEach((t=>{const e=n(t);e&&this._addAriaAndCollapsedClass([t],this._isShown(e))}))}_addAriaAndCollapsedClass(t,e){t.length&&t.forEach((t=>{e?t.classList.remove(dt):t.classList.add(dt),t.setAttribute("aria-expanded",e)}))}static jQueryInterface(t){return this.each((function(){const e={};"string"==typeof t&&/show|hide/.test(t)&&(e.toggle=!1);const i=pt.getOrCreateInstance(this,e);if("string"==typeof t){if(void 0===i[t])throw new TypeError(`No method named "${t}"`);i[t]()}}))}}j.on(document,"click.bs.collapse.data-api",ft,(function(t){("A"===t.target.tagName||t.delegateTarget&&"A"===t.delegateTarget.tagName)&&t.preventDefault();const e=i(this);V.find(e).forEach((t=>{pt.getOrCreateInstance(t,{toggle:!1}).toggle()}))})),g(pt);var mt="top",gt="bottom",_t="right",bt="left",vt="auto",yt=[mt,gt,_t,bt],wt="start",Et="end",At="clippingParents",Tt="viewport",Ot="popper",Ct="reference",kt=yt.reduce((function(t,e){return t.concat([e+"-"+wt,e+"-"+Et])}),[]),Lt=[].concat(yt,[vt]).reduce((function(t,e){return t.concat([e,e+"-"+wt,e+"-"+Et])}),[]),xt="beforeRead",Dt="read",St="afterRead",Nt="beforeMain",It="main",Pt="afterMain",jt="beforeWrite",Mt="write",Ht="afterWrite",Bt=[xt,Dt,St,Nt,It,Pt,jt,Mt,Ht];function Rt(t){return t?(t.nodeName||"").toLowerCase():null}function Wt(t){if(null==t)return window;if("[object Window]"!==t.toString()){var e=t.ownerDocument;return e&&e.defaultView||window}return t}function $t(t){return t instanceof Wt(t).Element||t instanceof Element}function zt(t){return t instanceof Wt(t).HTMLElement||t instanceof HTMLElement}function qt(t){return"undefined"!=typeof ShadowRoot&&(t instanceof Wt(t).ShadowRoot||t instanceof ShadowRoot)}const Ft={name:"applyStyles",enabled:!0,phase:"write",fn:function(t){var e=t.state;Object.keys(e.elements).forEach((function(t){var i=e.styles[t]||{},n=e.attributes[t]||{},s=e.elements[t];zt(s)&&Rt(s)&&(Object.assign(s.style,i),Object.keys(n).forEach((function(t){var e=n[t];!1===e?s.removeAttribute(t):s.setAttribute(t,!0===e?"":e)})))}))},effect:function(t){var 
e=t.state,i={popper:{position:e.options.strategy,left:"0",top:"0",margin:"0"},arrow:{position:"absolute"},reference:{}};return Object.assign(e.elements.popper.style,i.popper),e.styles=i,e.elements.arrow&&Object.assign(e.elements.arrow.style,i.arrow),function(){Object.keys(e.elements).forEach((function(t){var n=e.elements[t],s=e.attributes[t]||{},o=Object.keys(e.styles.hasOwnProperty(t)?e.styles[t]:i[t]).reduce((function(t,e){return t[e]="",t}),{});zt(n)&&Rt(n)&&(Object.assign(n.style,o),Object.keys(s).forEach((function(t){n.removeAttribute(t)})))}))}},requires:["computeStyles"]};function Ut(t){return t.split("-")[0]}function Vt(t,e){var i=t.getBoundingClientRect();return{width:i.width/1,height:i.height/1,top:i.top/1,right:i.right/1,bottom:i.bottom/1,left:i.left/1,x:i.left/1,y:i.top/1}}function Kt(t){var e=Vt(t),i=t.offsetWidth,n=t.offsetHeight;return Math.abs(e.width-i)<=1&&(i=e.width),Math.abs(e.height-n)<=1&&(n=e.height),{x:t.offsetLeft,y:t.offsetTop,width:i,height:n}}function Xt(t,e){var i=e.getRootNode&&e.getRootNode();if(t.contains(e))return!0;if(i&&qt(i)){var n=e;do{if(n&&t.isSameNode(n))return!0;n=n.parentNode||n.host}while(n)}return!1}function Yt(t){return Wt(t).getComputedStyle(t)}function Qt(t){return["table","td","th"].indexOf(Rt(t))>=0}function Gt(t){return(($t(t)?t.ownerDocument:t.document)||window.document).documentElement}function Zt(t){return"html"===Rt(t)?t:t.assignedSlot||t.parentNode||(qt(t)?t.host:null)||Gt(t)}function Jt(t){return zt(t)&&"fixed"!==Yt(t).position?t.offsetParent:null}function te(t){for(var e=Wt(t),i=Jt(t);i&&Qt(i)&&"static"===Yt(i).position;)i=Jt(i);return i&&("html"===Rt(i)||"body"===Rt(i)&&"static"===Yt(i).position)?e:i||function(t){var e=-1!==navigator.userAgent.toLowerCase().indexOf("firefox");if(-1!==navigator.userAgent.indexOf("Trident")&&zt(t)&&"fixed"===Yt(t).position)return null;for(var i=Zt(t);zt(i)&&["html","body"].indexOf(Rt(i))<0;){var n=Yt(i);if("none"!==n.transform||"none"!==n.perspective||"paint"===n.contain||-1!==["transform","perspective"].indexOf(n.willChange)||e&&"filter"===n.willChange||e&&n.filter&&"none"!==n.filter)return i;i=i.parentNode}return null}(t)||e}function ee(t){return["top","bottom"].indexOf(t)>=0?"x":"y"}var ie=Math.max,ne=Math.min,se=Math.round;function oe(t,e,i){return ie(t,ne(e,i))}function re(t){return Object.assign({},{top:0,right:0,bottom:0,left:0},t)}function ae(t,e){return e.reduce((function(e,i){return e[i]=t,e}),{})}const le={name:"arrow",enabled:!0,phase:"main",fn:function(t){var e,i=t.state,n=t.name,s=t.options,o=i.elements.arrow,r=i.modifiersData.popperOffsets,a=Ut(i.placement),l=ee(a),c=[bt,_t].indexOf(a)>=0?"height":"width";if(o&&r){var h=function(t,e){return re("number"!=typeof(t="function"==typeof t?t(Object.assign({},e.rects,{placement:e.placement})):t)?t:ae(t,yt))}(s.padding,i),d=Kt(o),u="y"===l?mt:bt,f="y"===l?gt:_t,p=i.rects.reference[c]+i.rects.reference[l]-r[l]-i.rects.popper[c],m=r[l]-i.rects.reference[l],g=te(o),_=g?"y"===l?g.clientHeight||0:g.clientWidth||0:0,b=p/2-m/2,v=h[u],y=_-d[c]-h[f],w=_/2-d[c]/2+b,E=oe(v,w,y),A=l;i.modifiersData[n]=((e={})[A]=E,e.centerOffset=E-w,e)}},effect:function(t){var e=t.state,i=t.options.element,n=void 0===i?"[data-popper-arrow]":i;null!=n&&("string"!=typeof n||(n=e.elements.popper.querySelector(n)))&&Xt(e.elements.popper,n)&&(e.elements.arrow=n)},requires:["popperOffsets"],requiresIfExists:["preventOverflow"]};function ce(t){return t.split("-")[1]}var he={top:"auto",right:"auto",bottom:"auto",left:"auto"};function de(t){var 
e,i=t.popper,n=t.popperRect,s=t.placement,o=t.variation,r=t.offsets,a=t.position,l=t.gpuAcceleration,c=t.adaptive,h=t.roundOffsets,d=!0===h?function(t){var e=t.x,i=t.y,n=window.devicePixelRatio||1;return{x:se(se(e*n)/n)||0,y:se(se(i*n)/n)||0}}(r):"function"==typeof h?h(r):r,u=d.x,f=void 0===u?0:u,p=d.y,m=void 0===p?0:p,g=r.hasOwnProperty("x"),_=r.hasOwnProperty("y"),b=bt,v=mt,y=window;if(c){var w=te(i),E="clientHeight",A="clientWidth";w===Wt(i)&&"static"!==Yt(w=Gt(i)).position&&"absolute"===a&&(E="scrollHeight",A="scrollWidth"),w=w,s!==mt&&(s!==bt&&s!==_t||o!==Et)||(v=gt,m-=w[E]-n.height,m*=l?1:-1),s!==bt&&(s!==mt&&s!==gt||o!==Et)||(b=_t,f-=w[A]-n.width,f*=l?1:-1)}var T,O=Object.assign({position:a},c&&he);return l?Object.assign({},O,((T={})[v]=_?"0":"",T[b]=g?"0":"",T.transform=(y.devicePixelRatio||1)<=1?"translate("+f+"px, "+m+"px)":"translate3d("+f+"px, "+m+"px, 0)",T)):Object.assign({},O,((e={})[v]=_?m+"px":"",e[b]=g?f+"px":"",e.transform="",e))}const ue={name:"computeStyles",enabled:!0,phase:"beforeWrite",fn:function(t){var e=t.state,i=t.options,n=i.gpuAcceleration,s=void 0===n||n,o=i.adaptive,r=void 0===o||o,a=i.roundOffsets,l=void 0===a||a,c={placement:Ut(e.placement),variation:ce(e.placement),popper:e.elements.popper,popperRect:e.rects.popper,gpuAcceleration:s};null!=e.modifiersData.popperOffsets&&(e.styles.popper=Object.assign({},e.styles.popper,de(Object.assign({},c,{offsets:e.modifiersData.popperOffsets,position:e.options.strategy,adaptive:r,roundOffsets:l})))),null!=e.modifiersData.arrow&&(e.styles.arrow=Object.assign({},e.styles.arrow,de(Object.assign({},c,{offsets:e.modifiersData.arrow,position:"absolute",adaptive:!1,roundOffsets:l})))),e.attributes.popper=Object.assign({},e.attributes.popper,{"data-popper-placement":e.placement})},data:{}};var fe={passive:!0};const pe={name:"eventListeners",enabled:!0,phase:"write",fn:function(){},effect:function(t){var e=t.state,i=t.instance,n=t.options,s=n.scroll,o=void 0===s||s,r=n.resize,a=void 0===r||r,l=Wt(e.elements.popper),c=[].concat(e.scrollParents.reference,e.scrollParents.popper);return o&&c.forEach((function(t){t.addEventListener("scroll",i.update,fe)})),a&&l.addEventListener("resize",i.update,fe),function(){o&&c.forEach((function(t){t.removeEventListener("scroll",i.update,fe)})),a&&l.removeEventListener("resize",i.update,fe)}},data:{}};var me={left:"right",right:"left",bottom:"top",top:"bottom"};function ge(t){return t.replace(/left|right|bottom|top/g,(function(t){return me[t]}))}var _e={start:"end",end:"start"};function be(t){return t.replace(/start|end/g,(function(t){return _e[t]}))}function ve(t){var e=Wt(t);return{scrollLeft:e.pageXOffset,scrollTop:e.pageYOffset}}function ye(t){return Vt(Gt(t)).left+ve(t).scrollLeft}function we(t){var e=Yt(t),i=e.overflow,n=e.overflowX,s=e.overflowY;return/auto|scroll|overlay|hidden/.test(i+s+n)}function Ee(t){return["html","body","#document"].indexOf(Rt(t))>=0?t.ownerDocument.body:zt(t)&&we(t)?t:Ee(Zt(t))}function Ae(t,e){var i;void 0===e&&(e=[]);var n=Ee(t),s=n===(null==(i=t.ownerDocument)?void 0:i.body),o=Wt(n),r=s?[o].concat(o.visualViewport||[],we(n)?n:[]):n,a=e.concat(r);return s?a:a.concat(Ae(Zt(r)))}function Te(t){return Object.assign({},t,{left:t.x,top:t.y,right:t.x+t.width,bottom:t.y+t.height})}function Oe(t,e){return e===Tt?Te(function(t){var e=Wt(t),i=Gt(t),n=e.visualViewport,s=i.clientWidth,o=i.clientHeight,r=0,a=0;return 
n&&(s=n.width,o=n.height,/^((?!chrome|android).)*safari/i.test(navigator.userAgent)||(r=n.offsetLeft,a=n.offsetTop)),{width:s,height:o,x:r+ye(t),y:a}}(t)):zt(e)?function(t){var e=Vt(t);return e.top=e.top+t.clientTop,e.left=e.left+t.clientLeft,e.bottom=e.top+t.clientHeight,e.right=e.left+t.clientWidth,e.width=t.clientWidth,e.height=t.clientHeight,e.x=e.left,e.y=e.top,e}(e):Te(function(t){var e,i=Gt(t),n=ve(t),s=null==(e=t.ownerDocument)?void 0:e.body,o=ie(i.scrollWidth,i.clientWidth,s?s.scrollWidth:0,s?s.clientWidth:0),r=ie(i.scrollHeight,i.clientHeight,s?s.scrollHeight:0,s?s.clientHeight:0),a=-n.scrollLeft+ye(t),l=-n.scrollTop;return"rtl"===Yt(s||i).direction&&(a+=ie(i.clientWidth,s?s.clientWidth:0)-o),{width:o,height:r,x:a,y:l}}(Gt(t)))}function Ce(t){var e,i=t.reference,n=t.element,s=t.placement,o=s?Ut(s):null,r=s?ce(s):null,a=i.x+i.width/2-n.width/2,l=i.y+i.height/2-n.height/2;switch(o){case mt:e={x:a,y:i.y-n.height};break;case gt:e={x:a,y:i.y+i.height};break;case _t:e={x:i.x+i.width,y:l};break;case bt:e={x:i.x-n.width,y:l};break;default:e={x:i.x,y:i.y}}var c=o?ee(o):null;if(null!=c){var h="y"===c?"height":"width";switch(r){case wt:e[c]=e[c]-(i[h]/2-n[h]/2);break;case Et:e[c]=e[c]+(i[h]/2-n[h]/2)}}return e}function ke(t,e){void 0===e&&(e={});var i=e,n=i.placement,s=void 0===n?t.placement:n,o=i.boundary,r=void 0===o?At:o,a=i.rootBoundary,l=void 0===a?Tt:a,c=i.elementContext,h=void 0===c?Ot:c,d=i.altBoundary,u=void 0!==d&&d,f=i.padding,p=void 0===f?0:f,m=re("number"!=typeof p?p:ae(p,yt)),g=h===Ot?Ct:Ot,_=t.rects.popper,b=t.elements[u?g:h],v=function(t,e,i){var n="clippingParents"===e?function(t){var e=Ae(Zt(t)),i=["absolute","fixed"].indexOf(Yt(t).position)>=0&&zt(t)?te(t):t;return $t(i)?e.filter((function(t){return $t(t)&&Xt(t,i)&&"body"!==Rt(t)})):[]}(t):[].concat(e),s=[].concat(n,[i]),o=s[0],r=s.reduce((function(e,i){var n=Oe(t,i);return e.top=ie(n.top,e.top),e.right=ne(n.right,e.right),e.bottom=ne(n.bottom,e.bottom),e.left=ie(n.left,e.left),e}),Oe(t,o));return r.width=r.right-r.left,r.height=r.bottom-r.top,r.x=r.left,r.y=r.top,r}($t(b)?b:b.contextElement||Gt(t.elements.popper),r,l),y=Vt(t.elements.reference),w=Ce({reference:y,element:_,strategy:"absolute",placement:s}),E=Te(Object.assign({},_,w)),A=h===Ot?E:y,T={top:v.top-A.top+m.top,bottom:A.bottom-v.bottom+m.bottom,left:v.left-A.left+m.left,right:A.right-v.right+m.right},O=t.modifiersData.offset;if(h===Ot&&O){var C=O[s];Object.keys(T).forEach((function(t){var e=[_t,gt].indexOf(t)>=0?1:-1,i=[mt,gt].indexOf(t)>=0?"y":"x";T[t]+=C[i]*e}))}return T}function Le(t,e){void 0===e&&(e={});var i=e,n=i.placement,s=i.boundary,o=i.rootBoundary,r=i.padding,a=i.flipVariations,l=i.allowedAutoPlacements,c=void 0===l?Lt:l,h=ce(n),d=h?a?kt:kt.filter((function(t){return ce(t)===h})):yt,u=d.filter((function(t){return c.indexOf(t)>=0}));0===u.length&&(u=d);var f=u.reduce((function(e,i){return e[i]=ke(t,{placement:i,boundary:s,rootBoundary:o,padding:r})[Ut(i)],e}),{});return Object.keys(f).sort((function(t,e){return f[t]-f[e]}))}const xe={name:"flip",enabled:!0,phase:"main",fn:function(t){var e=t.state,i=t.options,n=t.name;if(!e.modifiersData[n]._skip){for(var s=i.mainAxis,o=void 0===s||s,r=i.altAxis,a=void 0===r||r,l=i.fallbackPlacements,c=i.padding,h=i.boundary,d=i.rootBoundary,u=i.altBoundary,f=i.flipVariations,p=void 0===f||f,m=i.allowedAutoPlacements,g=e.options.placement,_=Ut(g),b=l||(_!==g&&p?function(t){if(Ut(t)===vt)return[];var e=ge(t);return[be(t),e,be(e)]}(g):[ge(g)]),v=[g].concat(b).reduce((function(t,i){return 
t.concat(Ut(i)===vt?Le(e,{placement:i,boundary:h,rootBoundary:d,padding:c,flipVariations:p,allowedAutoPlacements:m}):i)}),[]),y=e.rects.reference,w=e.rects.popper,E=new Map,A=!0,T=v[0],O=0;O=0,D=x?"width":"height",S=ke(e,{placement:C,boundary:h,rootBoundary:d,altBoundary:u,padding:c}),N=x?L?_t:bt:L?gt:mt;y[D]>w[D]&&(N=ge(N));var I=ge(N),P=[];if(o&&P.push(S[k]<=0),a&&P.push(S[N]<=0,S[I]<=0),P.every((function(t){return t}))){T=C,A=!1;break}E.set(C,P)}if(A)for(var j=function(t){var e=v.find((function(e){var i=E.get(e);if(i)return i.slice(0,t).every((function(t){return t}))}));if(e)return T=e,"break"},M=p?3:1;M>0&&"break"!==j(M);M--);e.placement!==T&&(e.modifiersData[n]._skip=!0,e.placement=T,e.reset=!0)}},requiresIfExists:["offset"],data:{_skip:!1}};function De(t,e,i){return void 0===i&&(i={x:0,y:0}),{top:t.top-e.height-i.y,right:t.right-e.width+i.x,bottom:t.bottom-e.height+i.y,left:t.left-e.width-i.x}}function Se(t){return[mt,_t,gt,bt].some((function(e){return t[e]>=0}))}const Ne={name:"hide",enabled:!0,phase:"main",requiresIfExists:["preventOverflow"],fn:function(t){var e=t.state,i=t.name,n=e.rects.reference,s=e.rects.popper,o=e.modifiersData.preventOverflow,r=ke(e,{elementContext:"reference"}),a=ke(e,{altBoundary:!0}),l=De(r,n),c=De(a,s,o),h=Se(l),d=Se(c);e.modifiersData[i]={referenceClippingOffsets:l,popperEscapeOffsets:c,isReferenceHidden:h,hasPopperEscaped:d},e.attributes.popper=Object.assign({},e.attributes.popper,{"data-popper-reference-hidden":h,"data-popper-escaped":d})}},Ie={name:"offset",enabled:!0,phase:"main",requires:["popperOffsets"],fn:function(t){var e=t.state,i=t.options,n=t.name,s=i.offset,o=void 0===s?[0,0]:s,r=Lt.reduce((function(t,i){return t[i]=function(t,e,i){var n=Ut(t),s=[bt,mt].indexOf(n)>=0?-1:1,o="function"==typeof i?i(Object.assign({},e,{placement:t})):i,r=o[0],a=o[1];return r=r||0,a=(a||0)*s,[bt,_t].indexOf(n)>=0?{x:a,y:r}:{x:r,y:a}}(i,e.rects,o),t}),{}),a=r[e.placement],l=a.x,c=a.y;null!=e.modifiersData.popperOffsets&&(e.modifiersData.popperOffsets.x+=l,e.modifiersData.popperOffsets.y+=c),e.modifiersData[n]=r}},Pe={name:"popperOffsets",enabled:!0,phase:"read",fn:function(t){var e=t.state,i=t.name;e.modifiersData[i]=Ce({reference:e.rects.reference,element:e.rects.popper,strategy:"absolute",placement:e.placement})},data:{}},je={name:"preventOverflow",enabled:!0,phase:"main",fn:function(t){var e=t.state,i=t.options,n=t.name,s=i.mainAxis,o=void 0===s||s,r=i.altAxis,a=void 0!==r&&r,l=i.boundary,c=i.rootBoundary,h=i.altBoundary,d=i.padding,u=i.tether,f=void 0===u||u,p=i.tetherOffset,m=void 0===p?0:p,g=ke(e,{boundary:l,rootBoundary:c,padding:d,altBoundary:h}),_=Ut(e.placement),b=ce(e.placement),v=!b,y=ee(_),w="x"===y?"y":"x",E=e.modifiersData.popperOffsets,A=e.rects.reference,T=e.rects.popper,O="function"==typeof m?m(Object.assign({},e.rects,{placement:e.placement})):m,C={x:0,y:0};if(E){if(o||a){var k="y"===y?mt:bt,L="y"===y?gt:_t,x="y"===y?"height":"width",D=E[y],S=E[y]+g[k],N=E[y]-g[L],I=f?-T[x]/2:0,P=b===wt?A[x]:T[x],j=b===wt?-T[x]:-A[x],M=e.elements.arrow,H=f&&M?Kt(M):{width:0,height:0},B=e.modifiersData["arrow#persistent"]?e.modifiersData["arrow#persistent"].padding:{top:0,right:0,bottom:0,left:0},R=B[k],W=B[L],$=oe(0,A[x],H[x]),z=v?A[x]/2-I-$-R-O:P-$-R-O,q=v?-A[x]/2+I+$+W+O:j+$+W+O,F=e.elements.arrow&&te(e.elements.arrow),U=F?"y"===y?F.clientTop||0:F.clientLeft||0:0,V=e.modifiersData.offset?e.modifiersData.offset[e.placement][y]:0,K=E[y]+z-V-U,X=E[y]+q-V;if(o){var Y=oe(f?ne(S,K):S,D,f?ie(N,X):N);E[y]=Y,C[y]=Y-D}if(a){var 
Q="x"===y?mt:bt,G="x"===y?gt:_t,Z=E[w],J=Z+g[Q],tt=Z-g[G],et=oe(f?ne(J,K):J,Z,f?ie(tt,X):tt);E[w]=et,C[w]=et-Z}}e.modifiersData[n]=C}},requiresIfExists:["offset"]};function Me(t,e,i){void 0===i&&(i=!1);var n=zt(e);zt(e)&&function(t){var e=t.getBoundingClientRect();e.width,t.offsetWidth,e.height,t.offsetHeight}(e);var s,o,r=Gt(e),a=Vt(t),l={scrollLeft:0,scrollTop:0},c={x:0,y:0};return(n||!n&&!i)&&(("body"!==Rt(e)||we(r))&&(l=(s=e)!==Wt(s)&&zt(s)?{scrollLeft:(o=s).scrollLeft,scrollTop:o.scrollTop}:ve(s)),zt(e)?((c=Vt(e)).x+=e.clientLeft,c.y+=e.clientTop):r&&(c.x=ye(r))),{x:a.left+l.scrollLeft-c.x,y:a.top+l.scrollTop-c.y,width:a.width,height:a.height}}function He(t){var e=new Map,i=new Set,n=[];function s(t){i.add(t.name),[].concat(t.requires||[],t.requiresIfExists||[]).forEach((function(t){if(!i.has(t)){var n=e.get(t);n&&s(n)}})),n.push(t)}return t.forEach((function(t){e.set(t.name,t)})),t.forEach((function(t){i.has(t.name)||s(t)})),n}var Be={placement:"bottom",modifiers:[],strategy:"absolute"};function Re(){for(var t=arguments.length,e=new Array(t),i=0;ij.on(t,"mouseover",d))),this._element.focus(),this._element.setAttribute("aria-expanded",!0),this._menu.classList.add(Je),this._element.classList.add(Je),j.trigger(this._element,"shown.bs.dropdown",t)}hide(){if(c(this._element)||!this._isShown(this._menu))return;const t={relatedTarget:this._element};this._completeHide(t)}dispose(){this._popper&&this._popper.destroy(),super.dispose()}update(){this._inNavbar=this._detectNavbar(),this._popper&&this._popper.update()}_completeHide(t){j.trigger(this._element,"hide.bs.dropdown",t).defaultPrevented||("ontouchstart"in document.documentElement&&[].concat(...document.body.children).forEach((t=>j.off(t,"mouseover",d))),this._popper&&this._popper.destroy(),this._menu.classList.remove(Je),this._element.classList.remove(Je),this._element.setAttribute("aria-expanded","false"),U.removeDataAttribute(this._menu,"popper"),j.trigger(this._element,"hidden.bs.dropdown",t))}_getConfig(t){if(t={...this.constructor.Default,...U.getDataAttributes(this._element),...t},a(Ue,t,this.constructor.DefaultType),"object"==typeof t.reference&&!o(t.reference)&&"function"!=typeof t.reference.getBoundingClientRect)throw new TypeError(`${Ue.toUpperCase()}: Option "reference" provided type "object" without a required "getBoundingClientRect" method.`);return t}_createPopper(t){if(void 0===Fe)throw new TypeError("Bootstrap's dropdowns require Popper (https://popper.js.org)");let e=this._element;"parent"===this._config.reference?e=t:o(this._config.reference)?e=r(this._config.reference):"object"==typeof this._config.reference&&(e=this._config.reference);const i=this._getPopperConfig(),n=i.modifiers.find((t=>"applyStyles"===t.name&&!1===t.enabled));this._popper=qe(e,this._menu,i),n&&U.setDataAttribute(this._menu,"popper","static")}_isShown(t=this._element){return t.classList.contains(Je)}_getMenuElement(){return V.next(this._element,ei)[0]}_getPlacement(){const t=this._element.parentNode;if(t.classList.contains("dropend"))return ri;if(t.classList.contains("dropstart"))return ai;const e="end"===getComputedStyle(this._menu).getPropertyValue("--bs-position").trim();return t.classList.contains("dropup")?e?ni:ii:e?oi:si}_detectNavbar(){return null!==this._element.closest(".navbar")}_getOffset(){const{offset:t}=this._config;return"string"==typeof t?t.split(",").map((t=>Number.parseInt(t,10))):"function"==typeof t?e=>t(e,this._element):t}_getPopperConfig(){const 
t={placement:this._getPlacement(),modifiers:[{name:"preventOverflow",options:{boundary:this._config.boundary}},{name:"offset",options:{offset:this._getOffset()}}]};return"static"===this._config.display&&(t.modifiers=[{name:"applyStyles",enabled:!1}]),{...t,..."function"==typeof this._config.popperConfig?this._config.popperConfig(t):this._config.popperConfig}}_selectMenuItem({key:t,target:e}){const i=V.find(".dropdown-menu .dropdown-item:not(.disabled):not(:disabled)",this._menu).filter(l);i.length&&v(i,e,t===Ye,!i.includes(e)).focus()}static jQueryInterface(t){return this.each((function(){const e=hi.getOrCreateInstance(this,t);if("string"==typeof t){if(void 0===e[t])throw new TypeError(`No method named "${t}"`);e[t]()}}))}static clearMenus(t){if(t&&(2===t.button||"keyup"===t.type&&"Tab"!==t.key))return;const e=V.find(ti);for(let i=0,n=e.length;ie+t)),this._setElementAttributes(di,"paddingRight",(e=>e+t)),this._setElementAttributes(ui,"marginRight",(e=>e-t))}_disableOverFlow(){this._saveInitialAttribute(this._element,"overflow"),this._element.style.overflow="hidden"}_setElementAttributes(t,e,i){const n=this.getWidth();this._applyManipulationCallback(t,(t=>{if(t!==this._element&&window.innerWidth>t.clientWidth+n)return;this._saveInitialAttribute(t,e);const s=window.getComputedStyle(t)[e];t.style[e]=`${i(Number.parseFloat(s))}px`}))}reset(){this._resetElementAttributes(this._element,"overflow"),this._resetElementAttributes(this._element,"paddingRight"),this._resetElementAttributes(di,"paddingRight"),this._resetElementAttributes(ui,"marginRight")}_saveInitialAttribute(t,e){const i=t.style[e];i&&U.setDataAttribute(t,e,i)}_resetElementAttributes(t,e){this._applyManipulationCallback(t,(t=>{const i=U.getDataAttribute(t,e);void 0===i?t.style.removeProperty(e):(U.removeDataAttribute(t,e),t.style[e]=i)}))}_applyManipulationCallback(t,e){o(t)?e(t):V.find(t,this._element).forEach(e)}isOverflowing(){return this.getWidth()>0}}const pi={className:"modal-backdrop",isVisible:!0,isAnimated:!1,rootElement:"body",clickCallback:null},mi={className:"string",isVisible:"boolean",isAnimated:"boolean",rootElement:"(element|string)",clickCallback:"(function|null)"},gi="show",_i="mousedown.bs.backdrop";class bi{constructor(t){this._config=this._getConfig(t),this._isAppended=!1,this._element=null}show(t){this._config.isVisible?(this._append(),this._config.isAnimated&&u(this._getElement()),this._getElement().classList.add(gi),this._emulateAnimation((()=>{_(t)}))):_(t)}hide(t){this._config.isVisible?(this._getElement().classList.remove(gi),this._emulateAnimation((()=>{this.dispose(),_(t)}))):_(t)}_getElement(){if(!this._element){const t=document.createElement("div");t.className=this._config.className,this._config.isAnimated&&t.classList.add("fade"),this._element=t}return this._element}_getConfig(t){return(t={...pi,..."object"==typeof t?t:{}}).rootElement=r(t.rootElement),a("backdrop",t,mi),t}_append(){this._isAppended||(this._config.rootElement.append(this._getElement()),j.on(this._getElement(),_i,(()=>{_(this._config.clickCallback)})),this._isAppended=!0)}dispose(){this._isAppended&&(j.off(this._element,_i),this._element.remove(),this._isAppended=!1)}_emulateAnimation(t){b(t,this._getElement(),this._config.isAnimated)}}const vi={trapElement:null,autofocus:!0},yi={trapElement:"element",autofocus:"boolean"},wi=".bs.focustrap",Ei="backward";class 
Ai{constructor(t){this._config=this._getConfig(t),this._isActive=!1,this._lastTabNavDirection=null}activate(){const{trapElement:t,autofocus:e}=this._config;this._isActive||(e&&t.focus(),j.off(document,wi),j.on(document,"focusin.bs.focustrap",(t=>this._handleFocusin(t))),j.on(document,"keydown.tab.bs.focustrap",(t=>this._handleKeydown(t))),this._isActive=!0)}deactivate(){this._isActive&&(this._isActive=!1,j.off(document,wi))}_handleFocusin(t){const{target:e}=t,{trapElement:i}=this._config;if(e===document||e===i||i.contains(e))return;const n=V.focusableChildren(i);0===n.length?i.focus():this._lastTabNavDirection===Ei?n[n.length-1].focus():n[0].focus()}_handleKeydown(t){"Tab"===t.key&&(this._lastTabNavDirection=t.shiftKey?Ei:"forward")}_getConfig(t){return t={...vi,..."object"==typeof t?t:{}},a("focustrap",t,yi),t}}const Ti="modal",Oi="Escape",Ci={backdrop:!0,keyboard:!0,focus:!0},ki={backdrop:"(boolean|string)",keyboard:"boolean",focus:"boolean"},Li="hidden.bs.modal",xi="show.bs.modal",Di="resize.bs.modal",Si="click.dismiss.bs.modal",Ni="keydown.dismiss.bs.modal",Ii="mousedown.dismiss.bs.modal",Pi="modal-open",ji="show",Mi="modal-static";class Hi extends B{constructor(t,e){super(t),this._config=this._getConfig(e),this._dialog=V.findOne(".modal-dialog",this._element),this._backdrop=this._initializeBackDrop(),this._focustrap=this._initializeFocusTrap(),this._isShown=!1,this._ignoreBackdropClick=!1,this._isTransitioning=!1,this._scrollBar=new fi}static get Default(){return Ci}static get NAME(){return Ti}toggle(t){return this._isShown?this.hide():this.show(t)}show(t){this._isShown||this._isTransitioning||j.trigger(this._element,xi,{relatedTarget:t}).defaultPrevented||(this._isShown=!0,this._isAnimated()&&(this._isTransitioning=!0),this._scrollBar.hide(),document.body.classList.add(Pi),this._adjustDialog(),this._setEscapeEvent(),this._setResizeEvent(),j.on(this._dialog,Ii,(()=>{j.one(this._element,"mouseup.dismiss.bs.modal",(t=>{t.target===this._element&&(this._ignoreBackdropClick=!0)}))})),this._showBackdrop((()=>this._showElement(t))))}hide(){if(!this._isShown||this._isTransitioning)return;if(j.trigger(this._element,"hide.bs.modal").defaultPrevented)return;this._isShown=!1;const t=this._isAnimated();t&&(this._isTransitioning=!0),this._setEscapeEvent(),this._setResizeEvent(),this._focustrap.deactivate(),this._element.classList.remove(ji),j.off(this._element,Si),j.off(this._dialog,Ii),this._queueCallback((()=>this._hideModal()),this._element,t)}dispose(){[window,this._dialog].forEach((t=>j.off(t,".bs.modal"))),this._backdrop.dispose(),this._focustrap.deactivate(),super.dispose()}handleUpdate(){this._adjustDialog()}_initializeBackDrop(){return new bi({isVisible:Boolean(this._config.backdrop),isAnimated:this._isAnimated()})}_initializeFocusTrap(){return new Ai({trapElement:this._element})}_getConfig(t){return t={...Ci,...U.getDataAttributes(this._element),..."object"==typeof t?t:{}},a(Ti,t,ki),t}_showElement(t){const 
e=this._isAnimated(),i=V.findOne(".modal-body",this._dialog);this._element.parentNode&&this._element.parentNode.nodeType===Node.ELEMENT_NODE||document.body.append(this._element),this._element.style.display="block",this._element.removeAttribute("aria-hidden"),this._element.setAttribute("aria-modal",!0),this._element.setAttribute("role","dialog"),this._element.scrollTop=0,i&&(i.scrollTop=0),e&&u(this._element),this._element.classList.add(ji),this._queueCallback((()=>{this._config.focus&&this._focustrap.activate(),this._isTransitioning=!1,j.trigger(this._element,"shown.bs.modal",{relatedTarget:t})}),this._dialog,e)}_setEscapeEvent(){this._isShown?j.on(this._element,Ni,(t=>{this._config.keyboard&&t.key===Oi?(t.preventDefault(),this.hide()):this._config.keyboard||t.key!==Oi||this._triggerBackdropTransition()})):j.off(this._element,Ni)}_setResizeEvent(){this._isShown?j.on(window,Di,(()=>this._adjustDialog())):j.off(window,Di)}_hideModal(){this._element.style.display="none",this._element.setAttribute("aria-hidden",!0),this._element.removeAttribute("aria-modal"),this._element.removeAttribute("role"),this._isTransitioning=!1,this._backdrop.hide((()=>{document.body.classList.remove(Pi),this._resetAdjustments(),this._scrollBar.reset(),j.trigger(this._element,Li)}))}_showBackdrop(t){j.on(this._element,Si,(t=>{this._ignoreBackdropClick?this._ignoreBackdropClick=!1:t.target===t.currentTarget&&(!0===this._config.backdrop?this.hide():"static"===this._config.backdrop&&this._triggerBackdropTransition())})),this._backdrop.show(t)}_isAnimated(){return this._element.classList.contains("fade")}_triggerBackdropTransition(){if(j.trigger(this._element,"hidePrevented.bs.modal").defaultPrevented)return;const{classList:t,scrollHeight:e,style:i}=this._element,n=e>document.documentElement.clientHeight;!n&&"hidden"===i.overflowY||t.contains(Mi)||(n||(i.overflowY="hidden"),t.add(Mi),this._queueCallback((()=>{t.remove(Mi),n||this._queueCallback((()=>{i.overflowY=""}),this._dialog)}),this._dialog),this._element.focus())}_adjustDialog(){const t=this._element.scrollHeight>document.documentElement.clientHeight,e=this._scrollBar.getWidth(),i=e>0;(!i&&t&&!m()||i&&!t&&m())&&(this._element.style.paddingLeft=`${e}px`),(i&&!t&&!m()||!i&&t&&m())&&(this._element.style.paddingRight=`${e}px`)}_resetAdjustments(){this._element.style.paddingLeft="",this._element.style.paddingRight=""}static jQueryInterface(t,e){return this.each((function(){const i=Hi.getOrCreateInstance(this,t);if("string"==typeof t){if(void 0===i[t])throw new TypeError(`No method named "${t}"`);i[t](e)}}))}}j.on(document,"click.bs.modal.data-api",'[data-bs-toggle="modal"]',(function(t){const e=n(this);["A","AREA"].includes(this.tagName)&&t.preventDefault(),j.one(e,xi,(t=>{t.defaultPrevented||j.one(e,Li,(()=>{l(this)&&this.focus()}))}));const i=V.findOne(".modal.show");i&&Hi.getInstance(i).hide(),Hi.getOrCreateInstance(e).toggle(this)})),R(Hi),g(Hi);const Bi="offcanvas",Ri={backdrop:!0,keyboard:!0,scroll:!1},Wi={backdrop:"boolean",keyboard:"boolean",scroll:"boolean"},$i="show",zi=".offcanvas.show",qi="hidden.bs.offcanvas";class Fi extends B{constructor(t,e){super(t),this._config=this._getConfig(e),this._isShown=!1,this._backdrop=this._initializeBackDrop(),this._focustrap=this._initializeFocusTrap(),this._addEventListeners()}static get NAME(){return Bi}static get Default(){return Ri}toggle(t){return 
this._isShown?this.hide():this.show(t)}show(t){this._isShown||j.trigger(this._element,"show.bs.offcanvas",{relatedTarget:t}).defaultPrevented||(this._isShown=!0,this._element.style.visibility="visible",this._backdrop.show(),this._config.scroll||(new fi).hide(),this._element.removeAttribute("aria-hidden"),this._element.setAttribute("aria-modal",!0),this._element.setAttribute("role","dialog"),this._element.classList.add($i),this._queueCallback((()=>{this._config.scroll||this._focustrap.activate(),j.trigger(this._element,"shown.bs.offcanvas",{relatedTarget:t})}),this._element,!0))}hide(){this._isShown&&(j.trigger(this._element,"hide.bs.offcanvas").defaultPrevented||(this._focustrap.deactivate(),this._element.blur(),this._isShown=!1,this._element.classList.remove($i),this._backdrop.hide(),this._queueCallback((()=>{this._element.setAttribute("aria-hidden",!0),this._element.removeAttribute("aria-modal"),this._element.removeAttribute("role"),this._element.style.visibility="hidden",this._config.scroll||(new fi).reset(),j.trigger(this._element,qi)}),this._element,!0)))}dispose(){this._backdrop.dispose(),this._focustrap.deactivate(),super.dispose()}_getConfig(t){return t={...Ri,...U.getDataAttributes(this._element),..."object"==typeof t?t:{}},a(Bi,t,Wi),t}_initializeBackDrop(){return new bi({className:"offcanvas-backdrop",isVisible:this._config.backdrop,isAnimated:!0,rootElement:this._element.parentNode,clickCallback:()=>this.hide()})}_initializeFocusTrap(){return new Ai({trapElement:this._element})}_addEventListeners(){j.on(this._element,"keydown.dismiss.bs.offcanvas",(t=>{this._config.keyboard&&"Escape"===t.key&&this.hide()}))}static jQueryInterface(t){return this.each((function(){const e=Fi.getOrCreateInstance(this,t);if("string"==typeof t){if(void 0===e[t]||t.startsWith("_")||"constructor"===t)throw new TypeError(`No method named "${t}"`);e[t](this)}}))}}j.on(document,"click.bs.offcanvas.data-api",'[data-bs-toggle="offcanvas"]',(function(t){const e=n(this);if(["A","AREA"].includes(this.tagName)&&t.preventDefault(),c(this))return;j.one(e,qi,(()=>{l(this)&&this.focus()}));const i=V.findOne(zi);i&&i!==e&&Fi.getInstance(i).hide(),Fi.getOrCreateInstance(e).toggle(this)})),j.on(window,"load.bs.offcanvas.data-api",(()=>V.find(zi).forEach((t=>Fi.getOrCreateInstance(t).show())))),R(Fi),g(Fi);const Ui=new Set(["background","cite","href","itemtype","longdesc","poster","src","xlink:href"]),Vi=/^(?:(?:https?|mailto|ftp|tel|file|sms):|[^#&/:?]*(?:[#/?]|$))/i,Ki=/^data:(?:image\/(?:bmp|gif|jpeg|jpg|png|tiff|webp)|video\/(?:mpeg|mp4|ogg|webm)|audio\/(?:mp3|oga|ogg|opus));base64,[\d+/a-z]+=*$/i,Xi=(t,e)=>{const i=t.nodeName.toLowerCase();if(e.includes(i))return!Ui.has(i)||Boolean(Vi.test(t.nodeValue)||Ki.test(t.nodeValue));const n=e.filter((t=>t instanceof RegExp));for(let t=0,e=n.length;t{Xi(t,r)||i.removeAttribute(t.nodeName)}))}return n.body.innerHTML}const Qi="tooltip",Gi=new Set(["sanitize","allowList","sanitizeFn"]),Zi={animation:"boolean",template:"string",title:"(string|element|function)",trigger:"string",delay:"(number|object)",html:"boolean",selector:"(string|boolean)",placement:"(string|function)",offset:"(array|string|function)",container:"(string|element|boolean)",fallbackPlacements:"array",boundary:"(string|element)",customClass:"(string|function)",sanitize:"boolean",sanitizeFn:"(null|function)",allowList:"object",popperConfig:"(null|object|function)"},Ji={AUTO:"auto",TOP:"top",RIGHT:m()?"left":"right",BOTTOM:"bottom",LEFT:m()?"right":"left"},tn={animation:!0,template:'
',trigger:"hover focus",title:"",delay:0,html:!1,selector:!1,placement:"top",offset:[0,0],container:!1,fallbackPlacements:["top","right","bottom","left"],boundary:"clippingParents",customClass:"",sanitize:!0,sanitizeFn:null,allowList:{"*":["class","dir","id","lang","role",/^aria-[\w-]*$/i],a:["target","href","title","rel"],area:[],b:[],br:[],col:[],code:[],div:[],em:[],hr:[],h1:[],h2:[],h3:[],h4:[],h5:[],h6:[],i:[],img:["src","srcset","alt","title","width","height"],li:[],ol:[],p:[],pre:[],s:[],small:[],span:[],sub:[],sup:[],strong:[],u:[],ul:[]},popperConfig:null},en={HIDE:"hide.bs.tooltip",HIDDEN:"hidden.bs.tooltip",SHOW:"show.bs.tooltip",SHOWN:"shown.bs.tooltip",INSERTED:"inserted.bs.tooltip",CLICK:"click.bs.tooltip",FOCUSIN:"focusin.bs.tooltip",FOCUSOUT:"focusout.bs.tooltip",MOUSEENTER:"mouseenter.bs.tooltip",MOUSELEAVE:"mouseleave.bs.tooltip"},nn="fade",sn="show",on="show",rn="out",an=".tooltip-inner",ln=".modal",cn="hide.bs.modal",hn="hover",dn="focus";class un extends B{constructor(t,e){if(void 0===Fe)throw new TypeError("Bootstrap's tooltips require Popper (https://popper.js.org)");super(t),this._isEnabled=!0,this._timeout=0,this._hoverState="",this._activeTrigger={},this._popper=null,this._config=this._getConfig(e),this.tip=null,this._setListeners()}static get Default(){return tn}static get NAME(){return Qi}static get Event(){return en}static get DefaultType(){return Zi}enable(){this._isEnabled=!0}disable(){this._isEnabled=!1}toggleEnabled(){this._isEnabled=!this._isEnabled}toggle(t){if(this._isEnabled)if(t){const e=this._initializeOnDelegatedTarget(t);e._activeTrigger.click=!e._activeTrigger.click,e._isWithActiveTrigger()?e._enter(null,e):e._leave(null,e)}else{if(this.getTipElement().classList.contains(sn))return void this._leave(null,this);this._enter(null,this)}}dispose(){clearTimeout(this._timeout),j.off(this._element.closest(ln),cn,this._hideModalHandler),this.tip&&this.tip.remove(),this._disposePopper(),super.dispose()}show(){if("none"===this._element.style.display)throw new Error("Please use show on visible elements");if(!this.isWithContent()||!this._isEnabled)return;const t=j.trigger(this._element,this.constructor.Event.SHOW),e=h(this._element),i=null===e?this._element.ownerDocument.documentElement.contains(this._element):e.contains(this._element);if(t.defaultPrevented||!i)return;"tooltip"===this.constructor.NAME&&this.tip&&this.getTitle()!==this.tip.querySelector(an).innerHTML&&(this._disposePopper(),this.tip.remove(),this.tip=null);const n=this.getTipElement(),s=(t=>{do{t+=Math.floor(1e6*Math.random())}while(document.getElementById(t));return t})(this.constructor.NAME);n.setAttribute("id",s),this._element.setAttribute("aria-describedby",s),this._config.animation&&n.classList.add(nn);const o="function"==typeof this._config.placement?this._config.placement.call(this,n,this._element):this._config.placement,r=this._getAttachment(o);this._addAttachmentClass(r);const{container:a}=this._config;H.set(n,this.constructor.DATA_KEY,this),this._element.ownerDocument.documentElement.contains(this.tip)||(a.append(n),j.trigger(this._element,this.constructor.Event.INSERTED)),this._popper?this._popper.update():this._popper=qe(this._element,n,this._getPopperConfig(r)),n.classList.add(sn);const l=this._resolvePossibleFunction(this._config.customClass);l&&n.classList.add(...l.split(" ")),"ontouchstart"in document.documentElement&&[].concat(...document.body.children).forEach((t=>{j.on(t,"mouseover",d)}));const c=this.tip.classList.contains(nn);this._queueCallback((()=>{const 
t=this._hoverState;this._hoverState=null,j.trigger(this._element,this.constructor.Event.SHOWN),t===rn&&this._leave(null,this)}),this.tip,c)}hide(){if(!this._popper)return;const t=this.getTipElement();if(j.trigger(this._element,this.constructor.Event.HIDE).defaultPrevented)return;t.classList.remove(sn),"ontouchstart"in document.documentElement&&[].concat(...document.body.children).forEach((t=>j.off(t,"mouseover",d))),this._activeTrigger.click=!1,this._activeTrigger.focus=!1,this._activeTrigger.hover=!1;const e=this.tip.classList.contains(nn);this._queueCallback((()=>{this._isWithActiveTrigger()||(this._hoverState!==on&&t.remove(),this._cleanTipClass(),this._element.removeAttribute("aria-describedby"),j.trigger(this._element,this.constructor.Event.HIDDEN),this._disposePopper())}),this.tip,e),this._hoverState=""}update(){null!==this._popper&&this._popper.update()}isWithContent(){return Boolean(this.getTitle())}getTipElement(){if(this.tip)return this.tip;const t=document.createElement("div");t.innerHTML=this._config.template;const e=t.children[0];return this.setContent(e),e.classList.remove(nn,sn),this.tip=e,this.tip}setContent(t){this._sanitizeAndSetContent(t,this.getTitle(),an)}_sanitizeAndSetContent(t,e,i){const n=V.findOne(i,t);e||!n?this.setElementContent(n,e):n.remove()}setElementContent(t,e){if(null!==t)return o(e)?(e=r(e),void(this._config.html?e.parentNode!==t&&(t.innerHTML="",t.append(e)):t.textContent=e.textContent)):void(this._config.html?(this._config.sanitize&&(e=Yi(e,this._config.allowList,this._config.sanitizeFn)),t.innerHTML=e):t.textContent=e)}getTitle(){const t=this._element.getAttribute("data-bs-original-title")||this._config.title;return this._resolvePossibleFunction(t)}updateAttachment(t){return"right"===t?"end":"left"===t?"start":t}_initializeOnDelegatedTarget(t,e){return e||this.constructor.getOrCreateInstance(t.delegateTarget,this._getDelegateConfig())}_getOffset(){const{offset:t}=this._config;return"string"==typeof t?t.split(",").map((t=>Number.parseInt(t,10))):"function"==typeof t?e=>t(e,this._element):t}_resolvePossibleFunction(t){return"function"==typeof t?t.call(this._element):t}_getPopperConfig(t){const e={placement:t,modifiers:[{name:"flip",options:{fallbackPlacements:this._config.fallbackPlacements}},{name:"offset",options:{offset:this._getOffset()}},{name:"preventOverflow",options:{boundary:this._config.boundary}},{name:"arrow",options:{element:`.${this.constructor.NAME}-arrow`}},{name:"onChange",enabled:!0,phase:"afterWrite",fn:t=>this._handlePopperPlacementChange(t)}],onFirstUpdate:t=>{t.options.placement!==t.placement&&this._handlePopperPlacementChange(t)}};return{...e,..."function"==typeof this._config.popperConfig?this._config.popperConfig(e):this._config.popperConfig}}_addAttachmentClass(t){this.getTipElement().classList.add(`${this._getBasicClassPrefix()}-${this.updateAttachment(t)}`)}_getAttachment(t){return Ji[t.toUpperCase()]}_setListeners(){this._config.trigger.split(" ").forEach((t=>{if("click"===t)j.on(this._element,this.constructor.Event.CLICK,this._config.selector,(t=>this.toggle(t)));else if("manual"!==t){const 
e=t===hn?this.constructor.Event.MOUSEENTER:this.constructor.Event.FOCUSIN,i=t===hn?this.constructor.Event.MOUSELEAVE:this.constructor.Event.FOCUSOUT;j.on(this._element,e,this._config.selector,(t=>this._enter(t))),j.on(this._element,i,this._config.selector,(t=>this._leave(t)))}})),this._hideModalHandler=()=>{this._element&&this.hide()},j.on(this._element.closest(ln),cn,this._hideModalHandler),this._config.selector?this._config={...this._config,trigger:"manual",selector:""}:this._fixTitle()}_fixTitle(){const t=this._element.getAttribute("title"),e=typeof this._element.getAttribute("data-bs-original-title");(t||"string"!==e)&&(this._element.setAttribute("data-bs-original-title",t||""),!t||this._element.getAttribute("aria-label")||this._element.textContent||this._element.setAttribute("aria-label",t),this._element.setAttribute("title",""))}_enter(t,e){e=this._initializeOnDelegatedTarget(t,e),t&&(e._activeTrigger["focusin"===t.type?dn:hn]=!0),e.getTipElement().classList.contains(sn)||e._hoverState===on?e._hoverState=on:(clearTimeout(e._timeout),e._hoverState=on,e._config.delay&&e._config.delay.show?e._timeout=setTimeout((()=>{e._hoverState===on&&e.show()}),e._config.delay.show):e.show())}_leave(t,e){e=this._initializeOnDelegatedTarget(t,e),t&&(e._activeTrigger["focusout"===t.type?dn:hn]=e._element.contains(t.relatedTarget)),e._isWithActiveTrigger()||(clearTimeout(e._timeout),e._hoverState=rn,e._config.delay&&e._config.delay.hide?e._timeout=setTimeout((()=>{e._hoverState===rn&&e.hide()}),e._config.delay.hide):e.hide())}_isWithActiveTrigger(){for(const t in this._activeTrigger)if(this._activeTrigger[t])return!0;return!1}_getConfig(t){const e=U.getDataAttributes(this._element);return Object.keys(e).forEach((t=>{Gi.has(t)&&delete e[t]})),(t={...this.constructor.Default,...e,..."object"==typeof t&&t?t:{}}).container=!1===t.container?document.body:r(t.container),"number"==typeof t.delay&&(t.delay={show:t.delay,hide:t.delay}),"number"==typeof t.title&&(t.title=t.title.toString()),"number"==typeof t.content&&(t.content=t.content.toString()),a(Qi,t,this.constructor.DefaultType),t.sanitize&&(t.template=Yi(t.template,t.allowList,t.sanitizeFn)),t}_getDelegateConfig(){const t={};for(const e in this._config)this.constructor.Default[e]!==this._config[e]&&(t[e]=this._config[e]);return t}_cleanTipClass(){const t=this.getTipElement(),e=new RegExp(`(^|\\s)${this._getBasicClassPrefix()}\\S+`,"g"),i=t.getAttribute("class").match(e);null!==i&&i.length>0&&i.map((t=>t.trim())).forEach((e=>t.classList.remove(e)))}_getBasicClassPrefix(){return"bs-tooltip"}_handlePopperPlacementChange(t){const{state:e}=t;e&&(this.tip=e.elements.popper,this._cleanTipClass(),this._addAttachmentClass(this._getAttachment(e.placement)))}_disposePopper(){this._popper&&(this._popper.destroy(),this._popper=null)}static jQueryInterface(t){return this.each((function(){const e=un.getOrCreateInstance(this,t);if("string"==typeof t){if(void 0===e[t])throw new TypeError(`No method named "${t}"`);e[t]()}}))}}g(un);const fn={...un.Default,placement:"right",offset:[0,8],trigger:"click",content:"",template:'