diff --git a/.quarto/_freeze/optimization/execute-results/html.json b/.quarto/_freeze/optimization/execute-results/html.json
index 44868de..5f6edba 100644
--- a/.quarto/_freeze/optimization/execute-results/html.json
+++ b/.quarto/_freeze/optimization/execute-results/html.json
@@ -1,7 +1,7 @@
 {
 "hash": "d1ac4719dca216662dad7d103405571b",
 "result": {
- "markdown": "---\noutput: \n html_document:\n includes:\n in_header: analytics.html \t\n css: styles.css\n code_folding: show\n toc: TRUE\n
toc_float: TRUE\n pandoc_args:\n \"--tab-stop=2\"\n---\n\n\n\n\n::: {#header}\n\n:::\n\n\n\n\n\n# Introduction\n\nThis guide outlines tools and tips for improving the speed and execution of R code.\n\nSometimes, simply tweaking a few lines of code can lead to large performance gains in the execution of a program. Other issues may take more time to work through but can be a huge benefit to a project in the long term.\n\nAn important lesson to learn when it comes to optimising an R (or any) program is knowing both if to start and when to stop. You most likely want to optimize your code because it is \"too slow\", but what that means will vary from project to project. Be sure to consider what \"fast enough\" is for your project and how much needs to be optimized. If your program takes an hour to complete, spending 5 hours trying to make it faster can be time well spent if the script will be run regularly, and a complete waste of time if it's an ad-hoc analysis.\n\nFor more information, see the CRAN Task View [High-Performance and Parallel Computing with R](https://CRAN.R-project.org/view=HighPerformanceComputing).\n\nThe \"Performant Code\" section of Hadley Wickham's [Advanced R](http://adv-r.had.co.nz/) is another great resource and provides a deeper dive into what is covered in this guide.\n\n------------------------------------------------------------------------\n\n# Update Your Installation\n\nOne of the easiest ways to improve the performance of R is to update R. In general, R will have a big annual release (i.e., 3.5.0) in the spring and around 3-4 smaller patch releases (i.e., 3.5.1) throughout the rest of the year. If the middle digit of your installation is behind the current release, you should consider updating.\n\nFor instance, R 3.5.0 implemented an improved read from text files. A 5GB file took over 5 minutes to read in 3.4.4:\n\n![](optimization/images/data-load-3-4.png){width=\"75%\"}\n\nWhile 3.5.0 took less than half the time:\n\n![](optimization/images/data-load-3-5.png){width=\"75%\"}\n\nTo see what the R-core development team is up to, check out the [NEWS](https://cran.r-project.org/doc/manuals/r-devel/NEWS.html) file from the R project.\n\n------------------------------------------------------------------------\n\n# Profiling & Benchmarking\n\nIn order to efficiently optimize your code, you'll first need to know where it's running slowest. The `profvis` package provides a nice way of visualizing the execution time and memory useage of your program.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(profvis)\nlibrary(dplyr)\n\nprofvis({\n\tdiamonds <- read.csv(\"optimization/data/diamonds.csv\")\n\n\tdiamonds_by_cut <- diamonds %>%\n\t\tgroup_by(cut) %>%\n\t\tsummarise_if(is.numeric, mean)\n\n\twrite.csv(diamonds_by_cut, file = \"optimization/data/diamonds_by_cut.csv\")\n\n})\n```\n\n::: {.cell-output-display}\n```{=html}\n
\n\n```\n:::\n:::\n\n\nIn this toy example it looks like the `read.csv` function is the bottleneck, so\n\nwork on optimizing that first.\n\nOnce you find the bottleneck that needs to be optimized, it can be useful to\n\nbenchmark different potential solutions. The `microbenchmark` package can help\n\nyou choose between different options. Continuing with the simple example with\n\nthe `diamonds` dataset, compare the base `read.csv` function with `read_csv`\n\nfrom the `readr` package.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(microbenchmark)\n\nmicrobenchmark(\n\n read.csv(\"optimization/data/diamonds.csv\"),\n\n readr::read_csv(\"optimization/data/diamonds.csv\")\n\n)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nUnit: milliseconds\n expr min lq mean\n read.csv(\"optimization/data/diamonds.csv\") 64.77053 66.14555 71.04118\n readr::read_csv(\"optimization/data/diamonds.csv\") 32.04847 33.20362 37.12877\n median uq max neval\n 68.25725 69.80660 134.2046 100\n 34.07731 35.69267 158.6962 100\n```\n:::\n:::\n\n\nIn this case, `read_csv` is about twice as fast as the base R implementations.\n\n# Parallel Computing\n\nOften, time-intensive R code can be sped up by breaking the execution of\n\nthe job across additional cores of your computer. This is called parallel computing.\n\n## Learn `lapply`/`purrr::map`\n\nLearning the `lapply` (and variants) function from Base R or the `map` (and variants) function from the `purrr` package is the first step in learning to run R code in parallel. Once you understand how `lapply` and `map` work, running your code in parallel will be simple.\n\nSay you have a vector of numbers and want to find the square root of each one\n\n(ignore for now that `sqrt` is vectorized, which will be covered later).\n\nYou could write a for loop and iterate over each element of the vector:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- c(1, 4, 9, 16)\n\nout <- vector(\"list\", length(x))\n\nfor (i in seq_along(x)) {\n\n out[[i]] <- sqrt(x[[i]])\n\n}\n\nunlist(out)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1 2 3 4\n```\n:::\n:::\n\n\nThe `lapply` function essentially handles the overhead of constructing a for\n\nloop for you. The syntax is:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlapply(X, FUN, ...)\n```\n:::\n\n\n`lapply` will then take each element of `X` and apply the `FUN`ction to it.\n\nOur simple example then becomes:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- c(1, 4, 9, 16)\n\nout <- lapply(x, sqrt)\n\nunlist(out)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1 2 3 4\n```\n:::\n:::\n\n\nThose working within the `tidyverse` may use `map` from the `purrr` package equivalently:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(purrr)\n\nx <- c(1, 4, 9, 16)\n\nout <- map(x, sqrt)\n\nunlist(out)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1 2 3 4\n```\n:::\n:::\n\n\n## Motivating Example\n\nOnce you are comfortable with `lapply` and/or `map`, running the same code in\n\nparallel takes just an additional line of code.\n\nFor `lapply` users, the `future.apply` package contains an equivalent\n\n`future_lapply` function. 
Just be sure to call `plan(multiprocess)` beforehand,\n\nwhich will handle the back-end orchestration needed to run in parallel.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# install.packages(\"future.apply\")\n\nlibrary(future.apply)\n\nplan(multisession)\n\nout <- future_lapply(x, sqrt)\n\nunlist(out)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1 2 3 4\n```\n:::\n:::\n\n\nFor `purrr` users, the `furrr` (i.e., future purrr) package includes an\n\nequivalent `future_map` function:\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# install.packages(\"furrr\")\n\nlibrary(furrr)\n\nplan(multisession)\n\ny <- future_map(x, sqrt)\n\nunlist(y)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1 2 3 4\n```\n:::\n:::\n\n\nHow much faster did this simple example run in parallel?\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(future.apply)\n\nplan(multisession)\n\nx <- c(1, 4, 9, 16)\n\nmicrobenchmark::microbenchmark(\n\n sequential = lapply(x, sqrt),\n\n parallel = future_lapply(x, sqrt),\n\n unit = \"s\"\n\n)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nUnit: seconds\n expr min lq mean median uq\n sequential 0.000001066 0.00000123 0.00000177858 0.0000016195 0.000002132\n parallel 0.014658443 0.01518220 0.01828126942 0.0157106875 0.017450646\n max neval\n 0.000006847 100\n 0.198878864 100\n```\n:::\n:::\n\n\nParallelization was actually slower. In this case, the overhead of\n\nsetting the code to run in parallel far outweighed any performance gain. In\n\ngeneral, parallelization works well on long-running & compute intensive jobs.\n\n## A (somewhat) More Complex Example\n\nIn this example we'll use the `diamonds` dataset from `ggplot2` and perform a\n\nkmeans cluster. We'll use `lapply` to iterate the number of clusters from 2 to\n\n5:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndf <- ggplot2::diamonds\n\ndf <- dplyr::select(df, -c(cut, color, clarity))\n\ncenters = 2:5\n\nsystem.time(\n\n lapply(centers,\n\n function(x) kmeans(df, centers = x, nstart = 500)\n\n )\n\n )\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 21.846 0.663 22.535 \n```\n:::\n:::\n\n\nA now running the same code in parallel:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(future.apply)\n\nplan(multisession)\n\nsystem.time(\n\n future_lapply(centers,\n\n function(x) kmeans(df, centers = x, nstart = 500)\n\n )\n\n )\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 0.376 0.121 11.164 \n```\n:::\n:::\n\n\nWhile we didn't achieve perfect scaling, we still get a nice bump in execution\n\ntime.\n\n## Additional Packages\n\nFor the sake of ease and brevity, this guide focused on the `futures` framework\n\nfor parallelization. However, you should be aware that there are a number of\n\nother ways to parallelize your code.\n\n### The `parallel` Package\n\nThe `parallel` package is included in your base R installation. It includes\n\nanalogues of the various `apply` functions:\n\n- `parLapply`\n\n- `mclapply` - not available on Windows\n\nThese functions generally require more setup, especially on Windows machines.\n\n### The `doParallel` Package\n\nThe `doParallel` package builds off of `parallel` and is\n\nuseful for code that uses for loops instead of `lapply`. Like the parallel\n\npackage, it generally requires more setup, especially on Windows machines.\n\n### Machine Learning - `caret`\n\nFor those running machine learning models, the `caret` package can easily\n\nleverage `doParallel` to speed up the execution of multiple models. 
Lifting\n\nthe example from the package documentation:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(doParallel)\n\ncl <- makePSOCKcluster(5) # number of cores to use\n\nregisterDoParallel(cl)\n\n## All subsequent models are then run in parallel\n\nmodel <- train(y ~ ., data = training, method = \"rf\")\n\n## When you are done:\n\nstopCluster(cl)\n```\n:::\n\n\nBe sure to check out the full\n\n[documentation](http://topepo.github.io/caret/parallel-processing.html)\n\nfor more detail.\n\n------------------------------------------------------------------------\n\n# Big Data\n\nAs data collection and storage becomes easier and cheaper, it is relatively\n\nsimple to obtain relatively large data files. An important point to keep in\n\nmind is that the size of your data will generally expand when it is read\n\nfrom a storage device into R. A general rule of thumb is that a file will take\n\nsomewhere around 3-4 times more space in memory than it does on disk.\n\nFor instance, compare the size of the `iris` data set when it is saved as a\n\n.csv file locally vs the size of the object when it is read in to an R session:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfile.size(\"optimization/data/iris.csv\") / 1000\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 3.716\n```\n:::\n\n```{.r .cell-code}\ndf <- readr::read_csv(\"optimization/data/iris.csv\")\n\npryr::object_size(df)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n10.14 kB\n```\n:::\n:::\n\n\nThis means that on a standard Urban Institute desktop, you may have issues\n\nreading in files that are larger than 4 GB.\n\n## Object Size\n\nThe type of your data can have a big impact on the size of your data frame\n\nwhen you are dealing with larger files. There are four main types of atomic\n\nvectors in R:\n\n1. `logical`\n\n2. `integer`\n\n3. `double` (also called `numeric`)\n\n4. `character`\n\n## Each of these data types occupies a different amount of space in memory\n\n`logical` and `integer` vectors use 4 bytes per element, while a `double` will\n\noccupy 8 bytes. R uses a global string pool, so `character` vectors are hard\n\nto estimate, but will generally take up more space for element.\n\nConsider the following example:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- 1:100\n\npryr::object_size(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n680 B\n```\n:::\n\n```{.r .cell-code}\npryr::object_size(as.double(x))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n680 B\n```\n:::\n\n```{.r .cell-code}\npryr::object_size(as.character(x))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n1.32 kB\n```\n:::\n:::\n\n\nAn incorrect data type can easily cost you a lot of space in memory, especially\n\nat scale. This often happens when reading data from a text or csv file - data\n\nmay have a format such as `c(1.0, 2.0, 3.0)` and will be read in as a `numeric`\n\ncolumn, when `integer` is more appropriate and compact.\n\nYou may also be familiar with `factor` variables within R. Essentially a\n\n`factor` will represent your data as integers, and map them back to their\n\ncharacter representation. 
This can save memory when you have a compact and\n\nunique level of factors:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- sample(letters, 10000, replace = TRUE)\n\npryr::object_size(as.character(x))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n81.50 kB\n```\n:::\n\n```{.r .cell-code}\npryr::object_size(as.factor(x))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n42.10 kB\n```\n:::\n:::\n\n\nHowever if each element is unique, or if there is not a lot of overlap among\n\nelements, than the overhead will make a factor larger than its character\n\nrepresentation:\n\n\n::: {.cell}\n\n```{.r .cell-code}\npryr::object_size(as.factor(letters))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n2.22 kB\n```\n:::\n\n```{.r .cell-code}\npryr::object_size(as.character(letters))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n1.71 kB\n```\n:::\n:::\n\n\n## Cloud Computing\n\nSometimes, you will have data that are simply too large to ever fit on your\n\nlocal desktop machine. If that is the case, then the Elastic Cloud Computing\n\nEnvironment from the Office of Technology and Data Science can provide you with\n\neasy access to powerful analytic tools for computationally intensive project.\n\nThe Elastic Cloud Computing Environment allows researchers to quickly spin-up\n\nan Amazon Web Services (AWS) Elastic Cloud Compute (EC2) instance. These\n\ninstances offer increased memory to read in large datasets, along with\n\nadditional CPUs to provide the ability to process data in parallel at an\n\nimpressive scale.\n\n| Instance \\| CPU \\| Memory (GB) \\|\n\n\\|----------\\|-----\\|--------\\|\n\n| Desktop \\| 8 \\| 16 \\|\n\n| c5.4xlarge \\| 16 \\| 32 \\|\n\n| c5.9xlarge \\| 36 \\| 72 \\|\n\n| c5.18xlarge \\| 72 \\| 144 \\|\n\n| x1e.8xlarge \\| 32 \\| 976 \\|\n\n| x1e.16xlarge \\| 64 \\| 1952 \\|\n\nFeel free to contact Erika Tyagi (etyagi\\@urban.org) if this would be useful\n\nfor your project.\n\n------------------------------------------------------------------------\n\n# Common Pitfalls\n\n## For Loops and Vector Allocation\n\nA refrain you will often hear is that for loops in R are slow and need to be\n\navoided at all costs. This is not true! Rather, an improperly constructed loop\n\nin R can bring the execution of your program to a near standstill.\n\nA common for loop structure may look something like:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- 1:100\n\nout <- c()\n\nfor (i in x) {\n\n out <- c(out, sqrt(x))\n\n }\n```\n:::\n\n\nThe bottleneck in this loop is with the allocation of the vector `out`. Every\n\ntime we iterate over an item in `x` and append it to `out`, R makes a copy\n\nof all the items already in `out`. 
As the size of the loop grows, your code\n\nwill take longer and longer to run.\n\nA better practice is to pre-allocate `out` to be the correct length, and then\n\ninsert the results as the loop runs.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- 1:100\n\nout <- rep(NA, length(x))\n\nfor (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n}\n```\n:::\n\n\nA quick benchmark shows how much more efficient a loop with a pre-allocated\n\nresults vector is:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nbad_loop <- function(x) {\n\n out <- c()\n\n for (i in x) {\n\n out <- c(out, sqrt(x))\n\n }\n\n}\n\ngood_loop <- function(x) {\n\n out <- rep(NA, length(x))\n\n for (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n }\n\n}\n\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(x),\n\n good_loop(x)\n\n)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nUnit: microseconds\n expr min lq mean median uq max neval\n bad_loop(x) 896.465 967.5590 2027.18719 1054.7250 1132.6660 55959.588 100\n good_loop(x) 4.346 4.7355 21.39134 5.8425 7.9745 1437.009 100\n```\n:::\n:::\n\n\nAnd note how performance of the \"bad\" loop degrades as the loop size grows.\n\n\n::: {.cell}\n\n```{.r .cell-code}\ny <- 1:250\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(y),\n\n good_loop(y)\n\n)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nUnit: microseconds\n expr min lq mean median uq max\n bad_loop(y) 13175.473 17043.3310 18404.84383 17790.6995 18655.1230 65857.726\n good_loop(y) 9.717 10.2705 14.36558 11.3775 16.7485 35.711\n neval\n 100\n 100\n```\n:::\n:::\n\n\n## Vectorized Functions\n\nMany functions in R are vectorized, meaning they can accept an entire vector\n\n(and not just a single value) as input. The `sqrt` function from the\n\nprior examples is one:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- c(1, 4, 9, 16)\n\nsqrt(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1 2 3 4\n```\n:::\n:::\n\n\nThis removes the need to use `lapply` or a for loop. Vectorized functions in\n\nR are generally written in a compiled language like C, C++, or FORTRAN, which\n\nmakes their implementation faster.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n lapply(x, sqrt),\n\n sqrt(x)\n\n)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nUnit: nanoseconds\n expr min lq mean median uq max neval\n lapply(x, sqrt) 14801 15047 15325.39 15170 15334 21279 100\n sqrt(x) 205 246 341.53 287 369 1107 100\n```\n:::\n:::\n", + "markdown": "---\noutput: \n html_document:\n includes:\n in_header: analytics.html \t\n css: styles.css\n code_folding: show\n toc: TRUE\n toc_float: TRUE\n pandoc_args:\n \"--tab-stop=2\"\n---\n\n\n\n\n::: {#header}\n\n:::\n\n\n\n\n\n# Introduction\n\nThis guide outlines tools and tips for improving the speed and execution of R code.\n\nSometimes, simply tweaking a few lines of code can lead to large performance gains in the execution of a program. Other issues may take more time to work through but can be a huge benefit to a project in the long term.\n\nAn important lesson to learn when it comes to optimising an R (or any) program is knowing both if to start and when to stop. You most likely want to optimize your code because it is \"too slow\", but what that means will vary from project to project. Be sure to consider what \"fast enough\" is for your project and how much needs to be optimized. 
If your program takes an hour to complete, spending 5 hours trying to make it faster can be time well spent if the script will be run regularly, and a complete waste of time if it's an ad-hoc analysis.\n\nFor more information, see the CRAN Task View [High-Performance and Parallel Computing with R](https://CRAN.R-project.org/view=HighPerformanceComputing).\n\nThe \"Performant Code\" section of Hadley Wickham's [Advanced R](http://adv-r.had.co.nz/) is another great resource and provides a deeper dive into what is covered in this guide.\n\n------------------------------------------------------------------------\n\n# Update Your Installation\n\nOne of the easiest ways to improve the performance of R is to update R. In general, R will have a big annual release (e.g., 3.5.0) in the spring and around 3-4 smaller patch releases (e.g., 3.5.1) throughout the rest of the year. If the middle digit of your installation is behind the current release, you should consider updating.\n\nFor instance, R 3.5.0 implemented improved reading of text files. A 5GB file took over 5 minutes to read in 3.4.4:\n\n![](optimization/images/data-load-3-4.png){width=\"75%\"}\n\nWhile 3.5.0 took less than half the time:\n\n![](optimization/images/data-load-3-5.png){width=\"75%\"}\n\nTo see what the R-core development team is up to, check out the [NEWS](https://cran.r-project.org/doc/manuals/r-devel/NEWS.html) file from the R project.\n\n------------------------------------------------------------------------\n\n# Profiling & Benchmarking\n\nIn order to efficiently optimize your code, you'll first need to know where it's running slowest. The `profvis` package provides a nice way of visualizing the execution time and memory usage of your program.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(profvis)\nlibrary(dplyr)\n\nprofvis({\n\tdiamonds <- read.csv(\"optimization/data/diamonds.csv\")\n\n\tdiamonds_by_cut <- diamonds %>%\n\t\tgroup_by(cut) %>%\n\t\tsummarise_if(is.numeric, mean)\n\n\twrite.csv(diamonds_by_cut, file = \"optimization/data/diamonds_by_cut.csv\")\n\n})\n```\n\n::: {.cell-output-display}\n```{=html}\n
\n\n```\n:::\n:::\n\n\nIn this toy example it looks like the `read.csv` function is the bottleneck, so\n\nwork on optimizing that first.\n\nOnce you find the bottleneck that needs to be optimized, it can be useful to\n\nbenchmark different potential solutions. The `microbenchmark` package can help\n\nyou choose between different options. Continuing with the simple example with\n\nthe `diamonds` dataset, compare the base `read.csv` function with `read_csv`\n\nfrom the `readr` package.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(microbenchmark)\n\nmicrobenchmark(\n\n read.csv(\"optimization/data/diamonds.csv\"),\n\n readr::read_csv(\"optimization/data/diamonds.csv\")\n\n)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nUnit: milliseconds\n expr min lq\n read.csv(\"optimization/data/diamonds.csv\") 103.14624 111.61502\n readr::read_csv(\"optimization/data/diamonds.csv\") 55.57689 59.80873\n mean median uq max neval\n 135.10956 115.71928 127.93492 453.5855 100\n 75.97688 63.64992 71.88532 372.4557 100\n```\n:::\n:::\n\n\nIn this case, `read_csv` is about twice as fast as the base R implementations.\n\n# Parallel Computing\n\nOften, time-intensive R code can be sped up by breaking the execution of\n\nthe job across additional cores of your computer. This is called parallel computing.\n\n## Learn `lapply`/`purrr::map`\n\nLearning the `lapply` (and variants) function from Base R or the `map` (and variants) function from the `purrr` package is the first step in learning to run R code in parallel. Once you understand how `lapply` and `map` work, running your code in parallel will be simple.\n\nSay you have a vector of numbers and want to find the square root of each one\n\n(ignore for now that `sqrt` is vectorized, which will be covered later).\n\nYou could write a for loop and iterate over each element of the vector:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- c(1, 4, 9, 16)\n\nout <- vector(\"list\", length(x))\n\nfor (i in seq_along(x)) {\n\n out[[i]] <- sqrt(x[[i]])\n\n}\n\nunlist(out)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1 2 3 4\n```\n:::\n:::\n\n\nThe `lapply` function essentially handles the overhead of constructing a for\n\nloop for you. The syntax is:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlapply(X, FUN, ...)\n```\n:::\n\n\n`lapply` will then take each element of `X` and apply the `FUN`ction to it.\n\nOur simple example then becomes:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- c(1, 4, 9, 16)\n\nout <- lapply(x, sqrt)\n\nunlist(out)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1 2 3 4\n```\n:::\n:::\n\n\nThose working within the `tidyverse` may use `map` from the `purrr` package equivalently:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(purrr)\n\nx <- c(1, 4, 9, 16)\n\nout <- map(x, sqrt)\n\nunlist(out)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1 2 3 4\n```\n:::\n:::\n\n\n## Motivating Example\n\nOnce you are comfortable with `lapply` and/or `map`, running the same code in\n\nparallel takes just an additional line of code.\n\nFor `lapply` users, the `future.apply` package contains an equivalent\n\n`future_lapply` function. 
Just be sure to call `plan(multisession)` beforehand,\n\nwhich will handle the back-end orchestration needed to run in parallel.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# install.packages(\"future.apply\")\n\nlibrary(future.apply)\n\nplan(multisession)\n\nout <- future_lapply(x, sqrt)\n\nunlist(out)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1 2 3 4\n```\n:::\n:::\n\n\nFor `purrr` users, the `furrr` (i.e., future purrr) package includes an\n\nequivalent `future_map` function:\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# install.packages(\"furrr\")\n\nlibrary(furrr)\n\nplan(multisession)\n\ny <- future_map(x, sqrt)\n\nunlist(y)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1 2 3 4\n```\n:::\n:::\n\n\nHow much faster did this simple example run in parallel?\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(future.apply)\n\nplan(multisession)\n\nx <- c(1, 4, 9, 16)\n\nmicrobenchmark::microbenchmark(\n\n sequential = lapply(x, sqrt),\n\n parallel = future_lapply(x, sqrt),\n\n unit = \"s\"\n\n)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nUnit: seconds\n expr min lq mean median uq\n sequential 0.000001763 0.0000020705 0.0000029848 0.0000029315 0.000003772\n parallel 0.026585548 0.0282111980 0.0333569452 0.0291357070 0.030628517\n max neval\n 0.000009799 100\n 0.337903181 100\n```\n:::\n:::\n\n\nParallelization was actually slower. In this case, the overhead of\n\nsetting the code to run in parallel far outweighed any performance gain. In\n\ngeneral, parallelization works well on long-running, compute-intensive jobs.\n\n## A (somewhat) More Complex Example\n\nIn this example we'll use the `diamonds` dataset from `ggplot2` and perform\n\nk-means clustering. We'll use `lapply` to iterate over the number of clusters from 2 to\n\n5:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndf <- ggplot2::diamonds\n\ndf <- dplyr::select(df, -c(cut, color, clarity))\n\ncenters <- 2:5\n\nsystem.time(\n\n lapply(centers,\n\n function(x) kmeans(df, centers = x, nstart = 500)\n\n )\n\n )\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 35.229 2.688 42.309 \n```\n:::\n:::\n\n\nAnd now running the same code in parallel:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(future.apply)\n\nplan(multisession)\n\nsystem.time(\n\n future_lapply(centers,\n\n function(x) kmeans(df, centers = x, nstart = 500)\n\n )\n\n )\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 0.876 0.210 21.655 \n```\n:::\n:::\n\n\nWhile we didn't achieve perfect scaling, we still get a nice bump in execution\n\ntime.\n\n## Additional Packages\n\nFor the sake of ease and brevity, this guide focused on the `future` framework\n\nfor parallelization. However, you should be aware that there are a number of\n\nother ways to parallelize your code.\n\n### The `parallel` Package\n\nThe `parallel` package is included in your base R installation. It includes\n\nanalogues of the various `apply` functions:\n\n- `parLapply`\n\n- `mclapply` - not available on Windows\n\nThese functions generally require more setup, especially on Windows machines.\n\n### The `doParallel` Package\n\nThe `doParallel` package builds off of `parallel` and is\n\nuseful for code that uses for loops instead of `lapply`. Like the parallel\n\npackage, it generally requires more setup, especially on Windows machines.
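\n\nA minimal sketch of that pattern, assuming the `foreach` and `doParallel` packages are installed (the cluster size here is arbitrary):\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# install.packages(c(\"foreach\", \"doParallel\"))\n\nlibrary(foreach)\n\nlibrary(doParallel)\n\ncl <- makeCluster(2)\n\nregisterDoParallel(cl)\n\n# each iteration of this for-loop-style construct runs on a worker,\n# and .combine = c collects the results into a single vector\nout <- foreach(i = 1:4, .combine = c) %dopar% {\n sqrt(i)\n}\n\nstopCluster(cl)\n\nout\n```\n:::\n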
\n\n### Machine Learning - `caret`\n\nFor those running machine learning models, the `caret` package can easily\n\nleverage `doParallel` to speed up the execution of multiple models. Lifting\n\nthe example from the package documentation:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(caret)\n\nlibrary(doParallel)\n\ncl <- makePSOCKcluster(5) # number of cores to use\n\nregisterDoParallel(cl)\n\n## All subsequent models are then run in parallel\n\nmodel <- train(y ~ ., data = training, method = \"rf\")\n\n## When you are done:\n\nstopCluster(cl)\n```\n:::\n\n\nBe sure to check out the full\n\n[documentation](http://topepo.github.io/caret/parallel-processing.html)\n\nfor more detail.\n\n------------------------------------------------------------------------\n\n# Big Data\n\nAs data collection and storage become easier and cheaper, it is easy\n\nto obtain relatively large data files. An important point to keep in\n\nmind is that the size of your data will generally expand when it is read\n\nfrom a storage device into R. A general rule of thumb is that a file will take\n\nsomewhere around 3-4 times more space in memory than it does on disk.\n\nFor instance, compare the size of the `iris` data set when it is saved as a\n\n.csv file locally vs the size of the object when it is read into an R session:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfile.size(\"optimization/data/iris.csv\") / 1000\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 3.716\n```\n:::\n\n```{.r .cell-code}\ndf <- readr::read_csv(\"optimization/data/iris.csv\")\n\npryr::object_size(df)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n10.14 kB\n```\n:::\n:::\n\n\nThis means that on a standard Urban Institute desktop, you may have issues\n\nreading in files that are larger than 4 GB.\n\n## Object Size\n\nThe type of your data can have a big impact on the size of your data frame\n\nwhen you are dealing with larger files. There are four main types of atomic\n\nvectors in R:\n\n1. `logical`\n\n2. `integer`\n\n3. `double` (also called `numeric`)\n\n4. `character`\n\n## Each of these data types occupies a different amount of space in memory\n\n`logical` and `integer` vectors use 4 bytes per element, while a `double` will\n\noccupy 8 bytes. R uses a global string pool, so `character` vectors are hard\n\nto estimate, but will generally take up more space per element.\n\nConsider the following example:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- 1:100\n\npryr::object_size(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n680 B\n```\n:::\n\n```{.r .cell-code}\npryr::object_size(as.double(x))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n680 B\n```\n:::\n\n```{.r .cell-code}\npryr::object_size(as.character(x))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n1.32 kB\n```\n:::\n:::\n\n\nAn incorrect data type can easily cost you a lot of space in memory, especially\n\nat scale. This often happens when reading data from a text or csv file - data\n\nmay have a format such as `c(1.0, 2.0, 3.0)` and will be read in as a `numeric`\n\ncolumn, when `integer` is more appropriate and compact.
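\n\nOne way to guard against this is to declare column types up front instead of relying on type guessing. A minimal sketch with `readr`, reusing the diamonds file from above (`price` is recorded in whole dollars, so an integer is enough):\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(readr)\n\n# read price as integer; let readr guess the remaining columns\ndiamonds <- read_csv(\n \"optimization/data/diamonds.csv\",\n col_types = cols(price = col_integer(), .default = col_guess())\n)\n```\n:::\n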
\n\nYou may also be familiar with `factor` variables within R. Essentially a\n\n`factor` will represent your data as integers, and map them back to their\n\ncharacter representation. This can save memory when a small number of\n\nunique levels is repeated across many elements:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- sample(letters, 10000, replace = TRUE)\n\npryr::object_size(as.character(x))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n81.50 kB\n```\n:::\n\n```{.r .cell-code}\npryr::object_size(as.factor(x))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n42.10 kB\n```\n:::\n:::\n\n\nHowever, if each element is unique, or if there is not a lot of overlap among\n\nelements, then the overhead will make a factor larger than its character\n\nrepresentation:\n\n\n::: {.cell}\n\n```{.r .cell-code}\npryr::object_size(as.factor(letters))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n2.22 kB\n```\n:::\n\n```{.r .cell-code}\npryr::object_size(as.character(letters))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n1.71 kB\n```\n:::\n:::\n\n\n## Cloud Computing\n\nSometimes, you will have data that are simply too large to ever fit on your\n\nlocal desktop machine. If that is the case, then the Elastic Cloud Computing\n\nEnvironment from the Office of Technology and Data Science can provide you with\n\neasy access to powerful analytic tools for computationally intensive projects.\n\nThe Elastic Cloud Computing Environment allows researchers to quickly spin up\n\nan Amazon Web Services (AWS) Elastic Compute Cloud (EC2) instance. These\n\ninstances offer increased memory to read in large datasets, along with\n\nadditional CPUs to provide the ability to process data in parallel at an\n\nimpressive scale.\n\n| Instance | CPU | Memory (GB) |\n|--------------|-----|-------------|\n| Desktop | 8 | 16 |\n| c5.4xlarge | 16 | 32 |\n| c5.9xlarge | 36 | 72 |\n| c5.18xlarge | 72 | 144 |\n| x1e.8xlarge | 32 | 976 |\n| x1e.16xlarge | 64 | 1952 |\n\nFeel free to contact Erika Tyagi (etyagi\\@urban.org) if this would be useful\n\nfor your project.\n\n------------------------------------------------------------------------\n\n# Common Pitfalls\n\n## For Loops and Vector Allocation\n\nA refrain you will often hear is that for loops in R are slow and need to be\n\navoided at all costs. This is not true! Rather, an improperly constructed loop\n\nin R can bring the execution of your program to a near standstill.\n\nA common for loop structure may look something like:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- 1:100\n\nout <- c()\n\nfor (i in x) {\n\n out <- c(out, sqrt(i))\n\n }\n```\n:::\n\n\nThe bottleneck in this loop is the allocation of the vector `out`. Every\n\ntime we iterate over an item in `x` and append it to `out`, R makes a copy\n\nof all the items already in `out`. 
As the size of the loop grows, your code\n\nwill take longer and longer to run.\n\nA better practice is to pre-allocate `out` to be the correct length, and then\n\ninsert the results as the loop runs.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- 1:100\n\nout <- rep(NA, length(x))\n\nfor (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n}\n```\n:::\n\n\nA quick benchmark shows how much more efficient a loop with a pre-allocated\n\nresults vector is:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nbad_loop <- function(x) {\n\n out <- c()\n\n for (i in x) {\n\n out <- c(out, sqrt(x))\n\n }\n\n}\n\ngood_loop <- function(x) {\n\n out <- rep(NA, length(x))\n\n for (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n }\n\n}\n\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(x),\n\n good_loop(x)\n\n)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nUnit: microseconds\n expr min lq mean median uq max neval\n bad_loop(x) 1042.179 1267.577 1891.78264 1328.5640 1446.9720 10173.125 100\n good_loop(x) 6.191 6.437 32.23338 6.7035 11.2545 2366.725 100\n```\n:::\n:::\n\n\nAnd note how performance of the \"bad\" loop degrades as the loop size grows.\n\n\n::: {.cell}\n\n```{.r .cell-code}\ny <- 1:250\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(y),\n\n good_loop(y)\n\n)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nUnit: microseconds\n expr min lq mean median uq max\n bad_loop(y) 19249.582 22663.9595 24695.7231 23492.774 24909.407 81335.882\n good_loop(y) 14.022 14.5345 21.2626 23.329 26.486 64.616\n neval\n 100\n 100\n```\n:::\n:::\n\n\n## Vectorized Functions\n\nMany functions in R are vectorized, meaning they can accept an entire vector\n\n(and not just a single value) as input. The `sqrt` function from the\n\nprior examples is one:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- c(1, 4, 9, 16)\n\nsqrt(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1 2 3 4\n```\n:::\n:::\n\n\nThis removes the need to use `lapply` or a for loop. 
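\n\nMany other loop patterns have a vectorized equivalent as well. As a quick sketch (base R only), a running total written as a loop versus the vectorized `cumsum`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- c(2, 4, 6, 8)\n\n# running total with a pre-allocated loop\ntotal <- numeric(length(x))\ntotal[1] <- x[1]\nfor (i in 2:length(x)) {\n total[i] <- total[i - 1] + x[i]\n}\n\n# the same result from a single vectorized call\nall.equal(total, cumsum(x))\n```\n:::\n\n\n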
Vectorized functions in\n\nR are generally written in a compiled language like C, C++, or FORTRAN, which\n\nmakes their implementation faster.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n lapply(x, sqrt),\n\n sqrt(x)\n\n)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nUnit: nanoseconds\n expr min lq mean median uq max neval\n lapply(x, sqrt) 20172 20418 20847.27 20541 20725.5 37228 100\n sqrt(x) 287 328 397.70 369 369.0 2296 100\n```\n:::\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/.quarto/crossref/index.qmd/index.html.json b/.quarto/crossref/index.qmd/index.html.json index 2f559c9..16f4127 100644 --- a/.quarto/crossref/index.qmd/index.html.json +++ b/.quarto/crossref/index.qmd/index.html.json @@ -1 +1 @@ -{"entries":[],"headings":["r-users-group","sign-up-for-list-serv","section","section-1","contact-info","r-lunch-labs"]} \ No newline at end of file +{"headings":["r-users-group","sign-up-for-list-serv","section","section-1","contact-info","r-lunch-labs"],"entries":[]} \ No newline at end of file diff --git a/.quarto/crossref/optimization.qmd/optimization.html.json b/.quarto/crossref/optimization.qmd/optimization.html.json index 04047c3..d1fddea 100644 --- a/.quarto/crossref/optimization.qmd/optimization.html.json +++ b/.quarto/crossref/optimization.qmd/optimization.html.json @@ -1 +1 @@ -{"headings":["introduction","update-your-installation","profiling-benchmarking","parallel-computing","learn-lapplypurrrmap","motivating-example","a-somewhat-more-complex-example","additional-packages","the-parallel-package","the-doparallel-package","machine-learning---caret","big-data","object-size","each-of-these-data-types-occupies-a-different-amount-of-space-in-memory","cloud-computing","common-pitfalls","for-loops-and-vector-allocation","vectorized-functions"],"entries":[]} \ No newline at end of file +{"entries":[],"headings":["introduction","update-your-installation","profiling-benchmarking","parallel-computing","learn-lapplypurrrmap","motivating-example","a-somewhat-more-complex-example","additional-packages","the-parallel-package","the-doparallel-package","machine-learning---caret","big-data","object-size","each-of-these-data-types-occupies-a-different-amount-of-space-in-memory","cloud-computing","common-pitfalls","for-loops-and-vector-allocation","vectorized-functions"]} \ No newline at end of file diff --git a/.quarto/idx/getting-data.qmd.json b/.quarto/idx/getting-data.qmd.json index 3c30b92..9f536a9 100644 --- a/.quarto/idx/getting-data.qmd.json +++ b/.quarto/idx/getting-data.qmd.json @@ -1 +1 @@ -{"title":"source(here::here(\"getting-data\", \"census_api_key.R\"))","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}}},"headingText":"source(here::here(\"getting-data\", \"census_api_key.R\"))","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n```{r echo = FALSE}\n```\n\n```{r markdown-setup, include=FALSE}\n\nknitr::opts_chunk$set(fig.path = \"intro-to-r/www/images/\")\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(warning = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\n\n\noptions(scipen = 999)\n```\n\n# Introduction\n\nThis guide outlines some useful workflows for pulling data sets commonly used by the Urban Institute.\n\n## `library(tidycensus)`\n\n`library(tidycensus)` by Kyle Walker ([complete intro 
here](https://walkerke.github.io/tidycensus/)) is the best tool for accessing some Census data sets in R from the Census Bureau API. The package returns tidy data frames and can easily pull shapefiles by adding `geometry = TRUE`.\n\nYou will need to [apply for a Census API key](https://api.census.gov/data/key_signup.html) and [add it to your R session](https://walkerke.github.io/tidycensus/articles/basic-usage.html). Don't add your API key to your script and don't add it to a GitHub repository!\n\nHere is a simple example for one state with shapefiles:\n\n```{r tidycensus}\nlibrary(tidyverse)\nlibrary(purrr)\nlibrary(tidycensus)\n\n# pull median household income and shapefiles for Census tracts in Alabama\nget_acs(geography = \"tract\", \n\t\t\t\tvariables = \"B19013_001\", \n\t\t\t\tstate = \"01\",\n\t\t\t\tyear = 2015,\n\t\t\t\tgeometry = TRUE,\n\t\t\t\tprogress = FALSE)\n```\n\nSmaller geographies like Census tracts can only be pulled state-by-state. This example demonstrates how to iterate across FIPS codes to pull Census tracts for multiple states. The process is as follows:\n\n1. Pick the variables of interest\n2. Create a vector of state FIPS codes for the states of interest\n3. Create a custom function that works on a single state FIPS code\n4. Iterate the function along the vector of state FIPS codes with `map_df()` from `library(purrr)`\n\nHere is an example that pulls median household income at the Census tract level for multiple states:\n\n```{r tidycensus-iteration}\n# variables of interest\nvars <- c(\n \"B19013_001\" # median household income estimate\n)\n\n# states of interest: alabama, alaska, arizona\nstate_fips <- c(\"01\", \"02\", \"04\")\n\t\n# create a custom function that works for one state\nget_income <- function(state_fips) {\n\t\n\tincome_data <- get_acs(geography = \"tract\", \n\t\t\t\t\t\t\t\t\t\t\t\t variables = vars, \n\t\t\t\t\t\t\t\t\t\t\t\t state = state_fips,\n\t\t\t\t\t\t\t\t\t\t\t\t year = 2015)\n\t\n\treturn(income_data)\n\t\n}\n\n# iterate the function\nmap_df(.x = state_fips, # iterate along the vector of state fips codes\n\t\t\t .f = get_income) # apply get_income() to each fips_code \n```\n\n`library(tidycensus)` works well with `library(tidyverse)` and enables access to geospatial data, but it is limited to only some Census Bureau data sets. The next package has less functionality but allows for accessing any data available on the Census API.\n\n
\n\n## `library(censusapi)`\n\n`library(censusapi)` by Hannah Recht ([complete intro here](https://cran.r-project.org/web/packages/censusapi/vignettes/getting-started.html)) can access any published table that is accessible through the Census Bureau API. A full listing is available [here](https://api.census.gov/data.html).\n\nYou will need to [apply for a Census API key](https://api.census.gov/data/key_signup.html) and [add it to your R session](https://cran.r-project.org/web/packages/censusapi/vignettes/getting-started.html). Don't add your API key to your script and don't add it to a GitHub repository!\n\nHere is a simple example that pulls median household income and its margin of error for Census tracts in Alabama:\n\n```{r censusapi}\nlibrary(tidyverse)\nlibrary(purrr)\nlibrary(censusapi)\nvars <- c(\n \"B19013_001E\", # median household income estimate\n \"B19013_001M\" # median household income margin of error\n)\n\ngetCensus(name = \"acs/acs5\",\n\t\t\t\t\tkey = Sys.getenv(\"CENSUS_API_KEY\"),\n\t\t\t\t\tvars = vars, \n\t\t\t\t\tregion = \"tract:*\",\n\t\t\t\t\tregionin = \"state:01\",\n\t\t\t\t\tvintage = 2015) %>%\n\tas_tibble()\n```\n\nSmaller geographies like Census tracts can only be pulled state-by-state. This example demonstrates how to iterate across FIPS codes to pull Census tracts for multiple states. The process is as follows:\n\n1. Pick the variables of interest\n2. Create a vector of state FIPS codes for the states of interest\n3. Create a custom function that works on a single state FIPS code\n4. Iterate the function along the vector of state FIPS codes with `map_df()` from `library(purrr)`\n\nHere is an example that pulls median household income at the Census tract level for multiple states:\n\n```{r censusapi-iteration}\n# variables of interest\nvars <- c(\n \"B19013_001E\", # median household income estimate\n \"B19013_001M\" # median household income margin of error\n)\n\n# states of interest: alabama, alaska, arizona\nstate_fips <- c(\"01\", \"02\", \"04\")\n\t\n# create a custom function that works for one state\nget_income <- function(state_fips) {\n\t\n\tincome_data <- getCensus(name = \"acs/acs5\", \n\t\t\t\t\t\t\t\t\t\t\t\t\t key = Sys.getenv(\"CENSUS_API_KEY\"),\n\t\t\t\t\t\t\t\t\t\t\t\t\t vars = vars, \n\t\t\t\t\t\t\t\t\t\t\t\t\t region = \"tract:*\",\n\t\t\t\t\t\t\t\t\t\t\t\t\t regionin = paste0(\"state:\", state_fips),\n\t\t\t\t\t\t\t\t\t\t\t\t\t vintage = 2015)\n\t\n\treturn(income_data)\n\t\n}\n\n# iterate the function\nmap_df(.x = state_fips, # iterate along the vector of state fips codes\n\t\t\t .f = get_income) %>% # apply get_income() to each fips_code \n\tas_tibble() 
\n```\n"},"formats":{"html":{"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[]},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"getting-data.html"},"language":{},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.2.269"},"extensions":{"book":{"multiFile":true}}}}} \ No newline at end of file +{"title":"source(here::here(\"getting-data\", \"census_api_key.R\"))","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}}},"headingText":"source(here::here(\"getting-data\", \"census_api_key.R\"))","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n```{r echo = FALSE}\n```\n\n```{r markdown-setup, include=FALSE}\n\nknitr::opts_chunk$set(fig.path = \"intro-to-r/www/images/\")\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(warning = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\n\n\noptions(scipen = 999)\n```\n\n# Introduction\n\nThis guide outlines some useful workflows for pulling data sets commonly used by the Urban Institute.\n\n## `library(tidycensus)`\n\n`library(tidycensus)` by Kyle Walker ([complete intro here](https://walkerke.github.io/tidycensus/)) is the best tool for accessing some Census data sets in R from the Census Bureau API. The package returns tidy data frames and can easily pull shapefiles by adding `geometry = TRUE`.\n\nYou will need to [apply for a Census API key](https://api.census.gov/data/key_signup.html) and [add it to your R session](https://walkerke.github.io/tidycensus/articles/basic-usage.html). Don't add your API key to your script and don't add it to a GitHub repository!\n\nHere is a simple example for one state with shapefiles:\n\n```{r tidycensus}\nlibrary(tidyverse)\nlibrary(purrr)\nlibrary(tidycensus)\n\n# pull median household income and shapefiles for Census tracts in Alabama\nget_acs(geography = \"tract\", \n\t\t\t\tvariables = \"B19013_001\", \n\t\t\t\tstate = \"01\",\n\t\t\t\tyear = 2015,\n\t\t\t\tgeometry = TRUE,\n\t\t\t\tprogress = FALSE)\n```\n\nSmaller geographies like Census tracts can only be pulled state-by-state. This example demonstrates how to iterate across FIPS codes to pull Census tracts for multiple states. The process is as follows:\n\n1. 
Pick the variables of interest\n2. Create a vector of state FIPS codes for the states of interest\n3. Create a custom function that works on a single state FIPS code\n4. Iterate the function along the vector of state FIPS codes with `map_df()` from `library(purrr)`\n\nHere is an example that pulls median household income at the Census tract level for multiple states:\n\n```{r tidycensus-iteration}\n# variables of interest\nvars <- c(\n \"B19013_001\" # median household income estimate\n)\n\n# states of interest: alabama, alaska, arizona\nstate_fips <- c(\"01\", \"02\", \"04\")\n\t\n# create a custom function that works for one state\nget_income <- function(state_fips) {\n\t\n\tincome_data <- get_acs(geography = \"tract\", \n\t\t\t\t\t\t\t\t\t\t\t\t variables = vars, \n\t\t\t\t\t\t\t\t\t\t\t\t state = state_fips,\n\t\t\t\t\t\t\t\t\t\t\t\t year = 2015)\n\t\n\treturn(income_data)\n\t\n}\n\n# iterate the function\nmap_df(.x = state_fips, # iterate along the vector of state fips codes\n\t\t\t .f = get_income) # apply get_income() to each fips_code \n```\n\n`library(tidycensus)` works well with `library(tidyverse)` and enables access to geospatial data, but it is limited to only some Census Bureau data sets. The next package has less functionality but allows for accessing any data available on the Census API.\n\n
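Because `get_acs()` returns an `sf` object when `geometry = TRUE`, the result can be mapped directly with `ggplot2`. A minimal sketch, reusing the Alabama pull from above (the `estimate` column holds the ACS values):\n\n```{r tidycensus-map}\nlibrary(ggplot2)\n\nalabama <- get_acs(geography = \"tract\", \n\t\t\t\tvariables = \"B19013_001\", \n\t\t\t\tstate = \"01\",\n\t\t\t\tyear = 2015,\n\t\t\t\tgeometry = TRUE,\n\t\t\t\tprogress = FALSE)\n\n# map median household income by tract\nggplot(alabama) +\n\tgeom_sf(aes(fill = estimate), color = NA) +\n\tscale_fill_viridis_c(labels = scales::dollar)\n```\n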
\n\n## `library(censusapi)`\n\n`library(censusapi)` by Hannah Recht ([complete intro here](https://cran.r-project.org/web/packages/censusapi/vignettes/getting-started.html)) can access any published table that is accessible through the Census Bureau API. A full listing is available [here](https://api.census.gov/data.html).\n\nYou will need to [apply for a Census API key](https://api.census.gov/data/key_signup.html) and [add it to your R session](https://cran.r-project.org/web/packages/censusapi/vignettes/getting-started.html). Don't add your API key to your script and don't add it to a GitHub repository!\n\nHere is a simple example that pulls median household income and its margin of error for Census tracts in Alabama:\n\n```{r censusapi}\nlibrary(tidyverse)\nlibrary(purrr)\nlibrary(censusapi)\nvars <- c(\n \"B19013_001E\", # median household income estimate\n \"B19013_001M\" # median household income margin of error\n)\n\ngetCensus(name = \"acs/acs5\",\n\t\t\t\t\tkey = Sys.getenv(\"CENSUS_API_KEY\"),\n\t\t\t\t\tvars = vars, \n\t\t\t\t\tregion = \"tract:*\",\n\t\t\t\t\tregionin = \"state:01\",\n\t\t\t\t\tvintage = 2015) %>%\n\tas_tibble()\n```\n\nSmaller geographies like Census tracts can only be pulled state-by-state. This example demonstrates how to iterate across FIPS codes to pull Census tracts for multiple states. The process is as follows:\n\n1. Pick the variables of interest\n2. Create a vector of state FIPS codes for the states of interest\n3. Create a custom function that works on a single state FIPS code\n4. Iterate the function along the vector of state FIPS codes with `map_df()` from `library(purrr)`\n\nHere is an example that pulls median household income at the Census tract level for multiple states:\n\n```{r censusapi-iteration}\n# variables of interest\nvars <- c(\n \"B19013_001E\", # median household income estimate\n \"B19013_001M\" # median household income margin of error\n)\n\n# states of interest: alabama, alaska, arizona\nstate_fips <- c(\"01\", \"02\", \"04\")\n\t\n# create a custom function that works for one state\nget_income <- function(state_fips) {\n\t\n\tincome_data <- getCensus(name = \"acs/acs5\", \n\t\t\t\t\t\t\t\t\t\t\t\t\t key = Sys.getenv(\"CENSUS_API_KEY\"),\n\t\t\t\t\t\t\t\t\t\t\t\t\t vars = vars, \n\t\t\t\t\t\t\t\t\t\t\t\t\t region = \"tract:*\",\n\t\t\t\t\t\t\t\t\t\t\t\t\t regionin = paste0(\"state:\", state_fips),\n\t\t\t\t\t\t\t\t\t\t\t\t\t vintage = 2015)\n\t\n\treturn(income_data)\n\t\n}\n\n# iterate the function\nmap_df(.x = state_fips, # iterate along the vector of state fips codes\n\t\t\t .f = get_income) %>% # apply get_income() to each fips_code \n\tas_tibble() \n```\n","srcMarkdownNoYaml":"\n\n\n\n::: {#header}\n\n:::\n\n```{r echo = FALSE}\n# source(here::here(\"getting-data\", \"census_api_key.R\"))\n```\n\n```{r markdown-setup, include=FALSE}\n\nknitr::opts_chunk$set(fig.path = \"intro-to-r/www/images/\")\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(warning = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\n\n\noptions(scipen = 999)\n```\n\n# Introduction\n\nThis guide outlines some useful workflows for pulling data sets commonly used by the Urban Institute.\n\n## `library(tidycensus)`\n\n`library(tidycensus)` by Kyle Walker ([complete intro here](https://walkerke.github.io/tidycensus/)) is the best tool for accessing some Census data sets in R from the Census Bureau API. 
The package returns tidy data frames and can easily pull shapefiles by adding `geometry = TRUE`.\n\nYou will need to [apply for a Census API key](https://api.census.gov/data/key_signup.html) and [add it to your R session](https://walkerke.github.io/tidycensus/articles/basic-usage.html). Don't add your API key to your script and don't add it to a GitHub repository!\n\nHere is a simple example for one state with shapefiles:\n\n```{r tidycensus}\nlibrary(tidyverse)\nlibrary(purrr)\nlibrary(tidycensus)\n\n# pull median household income and shapefiles for Census tracts in Alabama\nget_acs(geography = \"tract\", \n\t\t\t\tvariables = \"B19013_001\", \n\t\t\t\tstate = \"01\",\n\t\t\t\tyear = 2015,\n\t\t\t\tgeometry = TRUE,\n\t\t\t\tprogress = FALSE)\n```\n\nSmaller geographies like Census tracts can only be pulled state-by-state. This example demonstrates how to iterate across FIPS codes to pull Census tracts for multiple states. The process is as follows:\n\n1. Pick the variables of interest\n2. Create a vector of state FIPS codes for the states of interest\n3. Create a custom function that works on a single state FIPS code\n4. Iterate the function along the vector of state FIPS codes with `map_df()` from `library(purrr)`\n\nHere is an example that pulls median household income at the Census tract level for multiple states:\n\n```{r tidycensus-iteration}\n# variables of interest\nvars <- c(\n \"B19013_001\" # median household income estimate\n)\n\n# states of interest: alabama, alaska, arizona\nstate_fips <- c(\"01\", \"02\", \"04\")\n\t\n# create a custom function that works for one state\nget_income <- function(state_fips) {\n\t\n\tincome_data <- get_acs(geography = \"tract\", \n\t\t\t\t\t\t\t\t\t\t\t\t variables = vars, \n\t\t\t\t\t\t\t\t\t\t\t\t state = state_fips,\n\t\t\t\t\t\t\t\t\t\t\t\t year = 2015)\n\t\n\treturn(income_data)\n\t\n}\n\n# iterate the function\nmap_df(.x = state_fips, # iterate along the vector of state fips codes\n\t\t\t .f = get_income) # apply get_income() to each fips_code \n```\n\n`library(tidycensus)` works well with `library(tidyverse)` and enables access to geospatial data, but it is limited to only some Census Bureau data sets. The next package has less functionality but allows for accessing any data available on the Census API.\n\n
\n\n## `library(censusapi)`\n\n`library(censusapi)` by Hannah Recht ([complete intro here](https://cran.r-project.org/web/packages/censusapi/vignettes/getting-started.html)) can access any published table that is accessible through the Census Bureau API. A full listing is available [here](https://api.census.gov/data.html).\n\nYou will need to [apply for a Census API key](https://api.census.gov/data/key_signup.html) and [add it to your R session](https://cran.r-project.org/web/packages/censusapi/vignettes/getting-started.html). Don't add your API key to your script and don't add it to a GitHub repository!\n\nHere is a simple example that pulls median household income and its margin of error for Census tracts in Alabama:\n\n```{r censusapi}\nlibrary(tidyverse)\nlibrary(purrr)\nlibrary(censusapi)\nvars <- c(\n \"B19013_001E\", # median household income estimate\n \"B19013_001M\" # median household income margin of error\n)\n\ngetCensus(name = \"acs/acs5\",\n\t\t\t\t\tkey = Sys.getenv(\"CENSUS_API_KEY\"),\n\t\t\t\t\tvars = vars, \n\t\t\t\t\tregion = \"tract:*\",\n\t\t\t\t\tregionin = \"state:01\",\n\t\t\t\t\tvintage = 2015) %>%\n\tas_tibble()\n```\n\nSmaller geographies like Census tracts can only be pulled state-by-state. This example demonstrates how to iterate across FIPS codes to pull Census tracts for multiple states. The process is as follows:\n\n1. Pick the variables of interest\n2. Create a vector of state FIPS codes for the states of interest\n3. Create a custom function that works on a single state FIPS code\n4. Iterate the function along the vector of state FIPS codes with `map_df()` from `library(purrr)`\n\nHere is an example that pulls median household income at the Census tract level for multiple states:\n\n```{r censusapi-iteration}\n# variables of interest\nvars <- c(\n \"B19013_001E\", # median household income estimate\n \"B19013_001M\" # median household income margin of error\n)\n\n# states of interest: alabama, alaska, arizona\nstate_fips <- c(\"01\", \"02\", \"04\")\n\t\n# create a custom function that works for one state\nget_income <- function(state_fips) {\n\t\n\tincome_data <- getCensus(name = \"acs/acs5\", \n\t\t\t\t\t\t\t\t\t\t\t\t\t key = Sys.getenv(\"CENSUS_API_KEY\"),\n\t\t\t\t\t\t\t\t\t\t\t\t\t vars = vars, \n\t\t\t\t\t\t\t\t\t\t\t\t\t region = \"tract:*\",\n\t\t\t\t\t\t\t\t\t\t\t\t\t regionin = paste0(\"state:\", state_fips),\n\t\t\t\t\t\t\t\t\t\t\t\t\t vintage = 2015)\n\t\n\treturn(income_data)\n\t\n}\n\n# iterate the function\nmap_df(.x = state_fips, # iterate along the vector of state fips codes\n\t\t\t .f = get_income) %>% # apply get_income() to each fips_code \n\tas_tibble() 
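\n# the output of map_df() is a single tibble of tract-level income for all three states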
\n```\n"}} \ No newline at end of file diff --git a/.quarto/idx/graphics-guide.qmd.json b/.quarto/idx/graphics-guide.qmd.json index 91adc8d..336dd69 100644 --- a/.quarto/idx/graphics-guide.qmd.json +++ b/.quarto/idx/graphics-guide.qmd.json @@ -1 +1 @@ -{"title":"Urban Institute R Graphics Guide","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"editor_options":{"chunk_output_type":"console"}},"headingText":"Urban Institute R Graphics Guide","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n\n```{r setup, include=FALSE}\nlibrary(knitr)\nlibrary(datasets)\nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nopts_chunk$set(fig.path = \"graphics-guide/www/images/\")\nopts_chunk$set(echo = TRUE)\nopts_chunk$set(warning = FALSE)\nopts_chunk$set(message = FALSE)\nopts_chunk$set(fig.width = 6.5)\nopts_chunk$set(fig.height = 4)\nopts_chunk$set(fig.retina = 3)\noptions(scipen = 999)\n```\n\nR is a powerful, open-source programming language and environment. R excels at data management and munging, traditional statistical analysis, machine learning, and reproducible research, but it is probably best known for its graphics. This guide contains examples and instructions for popular and lesser-known plotting techniques in R. It also includes instructions for using `urbnthemes`, the Urban Institute's R package for creating near-publication-ready plots with `ggplot2`. 
If you have any questions, please don't hesitate to contact Aaron Williams (awilliams\\@urban.org) or Kyle Ueyama (kueyama\\@urban.org).\n\n### Background\n\n`library(urbnthemes)` makes `ggplot2` output align more closely with [the Urban Institute's Data Visualization style guide](http://urbaninstitute.github.io/graphics-styleguide/). This package does **not produce publication-ready graphics**. Visual styles must still be edited using your project/paper's normal editing workflow.\n\nExporting charts as a PDF will allow them to be more easily edited. See the Saving Plots section for more information.\n\nThe theme has been tested against `ggplot2` version 3.0.0. It will not function properly with older versions of `ggplot2`.\n\n### Using library(urbnthemes)\n\nRun the following code to install or update `urbnthemes`:\n\n install.packages(\"remotes\")\n remotes::install_github(\"UrbanInstitute/urbnthemes\")\n\nRun the following code at the top of each script:\n\n library(tidyverse)\n library(urbnthemes)\n\n set_urbn_defaults(style = \"print\")\n\n### Installing Lato {#installing_lato}\n\nYour Urban computer may not have the Lato font installed. If it is not installed, please install the free [Lato font from Google](https://www.google.com/fonts/specimen/Lato). Below are step-by-step instructions:\n\n1) Download the [Lato font](https://www.google.com/fonts/specimen/Lato) (as a zip file).\n2) Unzip the file on your computer.\n3) For each `.ttf` file in the unzipped `Lato/` folder, double-click the file and click `Install` (on Windows) or `Install Font` (on Mac).\n4) Import and register Lato into R by running `urbnthemes::lato_import()` in the console once. Be patient as this may take a few minutes!\n5) To confirm installation, run `urbnthemes::lato_test()`. If this is successful, you're done and Lato will automatically be used when creating plots with `library(urbnthemes)`. You only need to install Lato once per computer.\n\nWaffle charts with glyphs require fontawesome. `fontawesome_test()` and `fontawesome_install()` are the fontawesome versions of the above functions. Be sure to install fontawesome from [here](https://github.com/hrbrmstr/waffle/tree/master/inst/fonts) first.\n\n### Grammar of Graphics and Conventions\n\nHadley Wickham's ggplot2 is based on Leland Wilkinson's [*The Grammar of Graphics*](https://www.amazon.com/Grammar-Graphics-Statistics-Computing/dp/0387245448) and Wickham's [*A Layered Grammar of Graphics*](http://vita.had.co.nz/papers/layered-grammar.html). The layered grammar of graphics is a structured way of thinking about the components of a plot, which then lend themselves to the simple structure of ggplot2.\n\n- **Data** are what are visualized in a plot and **mappings** are directions for how data are mapped in a plot in a way that can be perceived by humans.\\\n- **Geoms** are representations of the actual data like points, lines, and bars.\n- **Stats** are statistical transformations that represent summaries of the data like histograms.\n- **Scales** map values in the data space to values in the aesthetic space. 
Scales draw legends and axes.\n- **Coordinate Systems** describe how geoms are mapped to the plane of the graphic.\\\n- **Facets** break the data into meaningful subsets like small multiples.\n- **Themes** control the finer points of a plot such as fonts, font sizes, and background colors.\n\nMore information: [ggplot2: Elegant Graphics for Data Analysis](https://www.amazon.com/ggplot2-Elegant-Graphics-Data-Analysis/dp/0387981403)\n\n### Tips and Tricks\n\n- `ggplot2` expects data to be in data frames or tibbles. It is preferable for the data frames to be \"tidy\" with each variable as a column, each observation as a row, and each observational unit as a separate table. `dplyr` and `tidyr` contain concise and effective tools for \"tidying\" data.\n\n- R allows function arguments to be called explicitly by name and implicitly by position. The coding examples in this guide only contain named arguments for clarity.\n\n- Graphics will sometimes render differently on different operating systems. This is because anti-aliasing is activated in R on Mac and Linux but not activated in R on Windows. This won't be an issue once graphics are saved.\n\n- Continuous x-axes have ticks. Discrete x-axes do not have ticks. Use `remove_ticks()` to remove ticks.\n\n## Bar Plots\n\n------------------------------------------------------------------------\n\n### One Color\n\n```{r barplots}\nmtcars %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = factor(cyl), y = n)) +\n geom_col() +\n geom_text(mapping = aes(label = n), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) +\n remove_ticks() +\n remove_axis() \n```\n\n### One Color (Rotated)\n\nThis example introduces `coord_flip()` and `remove_axis(axis = \"x\", flip = TRUE)`. `remove_axis()` is from `library(urbnthemes)` and creates a custom theme for rotated bar plots.\n\n```{r barplot-rotated}\nmtcars %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = factor(cyl), y = n)) +\n geom_col() +\n geom_text(mapping = aes(label = n), hjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n coord_flip() +\n remove_axis(axis = \"x\", flip = TRUE)\n```\n\n### Three Colors\n\nThis is identical to the previous plot except colors and a legend are added with `fill = cyl`. Turning `x` into a factor with `factor(cyl)` skips 5 and 7 on the x-axis. Adding `fill = cyl` without `factor()` would have created a continuous color scheme and legend.\n\n```{r 3-color-barplot}\nmtcars %>%\n mutate(cyl = factor(cyl)) %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = cyl, y = n, fill = cyl)) +\n geom_col() +\n geom_text(mapping = aes(label = n), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) +\n remove_ticks() +\n remove_axis()\n```\n\n### Stacked Bar Plot\n\nAn additional aesthetic can easily be added to bar plots by adding `fill = categorical variable` to the mapping. 
Here, transmission type subsets each bar, showing the count of cars with different numbers of cylinders.\n\n```{r stacked-bar-plot}\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>% \n group_by(am) %>%\n count(cyl) %>%\n group_by(cyl) %>%\n arrange(desc(am)) %>%\n mutate(label_height = cumsum(n)) %>%\n ggplot() +\n geom_col(mapping = aes(x = cyl, y = n, fill = am)) +\n geom_text(aes(x = cyl, y = label_height - 0.5, label = n, color = am)) +\n scale_color_manual(values = c(\"white\", \"black\")) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n remove_axis() +\n guides(color = \"none\")\n```\n\n### Stacked Bar Plot With Position = Fill\n\nThe previous examples used `geom_col()`, which takes a y value for bar height. This example uses `geom_bar()`, which counts the observations in each group to generate bar heights. In this example, `position = \"fill\"` in `geom_bar()` changes the y-axis from count to the proportion of each bar.\n\n```{r stacked-bar-plot-fill}\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>% \n ggplot() +\n geom_bar(mapping = aes(x = cyl, fill = am), position = \"fill\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1)), labels = scales::percent) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n guides(color = \"none\")\n```\n\n### Dodged Bar Plot\n\nSubsetted bar charts in ggplot2 are stacked by default. `position = \"dodge\"` in `geom_col()` expands the bar chart so the bars appear next to each other.\n\n```{r dodged-bar-plot}\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>%\n group_by(am) %>%\n count(cyl) %>%\n ggplot(mapping = aes(cyl, y = n, fill = factor(am))) +\n geom_col(position = \"dodge\") +\n geom_text(aes(label = n), position = position_dodge(width = 0.7), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n remove_axis()\n```\n\n### Lollipop plot/Cleveland dot plot {.tabset}\n\nLollipop plots and Cleveland dot plots are minimalist alternatives to bar plots. The key to both plots is to order the data based on the continuous variable using `arrange()` and then turn the discrete variable into a factor with the ordered levels of the continuous variable using `mutate()`. 
This step \"stores\" the order of the data.\n\n#### Lollipop plot\n\n```{r lollipop-plot, fig.height = 5}\nmtcars %>%\n\trownames_to_column(\"model\") %>%\n\tarrange(mpg) %>%\n\tmutate(model = factor(model, levels = .$model)) %>%\n\tggplot(aes(mpg, model)) +\n\t\tgeom_segment(aes(x = 0, xend = mpg, y = model, yend = model)) +\n\t\tgeom_point() +\n\t\tscale_x_continuous(expand = expansion(mult = c(0, 0)), limits = c(0, 40)) +\n\t\tlabs(x = NULL, \n\t\t\t\t y = \"Miles Per Gallon\")\n```\n\n#### Cleveland dot plot\n\n```{r cleveland-dot-plot, fig.height = 5}\nmtcars %>%\n\trownames_to_column(\"model\") %>%\n\tarrange(mpg) %>%\n\tmutate(model = factor(model, levels = .$model)) %>%\n\tggplot(aes(mpg, model)) +\n\t\tgeom_point() +\n\t\tscale_x_continuous(expand = expansion(mult = c(0, 0)), limits = c(0, 40)) +\n\t\tlabs(x = NULL, \n\t\t\t\t y = \"Miles Per Gallon\")\n```\n\n### Dumbbell plot\n\n## Scatter Plots\n\n------------------------------------------------------------------------\n\n### One Color Scatter Plot\n\nScatter plots are useful for showing relationships between two or more variables. Use `scatter_grid()` from `library(urbnthemes)` to easily add vertical grid lines for scatter plots.\n\n```{r one-color-scatter-plot}\nmtcars %>%\n ggplot(mapping = aes(x = wt, y = mpg)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:8 * 5) +\n labs(x = \"Weight (thousands of pounds)\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n### High-Density Scatter Plot with Transparency\n\nLarge numbers of observations can sometimes make scatter plots tough to interpret because points overlap. Adding `alpha =` with a number between 0 and 1 adds transparency to points and clarity to plots. Now it's easy to see that jewelry stores are probably rounding up but not rounding down carats!\n\n```{r alpha-scatter-plot}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 20000),\n breaks = 0:4 * 5000,\n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n```\n\n### Hex Scatter Plot\n\nSometimes transparency isn't enough to bring clarity to a scatter plot with many observations. As n increases into the hundreds of thousands and even millions, `geom_hex` can be one of the best ways to display relationships between two variables.\n\n```{r scatter-plot-hex}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_hex(mapping = aes(fill = after_stat(count))) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 20000),\n breaks = 0:4 * 5000,\n labels = scales::dollar) +\n\tscale_fill_gradientn(labels = scales::comma) + \n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid() +\n theme(legend.position = \"right\",\n legend.direction = \"vertical\")\n```\n\n### Scatter Plots With Random Noise {.tabset}\n\nSometimes scatter plots have many overlapping points but a reasonable number of observations. `geom_jitter` adds a small amount of random noise so points are less likely to overlap. `width` and `height` control the amount of noise that is added. 
In the following before-and-after, notice how many more points are visible after adding jitter.\n\n#### Before\n\n```{r before-scatter-plot}\nmpg %>%\n ggplot(mapping = aes(x = displ, y = cty)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 8),\n breaks = 0:8) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:4 * 10) +\n labs(x = \"Displacement\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n#### After\n\n```{r jitter-plot}\nset.seed(2017)\nmpg %>%\n ggplot(mapping = aes(x = displ, y = cty)) +\n geom_jitter() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 8),\n breaks = 0:8) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:4 * 10) +\n labs(x = \"Displacement\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n### Scatter Plots with Varying Point Size\n\nWeights and populations can be mapped in scatter plots to the size of the points. Here, the number of households in each state is mapped to the size of each point using `aes(size = hhpop)`. Note: `ggplot2::geom_point()` is used instead of `geom_point()`.\n\n```{r geom_point-size, fig.height = 5}\nurbnmapr::statedata %>%\n ggplot(mapping = aes(x = medhhincome, y = horate)) +\n ggplot2::geom_point(mapping = aes(size = hhpop), alpha = 0.3) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(30000, 80000),\n breaks = 3:8 * 10000,\n labels = scales::dollar) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 0.8),\n breaks = 0:4 * 0.2) +\n scale_radius(range = c(3, 15),\n breaks = c(2500000, 7500000, 12500000), \n labels = scales::comma) +\n labs(x = \"Household income\",\n y = \"Homeownership rate\") +\n scatter_grid() +\n\ttheme(plot.margin = margin(r = 20))\n```\n\n### Scatter Plots with Fill\n\nA third aesthetic can be added to scatter plots. Here, color signifies the number of cylinders in each car. 
Before `ggplot()` is called, the `cyl` labels are created with `mutate()` from `library(dplyr)` and the piping operator `%>%`.\n\n```{r filled-scatter-plot}\nmtcars %>%\n mutate(cyl = paste(cyl, \"cylinders\")) %>%\n ggplot(aes(x = wt, y = mpg, color = cyl)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:8 * 5) +\n labs(x = \"Weight (thousands of pounds)\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n## Line Plots\n\n------------------------------------------------------------------------\n\n```{r line-plots}\neconomics %>%\n ggplot(mapping = aes(x = date, y = unemploy)) +\n geom_line() +\n scale_x_date(expand = expansion(mult = c(0.002, 0)), \n breaks = \"10 years\",\n limits = c(as.Date(\"1961-01-01\"), as.Date(\"2020-01-01\")),\n date_labels = \"%Y\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 4000,\n limits = c(0, 16000),\n labels = scales::comma) +\n labs(x = \"Year\", \n y = \"Number Unemployed (1,000s)\")\n```\n\n### Line Plots With Multiple Lines\n\n```{r multiple-line-charts1}\nlibrary(gapminder)\n\ngapminder %>%\n filter(country %in% c(\"Australia\", \"Canada\", \"New Zealand\")) %>%\n mutate(country = factor(country, levels = c(\"Canada\", \"Australia\", \"New Zealand\"))) %>%\n ggplot(aes(year, gdpPercap, color = country)) +\n geom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n breaks = c(1952 + 0:12 * 5), \n limits = c(1952, 2007)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:8 * 5000,\n labels = scales::dollar, \n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Per capita GDP (US dollars)\")\n```\n\nPlotting more than one variable can be useful for seeing the relationship of variables over time, but it takes a small amount of data munging.\n\nThis is because `ggplot2` wants data in a \"long\" format instead of a \"wide\" format for line plots with multiple lines. `gather()` and `spread()` from the `tidyr` package make switching back-and-forth between \"long\" and \"wide\" painless. Essentially, variable titles go into \"key\" and variable values go into \"value\". Then `ggplot2` turns the different levels of the key variable (here, the four European stock indices) into colors.\n\n```{r multiple-line-charts2}\nas_tibble(EuStockMarkets) %>%\n\tmutate(date = time(EuStockMarkets)) %>%\n\tgather(key = \"key\", value = \"value\", -date) %>%\n\tggplot(mapping = aes(x = date, y = value, color = key)) +\n\tgeom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(1991, 1999), \n breaks = c(1991, 1993, 1995, 1997, 1999)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 2500,\n labels = scales::dollar, \n limits = c(0, 10000)) + \n\tlabs(x = \"Date\",\n\t\t\t y = \"Value\")\n```\n\n### Step plot\n\n`geom_line()` connects coordinates with the shortest possible straight line. Sometimes step plots are necessary because y values don't change between coordinates. 
For example, the upper-bound of the Federal Funds Rate is set at regular intervals and remains constant until it is changed.\n\n```{r step-plot}\n# downloaded from FRED on 2018-12-06\n\n# https://fred.stlouisfed.org/series/DFEDTARU\n\nfed_fund_rate <- read_csv(\n \"date, fed_funds_rate\n 2014-01-01,0.0025\n 2015-12-16,0.0050\n 2016-12-14,0.0075\n 2017-03-16,0.0100\n 2017-06-15,0.0125\n 2017-12-14,0.0150\n 2018-03-22,0.0175\n 2018-06-14,0.0200\n 2018-09-27,0.0225\n 2018-12-06,0.0225\")\n\nfed_fund_rate %>%\n ggplot(mapping = aes(x = date, y = fed_funds_rate)) + \n geom_step() +\n scale_x_date(expand = expansion(mult = c(0.002, 0)), \n breaks = \"1 year\",\n limits = c(as.Date(\"2014-01-01\"), as.Date(\"2019-01-01\")),\n date_labels = \"%Y\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = c(0, 0.01, 0.02, 0.03),\n limits = c(0, 0.03),\n labels = scales::percent) + \n\tlabs(x = \"Date\",\n\t\t\t y = \"Upper-bound of the Federal Funds Rate\")\n```\n\n### Path plot\n\nThe Beveridge curve is a macroeconomic plot that displays a relationship between the unemployment rate and the vacancy rate. Movements along the curve indicate changes in the business cycle and horizontal shifts of the curve suggest structural changes in the labor market.\n\nLines in Beveridge curves do not monotonically move from left to right. Therefore, it is necessary to use `geom_path()`.\n\n```{r, path-plot}\n# seasonally-adjusted, quarterly vacancy rate - JOLTS\n# seasonally-adjusted, quarterly unemployment rate - CPS\n\n# pulled from FRED on April 11, 2018. \n\nlibrary(ggrepel)\n\nbeveridge <- read_csv(\n\t\"quarter, vacancy_rate, unemployment_rate\n\t2006-01-01,0.0310,0.0473\n\t2006-04-01,0.0316,0.0463\n\t2006-07-01,0.0313,0.0463\n\t2006-10-01,0.0310,0.0443\n\t2007-01-01,0.0323,0.0450\n\t2007-04-01,0.0326,0.0450\n\t2007-07-01,0.0316,0.0466\n\t2007-10-01,0.0293,0.0480\n\t2008-01-01,0.0286,0.0500\n\t2008-04-01,0.0280,0.0533\n\t2008-07-01,0.0253,0.0600\n\t2008-10-01,0.0220,0.0686\n\t2009-01-01,0.0196,0.0826\n\t2009-04-01,0.0180,0.0930\n\t2009-07-01,0.0176,0.0963\n\t2009-10-01,0.0180,0.0993\n\t2010-01-01,0.0196,0.0983\n\t2010-04-01,0.0220,0.0963\n\t2010-07-01,0.0216,0.0946\n\t2010-10-01,0.0220,0.0950\n\t2011-01-01,0.0226,0.0903\n\t2011-04-01,0.0236,0.0906\n\t2011-07-01,0.0250,0.0900\n\t2011-10-01,0.0243,0.0863\n\t2012-01-01,0.0270,0.0826\n\t2012-04-01,0.0270,0.0820\n\t2012-07-01,0.0266,0.0803\n\t2012-10-01,0.0260,0.0780\n\t2013-01-01,0.0276,0.0773\n\t2013-04-01,0.0280,0.0753\n\t2013-07-01,0.0280,0.0723\n\t2013-10-01,0.0276,0.0693\n\t2014-01-01,0.0290,0.0666\n\t2014-04-01,0.0323,0.0623\n\t2014-07-01,0.0326,0.0610\n\t2014-10-01,0.0330,0.0570\n\t2015-01-01,0.0350,0.0556\n\t2015-04-01,0.0366,0.0540\n\t2015-07-01,0.0373,0.0510\n\t2015-10-01,0.0360,0.0500\n\t2016-01-01,0.0386,0.0493\n\t2016-04-01,0.0383,0.0486\n\t2016-07-01,0.0383,0.0493\n\t2016-10-01,0.0363,0.0473\n\t2017-01-01,0.0366,0.0466\n\t2017-04-01,0.0390,0.0433\n\t2017-07-01,0.0406,0.0430\n\t2017-10-01,0.0386,0.0410\")\n\nlabels <- beveridge %>%\n filter(lubridate::month(quarter) == 1)\n\nbeveridge %>%\n\tggplot() +\n\tgeom_path(mapping = aes(x = unemployment_rate, y = vacancy_rate), alpha = 0.5) +\n geom_point(data = labels, mapping = aes(x = unemployment_rate, y = vacancy_rate)) +\n geom_text_repel(data = labels, mapping = aes(x = unemployment_rate, y = vacancy_rate, label = lubridate::year(quarter))) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0.04, 0.1),\n labels = scales::percent) +\n 
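# format the vacancy rate axis as a percent, matching the x-axis\n 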
scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = c(0, 0.01, 0.02, 0.03, 0.04, 0.05),\n limits = c(0, 0.05),\n labels = scales::percent) + \n\tlabs(x = \"Seasonally-adjusted unemployment rate\",\n\t\t\t y = \"Seasonally-adjusted vacancy rate\") + \n scatter_grid()\n```\n\n### Slope plots\n\n```{r slope-plot, fig.height = 5}\n# https://www.bls.gov/lau/\nlibrary(ggrepel)\n\nunemployment <- tibble(\n\ttime = c(\"October 2009\", \"October 2009\", \"October 2009\", \"August 2017\", \"August 2017\", \"August 2017\"),\n\trate = c(7.4, 7.1, 10.0, 3.9, 3.8, 6.4),\n\tstate = c(\"Maryland\", \"Virginia\", \"Washington, D.C.\", \"Maryland\", \"Virginia\", \"Washington, D.C.\")\n)\n\nlabel <- tibble(label = c(\"October 2009\", \"August 2017\"))\noctober <- filter(unemployment, time == \"October 2009\")\naugust <- filter(unemployment, time == \"August 2017\")\n\nunemployment %>%\n\tmutate(time = factor(time, levels = c(\"October 2009\", \"August 2017\")),\n\t state = factor(state, levels = c(\"Washington, D.C.\", \"Maryland\", \"Virginia\"))) %>%\n\tggplot() + \n\tgeom_line(aes(time, rate, group = state, color = state), show.legend = FALSE) +\n\tgeom_point(aes(x = time, y = rate, color = state)) +\n\tlabs(subtitle = \"Unemployment Rate\") +\n\ttheme(axis.ticks.x = element_blank(),\n\t\t\t\taxis.title.x = element_blank(),\n\t\t\t\taxis.ticks.y = element_blank(),\n axis.title.y = element_blank(), \n axis.text.y = element_blank(),\n\t\t\t\tpanel.grid.major.y = element_blank(),\n panel.grid.minor.y = element_blank(),\n panel.grid.major.x = element_blank(),\n\t\t\t\taxis.line = element_blank()) +\n\tgeom_text_repel(data = october, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = -0.06) + \n\tgeom_text_repel(data = august, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = 0.06)\n```\n\n## Univariate\n\n------------------------------------------------------------------------\n\nThere are a number of ways to explore the distributions of univariate data in R. Some methods, like strip charts, show all data points. Other methods, like the box and whisker plot, show selected data points that communicate key values like the median and 25th percentile. Finally, some methods don't show any of the underlying data but calculate density estimates. Each method has advantages and disadvantages, so it is worthwhile to understand the different forms. For more information, read [40 years of boxplots](http://vita.had.co.nz/papers/boxplots.pdf) by Hadley Wickham and Lisa Stryjewski.\n\n### Strip Chart\n\nStrip charts, the simplest univariate plot, show the distribution of values along one axis. Strip charts work best with variables that have plenty of variation. If not, the points tend to cluster on top of each other. 
Even if the variable has plenty of variation, it is often important to add transparency to the points with `alpha =` so overlapping values are visible.\n\n```{r stripchart, fig.height=2}\nmsleep %>%\n ggplot(aes(x = sleep_total, y = factor(1))) +\n geom_point(alpha = 0.2, size = 5) +\n labs(y = NULL) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) +\n scale_y_discrete(labels = NULL) +\n labs(title = \"Total Sleep Time of Different Mammals\",\n x = \"Total sleep time (hours)\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n```\n\n### Strip Chart with Highlighting\n\nBecause strip charts show all values, they are useful for showing where selected points lie in the distribution of a variable. The clearest way to do this is by adding `geom_point()` twice with `filter()` in the data argument. This way, the highlighted values show up on top of unhighlighted values.\n\n```{r stripchart-with-highlighting, fig.height=2}\nggplot() +\n geom_point(data = filter(msleep, name != \"Red fox\"), \n aes(x = sleep_total, \n y = factor(1)),\n alpha = 0.2, \n size = 5,\n \t\t\t\t\t color = \"grey50\") +\n geom_point(data = filter(msleep, name == \"Red fox\"),\n aes(x = sleep_total, \n y = factor(1), \n color = name),\n alpha = 0.8,\n size = 5) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) + \n scale_y_discrete(labels = NULL) +\n labs(title = \"Total Sleep Time of Different Mammals\",\n x = \"Total sleep time (hours)\",\n y = NULL) +\n guides(color = guide_legend(title = NULL)) +\n theme(axis.ticks.y = element_blank())\n```\n\n### Subsetted Strip Chart\n\nAdd a y variable to see the distributions of the continuous variable in subsets of a categorical variable.\n\n```{r subsetted-stripchart, fig.height=3}\nlibrary(forcats)\n\nmsleep %>%\n filter(!is.na(vore)) %>%\n mutate(vore = fct_recode(vore, \n \"Insectivore\" = \"insecti\",\n \"Omnivore\" = \"omni\", \n \"Herbivore\" = \"herbi\", \n \"Carnivore\" = \"carni\"\n )) %>%\n ggplot(aes(x = sleep_total, y = vore)) +\n geom_point(alpha = 0.2, size = 5) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) + \n labs(title = \"Total Sleep Time of Different Mammals by Diet\",\n x = \"Total sleep time (hours)\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n```\n\n### Beeswarm Plots\n\nBeeswarm plots are a variation of strip charts that show the distribution of data, but without the points overlapping.\n\n```{r beeswarm}\nlibrary(ggbeeswarm)\n\ntxhousing %>%\n\tfilter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>% \n ggplot(aes(x = median, y = city)) +\n geom_beeswarm(alpha = 0.2, size = 5) + \n\tscale_x_continuous(labels = scales::dollar) +\n labs(title = \"Household Sale Price by City\",\n x = \"Sale Price\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n\n```\n\n### Histograms\n\nHistograms divide the distribution of a variable into n equal-sized bins and then count and display the number of observations in each bin. Histograms are sensitive to bin width. 
As `?geom_histogram` notes, \"You should always override \\[the default binwidth\\] value, exploring multiple widths to find the best to illustrate the stories in your data.\"\n\n```{r histogram}\nggplot(data = diamonds, mapping = aes(x = depth)) + \n geom_histogram(bins = 100) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 100)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), labels = scales::comma) +\n labs(x = \"Depth\",\n y = \"Count\")\n```\n\n### Boxplots\n\nBoxplots were invented in the 1970s by John Tukey[^1]. Instead of showing the underlying data or binned counts of the underlying data, they focus on important values like the 25th percentile, median, and 75th percentile.\n\n[^1]: Wickham, H., & Stryjewski, L. (2011). 40 years of boxplots.\n\n```{r box-plot}\nInsectSprays %>%\n ggplot(mapping = aes(x = spray, y = count)) +\n geom_boxplot() +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Type of insect spray\",\n y = \"Number of dead insects\") +\n remove_ticks()\n```\n\n### Smoothed Kernel Density Plots\n\nContinuous variables with smooth distributions are sometimes better represented with smoothed kernel density estimates than histograms or boxplots. `geom_density()` computes and plots a kernel density estimate. Notice the lumps around integers and halves in the following distribution because of rounding.\n\n```{r kernel-density-plot}\ndiamonds %>%\n ggplot(mapping = aes(carat)) +\n geom_density(color = NA) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, NA)) +\n\tscale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Carat\",\n y = \"Density\")\n```\n\n```{r kernel-density-plot-filled}\ndiamonds %>%\n mutate(cost = ifelse(price > 5500, \"More than $5,500 +\", \"$0 to $5,500\")) %>%\n ggplot(mapping = aes(carat, fill = cost)) +\n geom_density(alpha = 0.25, color = NA) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, NA)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Carat\",\n y = \"Density\")\n```\n\n### Ridgeline Plots\n\nRidgeline plots are partially overlapping smoothed kernel density plots faceted by a categorical variable that pack a lot of information into one elegant plot.\n\n```{r ridgeline-plots}\nlibrary(ggridges)\n\nggplot(diamonds, mapping = aes(x = price, y = cut)) +\n\tgeom_density_ridges(fill = \"#1696d2\") +\n labs(x = \"Price\",\n y = \"Cut\")\n```\n\n### Violin Plots\n\nViolin plots are symmetrical displays of smooth kernel density plots.\n\n```{r violin-plot}\nInsectSprays %>%\n ggplot(mapping = aes(x = spray, y = count, fill = spray)) +\n geom_violin(color = NA) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Type of insect spray\",\n y = \"Number of dead insects\") +\n remove_ticks()\n```\n\n### Bean Plot\n\nIndividual outliers and important summary values are not visible in violin plots or smoothed kernel density plots. 
Bean plots, [created by Peter Kampstra in 2008](https://www.jstatsoft.org/article/view/v028c01), are violin plots with data shown as small lines in a one-dimensional strip plot and larger lines for the mean.\n\n```{r beanplot}\nmsleep %>%\n filter(!is.na(vore)) %>%\n mutate(vore = fct_recode(vore, \n \"Insectivore\" = \"insecti\",\n \"Omnivore\" = \"omni\", \n \"Herbivore\" = \"herbi\", \n \"Carnivore\" = \"carni\"\n )) %>%\n ggplot(aes(x = vore, y = sleep_total, fill = vore)) +\n stat_summary(fun = \"mean\",\n colour = \"black\", \n size = 30,\n shape = 95,\n geom = \"point\") +\n geom_violin(color = NA) +\n geom_jitter(width = 0,\n height = 0.05,\n alpha = 0.4,\n shape = \"-\",\n size = 10,\n \t\t\t\t\t\tcolor = \"grey50\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) + \n labs(x = NULL,\n y = \"Total sleep time (hours)\") +\n theme(legend.position = \"none\") +\n remove_ticks()\n```\n\n## Area Plot\n\n------------------------------------------------------------------------\n\n### Stacked Area\n\n```{r area-plot-stack}\ntxhousing %>%\n filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>%\n group_by(city, year) %>%\n summarize(sales = sum(sales)) %>%\n ggplot(aes(x = year, y = sales, fill = city)) +\n geom_area(position = \"stack\") +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(2000, 2015),\n breaks = 2000 + 0:15) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n labels = scales::comma) +\n labs(x = \"Year\",\n y = \"Home sales\")\n```\n\n### Filled Area\n\n```{r area-plot-fill}\ntxhousing %>%\n filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>%\n group_by(city, year) %>%\n summarize(sales = sum(sales)) %>%\n ggplot(aes(x = year, y = sales, fill = city)) +\n geom_area(position = \"fill\") +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(2000, 2015),\n breaks = 2000 + 0:15) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.02)),\n breaks = c(0, 0.25, 0.5, 0.75, 1),\n labels = scales::percent) +\n labs(x = \"Year\",\n y = \"Home sales\")\n```\n\n## Sankey Plot\n\n------------------------------------------------------------------------\n\nSankey plots visualize flows from one set of variables to another. This can be useful for showing outcomes from the start of a program to the end. You'll need to install the `ggsankey` package to create Sankey plots in R. In this example, I make a dummy data set of housing status prior to program start and at exit to show the flow of people between outcomes. A key step is to transform your data set using the `make_long` function from the package. 
This creates a data frame that specifies each of the initial nodes and how they flow into the next stage.\n\n```{r}\n# install ggsankey from GitHub once, then load it\n# remotes::install_github(\"davidsjoberg/ggsankey\")\nlibrary(ggsankey)\n\n# create a dummy dataset of housing status\ndf <- tibble(entry_status = c(rep(\"Housed\", 7), rep(\"Unhoused\", 15), rep(\"Staying w/ Family\", 8)), \n exit_status = c(rep(\"Housed\", 15), rep(\"Unhoused\", 2), rep(\"Staying w/ Family\", 13))) %>% \n\t# transform the data frame into the proper format for the sankey plot\n make_long(entry_status, exit_status) %>% \n\t# recode the labels to be cleaner in the plot \n mutate(x = recode(x, entry_status = \"Prior Housing Status\", exit_status = \"Exit Housing Status\"),\n next_x = recode(next_x, entry_status = \"Prior Housing Status\", exit_status = \"Exit Housing Status\"))\n\n# create sankey plot\nggplot(df, aes(x = x, \n next_x = next_x, \n node = node, \n next_node = next_node,\n fill = factor(node), \n label = node)) +\n geom_sankey(flow.alpha = 0.5, node.color = 1, show.legend = FALSE) +\n # add labels to plot and style\n geom_sankey_label(size = 3.5, color = 1, fill = \"white\") +\n theme_sankey(base_size = 16) +\n labs(x = NULL)\n```\n\n## Heat Map\n\n------------------------------------------------------------------------\n\n```{r heat-map}\nlibrary(fivethirtyeight)\n\nbad_drivers %>%\n filter(state %in% c(\"Maine\", \"New Hampshire\", \"Vermont\", \"Massachusetts\", \"Connecticut\", \"New York\")) %>%\n mutate(`Number of\\nDrivers` = scale(num_drivers),\n `Percent\\nSpeeding` = scale(perc_speeding),\n `Percent\\nAlcohol` = scale(perc_alcohol),\n `Percent Not\\nDistracted` = scale(perc_not_distracted),\n `Percent No\\nPrevious` = scale(perc_no_previous),\n state = factor(state, levels = rev(state))\n ) %>%\n select(-insurance_premiums, -losses, -(num_drivers:losses)) %>%\n gather(`Number of\\nDrivers`:`Percent No\\nPrevious`, key = \"variable\", value = \"SD's from Mean\") %>%\n ggplot(aes(variable, state)) +\n geom_tile(aes(fill = `SD's from Mean`)) +\n labs(x = NULL,\n y = NULL) + \n scale_fill_gradientn() +\n theme(legend.position = \"right\",\n legend.direction = \"vertical\",\n axis.line.x = element_blank(),\n panel.grid.major.y = element_blank()) +\n remove_ticks()\n#https://learnr.wordpress.com/2010/01/26/ggplot2-quick-heatmap-plotting/\n```\n\n## Faceting and Small Multiples\n\n------------------------------------------------------------------------\n\n### facet_wrap()\n\nR's faceting system is a powerful way to make \"small multiples\".\n\nSome edits to the theme may be necessary depending upon how many rows and columns are in the plot.\n\n```{r small-multiples, fig.height=2}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n facet_wrap(~cut, ncol = 5) +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 6)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 20000), \n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n```\n\n### facet_grid()\n\n```{r faceting, fig.height=7}\ndiamonds %>%\n filter(color %in% c(\"D\", \"E\", \"F\", \"G\")) %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n facet_grid(color ~ cut) +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 4)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 20000), \n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n 
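# add space between facet panels so the axis labels stay readable\n 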
theme(panel.spacing = unit(20L, \"pt\")) +\n scatter_grid()\n```\n\n## Smoothers\n\n------------------------------------------------------------------------\n\n`geom_smooth()` fits and plots models to data with two or more dimensions.\n\nUnderstanding and manipulating defaults is more important for `geom_smooth()` than for other geoms because it contains a number of assumptions. `geom_smooth()` automatically uses loess for datasets with fewer than 1,000 observations and a generalized additive model with `formula = y ~ s(x, bs = \"cs\")` for datasets with 1,000 or more observations. Both default to displaying a 95% confidence interval.\n\nModels are chosen with `method =` and can be set to `lm()`, `glm()`, `gam()`, `loess()`, `rlm()`, and more. Formulas can be specified with `formula =` and `y ~ x` syntax. Plotting the standard error is toggled with `se = TRUE` and `se = FALSE`, and level is specified with `level =`. As always, more information can be seen in RStudio with `?geom_smooth()`.\n\n`geom_point()` adds a scatterplot to `geom_smooth()`. The order of the function calls is important. The function called second will be laid on top of the function called first.\n\n```{r geom_smooth}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n\tgeom_point(alpha = 0.05) +\n\tgeom_smooth(color = \"#ec008b\") +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t limits = c(0, 5),\n\t breaks = 0:5) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 20000), \n labels = scales::dollar) + \n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n```\n\n`geom_smooth` can be subset by categorical and factor variables. This requires subgroups to have a decent number of observations and a fair amount of variability across the x-axis. Confidence intervals often widen at the ends so special care is needed for the chart to be meaningful and readable.\n\nThis example uses loess to model highway MPG as a function of engine displacement.\n\n```{r subset-geom_smooth}\nggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = factor(cyl))) +\n\tgeom_point(alpha = 0.2) +\n\tgeom_smooth() +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t limits = c(0, 7),\n\t breaks = 0:7) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 60)) + \n\tlabs(x = \"Engine displacement\",\n\t\t\t y = \"Highway MPG\") +\n scatter_grid()\n```\n\nThis example uses linear models for the same relationship.\n\n```{r subset-geom-smooth-lm}\nggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = factor(cyl))) +\n\tgeom_point(alpha = 0.2) +\n\tgeom_smooth(method = \"lm\") +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t limits = c(0, 7),\n\t breaks = 0:7) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n limits = c(0, 60)) + \n\tlabs(x = \"Engine displacement\",\n\t\t\t y = \"Highway MPG\") +\n scatter_grid()\n```\n\n## Highlighting\n\n------------------------------------------------------------------------\n\n[`library(gghighlight)`](https://yutannihilation.github.io/gghighlight/) enables the intuitive highlighting of ggplot2 plots. `gghighlight` modifies existing ggplot2 objects, so no other code should change. All of the highlighting is handled by the function `gghighlight()`, which can handle all types of geoms.\n\n*Warning:* R will throw an error if too many colors are highlighted because of the design of `urbnthemes`. 
Simply decrease the number of highlighted geoms to solve this issue.\n\nThere are two main ways to highlight.\n\n### Threshold\n\nThe first way to highlight is with a threshold. Add a logical test to `gghighlight()` to describe which lines should be highlighted. Here, lines with maximum change in per-capita Gross Domestic Product greater than \\$35,000 are highlighted by `gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE)`.\n\n```{r gghighlight-threshold}\nlibrary(gghighlight)\nlibrary(gapminder)\n\ndata <- gapminder %>%\n filter(continent %in% c(\"Europe\")) %>%\n group_by(country) %>%\n mutate(pcgpd_change = ifelse(year == 1952, 0, gdpPercap - lag(gdpPercap))) %>%\n mutate(pcgpd_change = cumsum(pcgpd_change))\n \ndata %>%\n ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\")\n```\n\n### Rank\n\nThe second way to highlight is by rank. Here, the countries with the five highest values for change in per-capita Gross Domestic Product are highlighted with `gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE)`.\n\n```{r gghighlight-rank}\ndata %>%\n ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\")\n```\n\n### Faceting\n\n`gghighlight()` works well with ggplot2's faceting system.\n\n```{r gghighlight-faceting}\ndata %>%\n ggplot(aes(year, pcgpd_change, group = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change), max_highlight = 4, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n breaks = c(seq(1950, 2010, 10)),\n limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n breaks = 0:8 * 5000,\n labels = scales::dollar,\n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Change in per-capita GDP (US dollars)\") +\n facet_wrap(~ country) +\n theme(panel.spacing = unit(20L, \"pt\"))\n```\n\n## Text and Annotation\n\n------------------------------------------------------------------------\n\nSeveral functions can be used to annotate, label, and highlight different parts of plots. `geom_text()` and `geom_text_repel()` both display variables from data frames. `annotate()`, which has several different uses, displays variables and values included in the function call.\n\n### geom_text()\n\n`geom_text()` turns text variables in data sets into geometric objects. This is useful for labeling data in plots. 
Both functions need `x` values and `y` values to determine placement on the coordinate plane, and a text vector of labels.\n\nThis can be used to label `geom_bar()`.\n\n```{r bar-geom_text}\ndiamonds %>%\n group_by(cut) %>%\n summarize(price = mean(price)) %>%\n ggplot(aes(cut, price)) +\n geom_bar(stat = \"identity\") +\n geom_text(aes(label = scales::dollar(price)), vjust = -1) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2)),\n \t\t\t\t\t\t\t\t\t labels = scales::dollar) +\n labs(title = \"Average Diamond Price by Diamond Cut\",\n x = \"Cut\",\n y = \"Price\") +\n remove_ticks()\n```\n\nIt can also be used to label points in a scatter plot.\n\nIt's rarely useful to label every point in a scatter plot. Use `filter()` to create a second data set that is subsetted and pass it into the labelling function.\n\n```{r scatterplot-geom_text}\nlabels <- mtcars %>%\n\trownames_to_column(\"model\") %>%\n\tfilter(model %in% c(\"Toyota Corolla\", \"Merc 240D\", \"Datsun 710\"))\n\nmtcars %>%\n\tggplot() +\n\tgeom_point(mapping = aes(x = wt, y = mpg)) +\n\tgeom_text(data = labels, mapping = aes(x = wt, y = mpg, label = model), nudge_x = 0.38) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 6)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 40)) + \n labs(x = \"Weight (thousands of pounds)\",\n y = \"Miles per gallon (MPG)\") +\n scatter_grid()\n```\n\nText too often overlaps with other text or geoms when using `geom_text()`. `library(ggrepel)` is a `library(ggplot2)` add-on that automatically positions text so it doesn't overlap with geoms or other text. To add this functionality, install and load `library(ggrepel)` and then use `geom_text_repel()` with the same syntax as `geom_text()`.\n\n### geom_text_repel()\n\n```{r scatterplot-geom_text_repel}\nlibrary(ggrepel)\n\nlabels <- mtcars %>%\n\trownames_to_column(\"model\") %>%\n\ttop_n(5, mpg)\n\nmtcars %>%\n\tggplot(mapping = aes(x = wt, y = mpg)) +\n\tgeom_point() +\n\tgeom_text_repel(data = labels, \n\t mapping = aes(label = model), \n\t nudge_x = 0.38) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 6)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 40)) + \n labs(x = \"Weight (thousands of pounds)\",\n y = \"Miles per gallon (MPG)\") +\n scatter_grid()\n```\n\n### annotate()\n\n`annotate()` doesn't use data frames. Instead, it takes values for `x =` and `y =`. 
It can add text, rectangles, segments, and pointrange.\n\n```{r annotate-point}\nmsleep %>%\n filter(bodywt <= 1000) %>%\n ggplot(aes(bodywt, sleep_total)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(-10, 1000),\n \t\t\t\t\t\t\t\t\t labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 25)) + \n annotate(\"text\", x = 500, y = 12, label = \"These data suggest that heavy \\n animals sleep less than light animals\") +\n labs(x = \"Body weight (pounds)\",\n y = \"Sleep time (hours)\") +\n scatter_grid() \n```\n\n```{r annotate-rect}\nlibrary(AmesHousing)\n\names <- make_ames()\n\names %>%\n mutate(square_footage = Total_Bsmt_SF - Bsmt_Unf_SF + First_Flr_SF + Second_Flr_SF) %>%\n mutate(Sale_Price = Sale_Price / 1000) %>% \n ggplot(aes(square_footage, Sale_Price)) +\n geom_point(alpha = 0.2) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(-10, 12000),\n \t\t\t\t\t\t\t\t\t labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 800),\n \t\t\t\t\t\t\t\t\t labels = scales::dollar) + \n annotate(\"rect\", xmin = 6800, xmax = 11500, ymin = 145, ymax = 210, alpha = 0.1) +\n annotate(\"text\", x = 8750, y = 230, label = \"Unfinished homes\") +\n labs(x = \"Square footage\", \n y = \"Sale price (thousands)\") +\n scatter_grid() \n```\n\n## Layered Geoms\n\n------------------------------------------------------------------------\n\nGeoms can be layered in `ggplot2`. This is useful for design and analysis.\n\nIt is often useful to add points to line plots with a small number of values across the x-axis. This example from [R for Data Science](http://r4ds.had.co.nz/tidy-data.html) shows how changing the line to grey can be appealing.\n\n### Design {.tabset}\n\n#### Before\n\n```{r layering-geoms-design}\ntable1 %>%\n\tggplot(aes(x = year, y = cases)) +\n\t\tgeom_line(aes(color = country)) +\n\t\tgeom_point(aes(color = country)) +\n\t\tscale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n\t\t labels = scales::comma) +\n\t\tscale_x_continuous(breaks = c(1999, 2000)) +\n\t\tlabs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n```\n\n#### After\n\n```{r layering-geoms-design-gray}\ntable1 %>%\n\tggplot(aes(year, cases)) +\n\t\tgeom_line(aes(group = country), color = \"grey50\") +\n\t\tgeom_point(aes(color = country)) +\n\t\tscale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n\t\t labels = scales::comma) +\n\t\tscale_x_continuous(breaks = c(1999, 2000)) +\n\t\tlabs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n```\n\n### Centroids\n\n```{r centroids}\nmpg_summary <- mpg %>%\n\tgroup_by(cyl) %>%\n\tsummarize(displ = mean(displ), cty = mean(cty))\n\nmpg %>%\n\tggplot() +\n\tgeom_point(aes(x = displ, y = cty, color = factor(cyl)), alpha = 0.5) +\n\tgeom_point(data = mpg_summary, aes(x = displ, y = cty), size = 5, color = \"#ec008b\") +\n\tgeom_text(data = mpg_summary, aes(x = displ, y = cty, label = cyl)) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 8)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0)), \n limits = c(0, 40)) +\n\tlabs(x = \"Displacement\",\n\t y = \"City MPG\") +\n scatter_grid()\n```\n\n## Saving Plots\n\n------------------------------------------------------------------------\n\n`ggsave()` exports ggplot2 plots. The function can be used in two ways. 
## urbnthemes\n\n### Overview\n\n`urbnthemes` is a set of tools for creating Urban Institute-themed plots and maps in R. The package extends `ggplot2` with print and map themes as well as tools that make plotting easier at the Urban Institute. `urbnthemes` replaces the [urban_R\_theme](https://github.com/UrbanInstitute/urban_R_theme).\n\nAlways load `library(urbnthemes)` after `library(ggplot2)` or `library(tidyverse)`.\n\n### Usage\n\nUse `set_urbn_defaults(style = \"print\")` to set the default styles. `scatter_grid()`, `remove_ticks()`, `add_axis()`, and `remove_axis()` can all be used to improve graphics.\n\n```{r example, message=FALSE}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nggplot(data = mtcars, mapping = aes(factor(cyl))) +\n  geom_bar() + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n  labs(x = \"Number of Cylinders\",\n       y = \"Count\") +\n  remove_ticks()\n```\n\n### Combining elements\n\n`library(urbnthemes)` contains functions for combining plot elements into graphics. `urbn_plot()` brings all of the elements together.\n\n-   `urbn_logo_text()`\n-   `remove_ticks()`\n-   `remove_axis()`\n-   `scatter_grid()`\n-   `add_axis()`\n-   `urbn_geofacet`\n\n```{r example2}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nplot <- ggplot(data = mtcars, mapping = aes(factor(cyl))) +\n  geom_bar() + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n  labs(x = \"Number of Cylinders\",\n       y = \"Count\") +\n  remove_ticks()\n\nurbn_plot(plot, urbn_logo_text(), ncol = 1, heights = c(30, 1))\n```\n\nSometimes it's important to add the y-axis title horizontally above the plot. `urbn_y_title()` can be used for this task.
The following example goes one step further and adds the title between the legend and the plot.\n\n```{r}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults()\n\nplot <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = factor(cyl))) +\n  geom_point() + \n\tscale_x_continuous(expand = c(0, 0),\n\t\t\t\t\t\t\t\t\t\t limits = c(0, 8)) +\n  scale_y_continuous(expand = c(0, 0),\n  \t\t\t\t\t\t\t\t\t limits = c(0, 40)) +\n  remove_ticks() +\n\tlabs(\"\") +\n\tscatter_grid()\n\nurbn_plot(get_legend(plot),\n\t\t\t\t\turbn_y_title(\"Miles per gallon\"),\n\t\t\t\t\tremove_legend(plot), \n\t\t\t\t\turbn_logo_text(), \n\t\t\t\t\tncol = 1, \n\t\t\t\t\theights = c(3, 1, 30, 1))\n```\n\n### Palettes\n\n`urbnthemes` contains many quick-access color palettes from the [Urban Institute Data Visualization Style Guide](http://urbaninstitute.github.io/graphics-styleguide/). These palettes can be used to quickly overwrite default color palettes from `urbnthemes`.\n\n-   `palette_urbn_main` is the eight-color discrete palette of the Urban Institute with cyan, yellow, black, gray, magenta, green, space gray, and red.\n-   `palette_urbn_diverging` is an eight-color diverging palette.\n-   `palette_urbn_quintile` is a five-color blue palette that is good for quintiles.\n-   `palette_urbn_politics` is a two-color palette with blue for Democrats and red for Republicans.\n\nThere are seven palettes that are continuous palettes of the seven unique colors in the discrete Urban Institute color palette:\n\n-   `palette_urbn_cyan`\n-   `palette_urbn_gray`\n-   `palette_urbn_yellow`\n-   `palette_urbn_magenta`\n-   `palette_urbn_green`\n-   `palette_urbn_spacegray`\n-   `palette_urbn_red`\n\nUse `view_palette()` to see the palette:\n\n```{r view-palette}\nview_palette(palette_urbn_magenta)\n```\n\nThe vectors can be subset using base R syntax. This allows for the quick selection of specific colors from a palette.\n\n```{r palette-subset1}\npalette_urbn_main[1:4]\n```\n\n```{r palette-subset2}\npalette_urbn_spacegray[1:5]\n```\n\n### Utility functions\n\n`library(urbnthemes)` contains four functions that are helpful with managing font installations:\n\n-   `lato_test()`\n-   `lato_install()`\n-   `fontawesome_test()`\n-   `fontawesome_install()`\n\n## Bibliography and Session Information\n\n------------------------------------------------------------------------\n\n*Note:* Examples present in [this document](https://awunderground.github.io/ggplot2-themes/) by Aaron Williams were created during personal time.\n\nBob Rudis and Dave Gandy (2017). waffle: Create Waffle Chart Visualizations in R. R package version 0.7.0. https://CRAN.R-project.org/package=waffle\n\nChester Ismay and Jennifer Chunn (2017). fivethirtyeight: Data and Code Behind the Stories and Interactives at 'FiveThirtyEight'. R package version 0.3.0. https://CRAN.R-project.org/package=fivethirtyeight\n\nHadley Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2009.\n\nHadley Wickham (2017). tidyverse: Easily Install and Load the 'Tidyverse'. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\n\nHadley Wickham (2017). forcats: Tools for Working with Categorical Variables (Factors). R package version 0.2.0. https://CRAN.R-project.org/package=forcats\n\nJennifer Bryan (2017). gapminder: Data from Gapminder. R package version 0.3.0. https://CRAN.R-project.org/package=gapminder\n\nKamil Slowikowski (2017). ggrepel: Repulsive Text and Label Geoms for 'ggplot2'. R package version 0.7.0. 
https://CRAN.R-project.org/package=ggrepel\n\nMax Kuhn (2017). AmesHousing: The Ames Iowa Housing Data. R package version 0.0.3. https://CRAN.R-project.org/package=AmesHousing\n\nPeter Kampstra (2008). Beanplot: A Boxplot Alternative for Visual Comparison of Distributions, Journal of Statistical Software, 2008. https://www.jstatsoft.org/article/view/v028c01\n\nR Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.\n\nWinston Chang, (2014). extrafont: Tools for using fonts. R package version 0.17. https://CRAN.R-project.org/package=extrafont\n\nYihui Xie (2018). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.19.\n\n```{r System Info and Package Versioning}\nsessionInfo()\n```\n"},"formats":{"html":{"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[]},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"graphics-guide.html"},"language":{},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.2.269","editor_options":{"chunk_output_type":"console"}},"extensions":{"book":{"multiFile":true}}}}} \ No newline at end of file +{"title":"Urban Institute R Graphics Guide","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"editor_options":{"chunk_output_type":"console"}},"headingText":"Urban Institute R Graphics Guide","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n\n```{r setup, include=FALSE}\nlibrary(knitr)\nlibrary(datasets)\nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nopts_chunk$set(fig.path = \"graphics-guide/www/images/\")\nopts_chunk$set(echo = TRUE)\nopts_chunk$set(warning = FALSE)\nopts_chunk$set(message = FALSE)\nopts_chunk$set(fig.width = 6.5)\nopts_chunk$set(fig.height = 4)\nopts_chunk$set(fig.retina = 3)\noptions(scipen = 999)\n```\n\nR is a powerful, open-source programming language and environment. R excels at data management and munging, traditional statistical analysis, machine learning, and reproducible research, but it is probably best known for its graphics. 
This guide contains examples and instructions for popular and lesser-known plotting techniques in R. It also includes instructions for using `urbnthemes`, the Urban Institute's R package for creating near-publication-ready plots with `ggplot2`. If you have any questions, please don't hesitate to contact Aaron Williams (awilliams\@urban.org) or Kyle Ueyama (kueyama\@urban.org).\n\n### Background\n\n`library(urbnthemes)` makes `ggplot2` output align more closely with [the Urban Institute's Data Visualization style guide](http://urbaninstitute.github.io/graphics-styleguide/). This package does **not produce publication-ready graphics**. Visual styles must still be edited using your project/paper's normal editing workflow.\n\nExporting charts as a PDF will allow them to be more easily edited. See the Saving Plots section for more information.\n\nThe theme has been tested against `ggplot2 version 3.0.0`. It will not function properly with older versions of `ggplot2`.\n\n### Using library(urbnthemes)\n\nRun the following code to install or update `urbnthemes`:\n\n``` \ninstall.packages(\"remotes\")\nremotes::install_github(\"UrbanInstitute/urbnthemes\")\n```\n\nRun the following code at the top of each script:\n\n``` \nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n```\n\n### Installing Lato {#installing_lato}\n\nYour Urban computer may not have the Lato font installed. If it is not installed, please install the free [Lato font from Google](https://www.google.com/fonts/specimen/Lato). Below are step-by-step instructions:\n\n1) Download the [Lato font](https://www.google.com/fonts/specimen/Lato) (as a zip file).\n2) Unzip the file on your computer.\n3) For each `.ttf` file in the unzipped `Lato/` folder, double-click the file and click `Install` (on Windows) or `Install Font` (on Mac).\n4) Import and register Lato into R by running `urbnthemes::lato_import()` in the console once. Be patient as this may take a few minutes!\n5) To confirm installation, run `urbnthemes::lato_test()`. If this is successful you're done and Lato will automatically be used when creating plots with `library(urbnthemes)`. You only need to install Lato once per computer.\n\nWaffle charts with glyphs require fontawesome. `fontawesome_test()` and `fontawesome_install()` are the fontawesome versions of the above functions. Be sure to install fontawesome from [here](https://github.com/hrbrmstr/waffle/tree/master/inst/fonts) first.\n\n### Grammar of Graphics and Conventions\n\nHadley Wickham's ggplot2 is based on Leland Wilkinson's [*The Grammar of Graphics*](https://www.amazon.com/Grammar-Graphics-Statistics-Computing/dp/0387245448) and Wickham's [*A Layered Grammar of Graphics*](http://vita.had.co.nz/papers/layered-grammar.html). The layered grammar of graphics is a structured way of thinking about the components of a plot, which then lend themselves to the simple structure of ggplot2.\n\n-   **Data** are what are visualized in a plot and **mappings** are directions for how data are mapped in a plot in a way that can be perceived by humans.\\\n-   **Geoms** are representations of the actual data like points, lines, and bars.\n-   **Stats** are statistical transformations that represent summaries of the data like histograms.\n-   **Scales** map values in the data space to values in the aesthetic space. 
Scales draw legends and axes.\n-   **Coordinate Systems** describe how geoms are mapped to the plane of the graphic.\\\n-   **Facets** break the data into meaningful subsets like small multiples.\n-   **Themes** control the finer points of a plot such as fonts, font sizes, and background colors.\n\nMore information: [ggplot2: Elegant Graphics for Data Analysis](https://www.amazon.com/ggplot2-Elegant-Graphics-Data-Analysis/dp/0387981403)\n\n### Tips and Tricks\n\n-   `ggplot2` expects data to be in data frames or tibbles. It is preferable for the data frames to be \"tidy\" with each variable as a column, each observation as a row, and each observational unit as a separate table. `dplyr` and `tidyr` contain concise and effective tools for \"tidying\" data.\n\n-   R allows function arguments to be called explicitly by name and implicitly by position. The coding examples in this guide only contain named arguments for clarity.\n\n-   Graphics will sometimes render differently on different operating systems. This is because anti-aliasing is activated in R on Mac and Linux but not activated in R on Windows. This won't be an issue once graphics are saved.\n\n-   Continuous x-axes have ticks. Discrete x-axes do not have ticks. Use `remove_ticks()` to remove ticks.\n\n## Bar Plots\n\n------------------------------------------------------------------------\n\n### One Color\n\n```{r barplots}\nmtcars %>%\n  count(cyl) %>%\n  ggplot(mapping = aes(x = factor(cyl), y = n)) +\n  geom_col() +\n  geom_text(mapping = aes(label = n), vjust = -1) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n  labs(x = \"Cylinders\",\n       y = NULL) +\n  remove_ticks() +\n  remove_axis() \n```\n\n### One Color (Rotated)\n\nThis example introduces `coord_flip()` and `remove_axis(axis = \"x\", flip = TRUE)`. `remove_axis()` is from `library(urbnthemes)` and creates a custom theme for rotated bar plots.\n\n```{r barplot-rotated}\nmtcars %>%\n  count(cyl) %>%\n  ggplot(mapping = aes(x = factor(cyl), y = n)) +\n  geom_col() +\n  geom_text(mapping = aes(label = n), hjust = -1) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n  labs(x = \"Cylinders\",\n       y = NULL) + \n  coord_flip() +\n  remove_axis(axis = \"x\", flip = TRUE)\n```\n\n### Three Colors\n\nThis is identical to the first bar plot except colors and a legend are added with `fill = cyl`. Turning `x` into a factor with `factor(cyl)` skips 5 and 7 on the x-axis. Adding `fill = cyl` without `factor()` would have created a continuous color scheme and legend.\n\n```{r 3-color-barplot}\nmtcars %>%\n  mutate(cyl = factor(cyl)) %>%\n  count(cyl) %>%\n  ggplot(mapping = aes(x = cyl, y = n, fill = cyl)) +\n  geom_col() +\n  geom_text(mapping = aes(label = n), vjust = -1) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n  labs(x = \"Cylinders\",\n       y = NULL) +\n  remove_ticks() +\n  remove_axis()\n```\n\n### Stacked Bar Plot\n\nAn additional aesthetic can easily be added to bar plots by adding `fill = categorical variable` to the mapping. 
Here, transmission type subsets each bar showing the count of cars with different numbers of cylinders.\n\n```{r stacked-bar-plot}\nmtcars %>%\n  mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n         cyl = factor(cyl)) %>% \n  group_by(am) %>%\n  count(cyl) %>%\n  group_by(cyl) %>%\n  arrange(desc(am)) %>%\n  mutate(label_height = cumsum(n)) %>%\n  ggplot() +\n  geom_col(mapping = aes(x = cyl, y = n, fill = am)) +\n  geom_text(aes(x = cyl, y = label_height - 0.5, label = n, color = am)) +\n  scale_color_manual(values = c(\"white\", \"black\")) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n  labs(x = \"Cylinders\",\n       y = NULL) + \n  remove_ticks() +\n  remove_axis() +\n  guides(color = \"none\")\n```\n\n### Stacked Bar Plot With Position = Fill\n\nThe previous examples used `geom_col()`, which takes a y value for bar height. This example uses `geom_bar()`, which counts the observations in each group to generate bar heights. In this example, `position = \"fill\"` in `geom_bar()` changes the y-axis from count to the proportion of each bar.\n\n```{r stacked-bar-plot-fill}\nmtcars %>%\n  mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n         cyl = factor(cyl)) %>% \n  ggplot() +\n  geom_bar(mapping = aes(x = cyl, fill = am), position = \"fill\") +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.1)), labels = scales::percent) +\n  labs(x = \"Cylinders\",\n       y = NULL) + \n  remove_ticks() +\n  guides(color = \"none\")\n```\n\n### Dodged Bar Plot\n\nSubsetted bar charts in ggplot2 are stacked by default. `position = \"dodge\"` in `geom_col()` expands the bar chart so the bars appear next to each other.\n\n```{r dodged-bar-plot}\nmtcars %>%\n  mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n         cyl = factor(cyl)) %>%\n  group_by(am) %>%\n  count(cyl) %>%\n  ggplot(mapping = aes(cyl, y = n, fill = factor(am))) +\n  geom_col(position = \"dodge\") +\n  geom_text(aes(label = n), position = position_dodge(width = 0.7), vjust = -1) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n  labs(x = \"Cylinders\",\n       y = NULL) + \n  remove_ticks() +\n  remove_axis()\n```\n\n### Lollipop plot/Cleveland dot plot {.tabset}\n\nLollipop plots and Cleveland dot plots are minimalist alternatives to bar plots. The key to both plots is to order the data based on the continuous variable using `arrange()` and then turn the discrete variable into a factor with the ordered levels of the continuous variable using `mutate()`. 
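For example, the ordering idiom by itself looks like this (a minimal sketch using `mtcars`; the `model` column comes from the row names):\n\n``` \nmtcars %>%\n  rownames_to_column(\"model\") %>%                # car names as a column\n  arrange(mpg) %>%                               # sort by the continuous variable\n  mutate(model = factor(model, levels = model))  # freeze that order in the factor\n```\n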
This step \"stores\" the order of the data.\n\n#### Lollipop plot\n\n```{r lollipop-plot, fig.height = 5}\nmtcars %>%\n\trownames_to_column(\"model\") %>%\n\tarrange(mpg) %>%\n\tmutate(model = factor(model, levels = .$model)) %>%\n\tggplot(aes(mpg, model)) +\n\t\tgeom_segment(aes(x = 0, xend = mpg, y = model, yend = model)) +\t\n\t\tgeom_point() +\n\t\tscale_x_continuous(expand = expansion(mult = c(0, 0)), limits = c(0, 40)) +\n\t\tlabs(x = NULL, \n\t\t\t\t y = \"Miles Per Gallon\")\n```\n\n#### Cleveland dot plot\n\n```{r cleveland-dot-plot, fig.height = 5}\nmtcars %>%\n\trownames_to_column(\"model\") %>%\n\tarrange(mpg) %>%\n\tmutate(model = factor(model, levels = .$model)) %>%\n\tggplot(aes(mpg, model)) +\n\t\tgeom_point() +\n\t\tscale_x_continuous(expand = expansion(mult = c(0, 0)), limits = c(0, 40)) +\n\t\tlabs(x = NULL, \n\t\t\t\t y = \"Miles Per Gallon\")\n```\n\n### Dumbell plot\n\n## Scatter Plots\n\n------------------------------------------------------------------------\n\n### One Color Scatter Plot\n\nScatter plots are useful for showing relationships between two or more variables. Use `scatter_grid()` from `library(urbnthemes)` to easily add vertical grid lines for scatter plots.\n\n```{r one-color-scatter-plot}\nmtcars %>%\n ggplot(mapping = aes(x = wt, y = mpg)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:8 * 5) +\n labs(x = \"Weight (thousands of pounds)\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n### High-Density Scatter Plot with Transparency\n\nLarge numbers of observations can sometimes make scatter plots tough to interpret because points overlap. Adding `alpha =` with a number between 0 and 1 adds transparency to points and clarity to plots. Now it's easy to see that jewelry stores are probably rounding up but not rounding down carats!\n\n```{r alpha-scatter-plot}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 20000),\n breaks = 0:4 * 5000,\n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n```\n\n### Hex Scatter Plot\n\nSometimes transparency isn't enough to bring clarity to a scatter plot with many observations. As n increases into the hundreds of thousands and even millions, `geom_hex` can be one of the best ways to display relationships between two variables.\n\n```{r scatter-plot-hex}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_hex(mapping = aes(fill = after_stat(count))) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 20000),\n breaks = 0:4 * 5000,\n labels = scales::dollar) +\n\tscale_fill_gradientn(labels = scales::comma) + \n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid() +\n theme(legend.position = \"right\",\n legend.direction = \"vertical\")\n```\n\n### Scatter Plots With Random Noise {.tabset}\n\nSometimes scatter plots have many overlapping points but a reasonable number of observations. `geom_jitter` adds a small amount of random noise so points are less likely to overlap. `width` and `height` control the amount of noise that is added. 
In the following before-and-after, notice how many more points are visible after adding jitter.\n\n#### Before\n\n```{r before-scatter-plot}\nmpg %>%\n  ggplot(mapping = aes(x = displ, y = cty)) +\n  geom_point() +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(0, 8),\n                     breaks = 0:8) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n                     limits = c(0, 40),\n                     breaks = 0:4 * 10) +\n  labs(x = \"Displacement\",\n       y = \"City MPG\") +\n  scatter_grid()\n```\n\n#### After\n\n```{r jitter-plot}\nset.seed(2017)\nmpg %>%\n  ggplot(mapping = aes(x = displ, y = cty)) +\n  geom_jitter() +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(0, 8),\n                     breaks = 0:8) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n                     limits = c(0, 40),\n                     breaks = 0:4 * 10) +\n  labs(x = \"Displacement\",\n       y = \"City MPG\") +\n  scatter_grid()\n```\n\n### Scatter Plots with Varying Point Size\n\nWeights and populations can be mapped in scatter plots to the size of the points. Here, the number of households in each state is mapped to the size of each point using `aes(size = hhpop)`. Note: `ggplot2::geom_point()` is used instead of `geom_point()`.\n\n```{r geom_point-size, fig.height = 5}\nurbnmapr::statedata %>%\n  ggplot(mapping = aes(x = medhhincome, y = horate)) +\n  ggplot2::geom_point(mapping = aes(size = hhpop), alpha = 0.3) +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(30000, 80000),\n                     breaks = 3:8 * 10000,\n                     labels = scales::dollar) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n                     limits = c(0, 0.8),\n                     breaks = 0:4 * 0.2) +\n  scale_radius(range = c(3, 15),\n               breaks = c(2500000, 7500000, 12500000), \n               labels = scales::comma) +\n  labs(x = \"Household income\",\n       y = \"Homeownership rate\") +\n  scatter_grid() +\n\ttheme(plot.margin = margin(r = 20))\n```\n\n### Scatter Plots with Color\n\nA third aesthetic can be added to scatter plots. Here, color signifies the number of cylinders in each car. 
Before `ggplot()` is called, a text version of `cyl` is created with `mutate()` from `library(dplyr)` and the piping operator `%>%`.\n\n```{r filled-scatter-plot}\nmtcars %>%\n  mutate(cyl = paste(cyl, \"cylinders\")) %>%\n  ggplot(aes(x = wt, y = mpg, color = cyl)) +\n  geom_point() +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(0, 6),\n                     breaks = 0:6) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n                     limits = c(0, 40),\n                     breaks = 0:8 * 5) +\n  labs(x = \"Weight (thousands of pounds)\",\n       y = \"Miles per gallon (MPG)\") +\n  scatter_grid()\n```\n\n## Line Plots\n\n------------------------------------------------------------------------\n\n```{r line-plots}\neconomics %>%\n  ggplot(mapping = aes(x = date, y = unemploy)) +\n  geom_line() +\n  scale_x_date(expand = expansion(mult = c(0.002, 0)), \n               breaks = \"10 years\",\n               limits = c(as.Date(\"1961-01-01\"), as.Date(\"2020-01-01\")),\n               date_labels = \"%Y\") +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n                     breaks = 0:4 * 4000,\n                     limits = c(0, 16000),\n                     labels = scales::comma) +\n  labs(x = \"Year\", \n       y = \"Number Unemployed (1,000s)\")\n```\n\n### Line Plots With Multiple Lines\n\n```{r multiple-line-charts1}\nlibrary(gapminder)\n\ngapminder %>%\n  filter(country %in% c(\"Australia\", \"Canada\", \"New Zealand\")) %>%\n  mutate(country = factor(country, levels = c(\"Canada\", \"Australia\", \"New Zealand\"))) %>%\n  ggplot(aes(year, gdpPercap, color = country)) +\n  geom_line() +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     breaks = c(1952 + 0:12 * 5), \n                     limits = c(1952, 2007)) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n                     breaks = 0:8 * 5000,\n                     labels = scales::dollar, \n                     limits = c(0, 40000)) +\n  labs(x = \"Year\",\n       y = \"Per capita GDP (US dollars)\")\n```\n\nPlotting more than one variable can be useful for seeing the relationship of variables over time, but it takes a small amount of data munging.\n\nThis is because `ggplot2` wants data in a \"long\" format instead of a \"wide\" format for line plots with multiple lines. `gather()` and `spread()` from the `tidyr` package make switching back-and-forth between \"long\" and \"wide\" painless. Essentially, variable titles go into \"key\" and variable values go into \"value\". Then `ggplot2` turns the different levels of the key variable (here, the four stock indices) into colors.\n\n```{r multiple-line-charts2}\nas_tibble(EuStockMarkets) %>%\n\tmutate(date = time(EuStockMarkets)) %>%\n\tgather(key = \"key\", value = \"value\", -date) %>%\n\tggplot(mapping = aes(x = date, y = value, color = key)) +\n\tgeom_line() +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(1991, 1999), \n                     breaks = c(1991, 1993, 1995, 1997, 1999)) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n                     breaks = 0:4 * 2500,\n                     labels = scales::dollar, \n                     limits = c(0, 10000)) + \n\tlabs(x = \"Date\",\n\t\t\t y = \"Value\")\n```\n\n### Step plot\n\n`geom_line()` connects coordinates with the shortest possible straight line. Sometimes step plots are necessary because y values don't change between coordinates. 
For example, the upper-bound of the Federal Funds Rate is set at regular intervals and remains constant until it is changed.\n\n```{r step-plot}\n# downloaded from FRED on 2018-12-06\n\n# https://fred.stlouisfed.org/series/DFEDTARU\n\nfed_fund_rate <- read_csv(\n  \"date, fed_funds_rate\n  2014-01-01,0.0025\n  2015-12-16,0.0050\n  2016-12-14,0.0075\n  2017-03-16,0.0100\n  2017-06-15,0.0125\n  2017-12-14,0.0150\n  2018-03-22,0.0175\n  2018-06-14,0.0200\n  2018-09-27,0.0225\n  2018-12-06,0.0225\")\n\nfed_fund_rate %>%\n  ggplot(mapping = aes(x = date, y = fed_funds_rate)) + \n  geom_step() +\n  scale_x_date(expand = expansion(mult = c(0.002, 0)), \n               breaks = \"1 year\",\n               limits = c(as.Date(\"2014-01-01\"), as.Date(\"2019-01-01\")),\n               date_labels = \"%Y\") +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n                     breaks = c(0, 0.01, 0.02, 0.03),\n                     limits = c(0, 0.03),\n                     labels = scales::percent) + \n\tlabs(x = \"Date\",\n\t\t\t y = \"Upper-bound of the Federal Funds Rate\")\n```\n\n### Path plot\n\nThe Beveridge curve is a macroeconomic plot that displays a relationship between the unemployment rate and the vacancy rate. Movements along the curve indicate changes in the business cycle and horizontal shifts of the curve suggest structural changes in the labor market.\n\nLines in Beveridge curves do not monotonically move from left to right. Therefore, it is necessary to use `geom_path()`.\n\n```{r, path-plot}\n# seasonally-adjusted, quarterly vacancy rate - JOLTS\n# seasonally-adjusted, quarterly unemployment rate - CPS\n\n# pulled from FRED on April 11, 2018. \n\nlibrary(ggrepel)\n\nbeveridge <- read_csv(\n\t\"quarter, vacancy_rate, unemployment_rate\n\t2006-01-01,0.0310,0.0473\n\t2006-04-01,0.0316,0.0463\n\t2006-07-01,0.0313,0.0463\n\t2006-10-01,0.0310,0.0443\n\t2007-01-01,0.0323,0.0450\n\t2007-04-01,0.0326,0.0450\n\t2007-07-01,0.0316,0.0466\n\t2007-10-01,0.0293,0.0480\n\t2008-01-01,0.0286,0.0500\n\t2008-04-01,0.0280,0.0533\n\t2008-07-01,0.0253,0.0600\n\t2008-10-01,0.0220,0.0686\n\t2009-01-01,0.0196,0.0826\n\t2009-04-01,0.0180,0.0930\n\t2009-07-01,0.0176,0.0963\n\t2009-10-01,0.0180,0.0993\n\t2010-01-01,0.0196,0.0983\n\t2010-04-01,0.0220,0.0963\n\t2010-07-01,0.0216,0.0946\n\t2010-10-01,0.0220,0.0950\n\t2011-01-01,0.0226,0.0903\n\t2011-04-01,0.0236,0.0906\n\t2011-07-01,0.0250,0.0900\n\t2011-10-01,0.0243,0.0863\n\t2012-01-01,0.0270,0.0826\n\t2012-04-01,0.0270,0.0820\n\t2012-07-01,0.0266,0.0803\n\t2012-10-01,0.0260,0.0780\n\t2013-01-01,0.0276,0.0773\n\t2013-04-01,0.0280,0.0753\n\t2013-07-01,0.0280,0.0723\n\t2013-10-01,0.0276,0.0693\n\t2014-01-01,0.0290,0.0666\n\t2014-04-01,0.0323,0.0623\n\t2014-07-01,0.0326,0.0610\n\t2014-10-01,0.0330,0.0570\n\t2015-01-01,0.0350,0.0556\n\t2015-04-01,0.0366,0.0540\n\t2015-07-01,0.0373,0.0510\n\t2015-10-01,0.0360,0.0500\n\t2016-01-01,0.0386,0.0493\n\t2016-04-01,0.0383,0.0486\n\t2016-07-01,0.0383,0.0493\n\t2016-10-01,0.0363,0.0473\n\t2017-01-01,0.0366,0.0466\n\t2017-04-01,0.0390,0.0433\n\t2017-07-01,0.0406,0.0430\n\t2017-10-01,0.0386,0.0410\")\n\nlabels <- beveridge %>%\n  filter(lubridate::month(quarter) == 1)\n\nbeveridge %>%\n\tggplot() +\n\tgeom_path(mapping = aes(x = unemployment_rate, y = vacancy_rate), alpha = 0.5) +\n  geom_point(data = labels, mapping = aes(x = unemployment_rate, y = vacancy_rate)) +\n  geom_text_repel(data = labels, mapping = aes(x = unemployment_rate, y = vacancy_rate, label = lubridate::year(quarter))) + \n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(0.04, 0.1),\n                     labels = scales::percent) +\n  
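# y-axis: same percent formatting as the x-axis, over a fixed 0-5% range\n  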
scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = c(0, 0.01, 0.02, 0.03, 0.04, 0.05),\n limits = c(0, 0.05),\n labels = scales::percent) + \n\tlabs(x = \"Seasonally-adjusted unemployment rate\",\n\t\t\t y = \"Seasonally-adjusted vacancy rate\") + \n scatter_grid()\n```\n\n### Slope plots\n\n```{r slope-plot, fig.height = 5}\n# https://www.bls.gov/lau/\nlibrary(ggrepel)\n\nunemployment <- tibble(\n\ttime = c(\"October 2009\", \"October 2009\", \"October 2009\", \"August 2017\", \"August 2017\", \"August 2017\"),\n\trate = c(7.4, 7.1, 10.0, 3.9, 3.8, 6.4),\n\tstate = c(\"Maryland\", \"Virginia\", \"Washington, D.C.\", \"Maryland\", \"Virginia\", \"Washington, D.C.\")\n)\n\nlabel <- tibble(label = c(\"October 2009\", \"August 2017\"))\noctober <- filter(unemployment, time == \"October 2009\")\naugust <- filter(unemployment, time == \"August 2017\")\n\nunemployment %>%\n\tmutate(time = factor(time, levels = c(\"October 2009\", \"August 2017\")),\n\t state = factor(state, levels = c(\"Washington, D.C.\", \"Maryland\", \"Virginia\"))) %>%\n\tggplot() + \n\tgeom_line(aes(time, rate, group = state, color = state), show.legend = FALSE) +\n\tgeom_point(aes(x = time, y = rate, color = state)) +\n\tlabs(subtitle = \"Unemployment Rate\") +\n\ttheme(axis.ticks.x = element_blank(),\n\t\t\t\taxis.title.x = element_blank(),\n\t\t\t\taxis.ticks.y = element_blank(),\n axis.title.y = element_blank(), \n axis.text.y = element_blank(),\n\t\t\t\tpanel.grid.major.y = element_blank(),\n panel.grid.minor.y = element_blank(),\n panel.grid.major.x = element_blank(),\n\t\t\t\taxis.line = element_blank()) +\n\tgeom_text_repel(data = october, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = -0.06) + \n\tgeom_text_repel(data = august, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = 0.06)\n```\n\n## Univariate\n\n------------------------------------------------------------------------\n\nThere are a number of ways to explore the distributions of univariate data in R. Some methods, like strip charts, show all data points. Other methods, like the box and whisker plot, show selected data points that communicate key values like the median and 25th percentile. Finally, some methods don't show any of the underlying data but calculate density estimates. Each method has advantages and disadvantages, so it is worthwhile to understand the different forms. For more information, read [40 years of boxplots](http://vita.had.co.nz/papers/boxplots.pdf) by Hadley Wickham and Lisa Stryjewski.\n\n### Strip Chart\n\nStrip charts, the simplest univariate plot, show the distribution of values along one axis. Strip charts work best with variables that have plenty of variation. If not, the points tend to cluster on top of each other. 
Even if the variable has plenty of variation, it is often important to add transparency to the points with `alpha =` so overlapping values are visible.\n\n```{r stripchart, fig.height=2}\nmsleep %>%\n  ggplot(aes(x = sleep_total, y = factor(1))) +\n  geom_point(alpha = 0.2, size = 5) +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(0, 25), \n                     breaks = 0:5 * 5) +\n  scale_y_discrete(labels = NULL) +\n  labs(title = \"Total Sleep Time of Different Mammals\",\n       x = \"Total sleep time (hours)\",\n       y = NULL) +\n  theme(axis.ticks.y = element_blank())\n```\n\n### Strip Chart with Highlighting\n\nBecause strip charts show all values, they are useful for showing where selected points lie in the distribution of a variable. The clearest way to do this is by adding `geom_point()` twice with `filter()` in the data argument. This way, the highlighted values show up on top of unhighlighted values.\n\n```{r stripchart-with-highlighting, fig.height=2}\nggplot() +\n  geom_point(data = filter(msleep, name != \"Red fox\"), \n             aes(x = sleep_total, \n                 y = factor(1)),\n             alpha = 0.2, \n             size = 5,\n  \t\t\t\t\t color = \"grey50\") +\n  geom_point(data = filter(msleep, name == \"Red fox\"),\n             aes(x = sleep_total, \n                 y = factor(1), \n                 color = name),\n             alpha = 0.8,\n             size = 5) +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(0, 25), \n                     breaks = 0:5 * 5) + \n  scale_y_discrete(labels = NULL) +\n  labs(title = \"Total Sleep Time of Different Mammals\",\n       x = \"Total sleep time (hours)\",\n       y = NULL) +\n  guides(color = guide_legend(title = NULL)) +\n  theme(axis.ticks.y = element_blank())\n```\n\n### Subsetted Strip Chart\n\nAdd a y variable to see the distributions of the continuous variable in subsets of a categorical variable.\n\n```{r subsetted-stripchart, fig.height=3}\nlibrary(forcats)\n\nmsleep %>%\n  filter(!is.na(vore)) %>%\n  mutate(vore = fct_recode(vore, \n                           \"Insectivore\" = \"insecti\",\n                           \"Omnivore\" = \"omni\", \n                           \"Herbivore\" = \"herbi\", \n                           \"Carnivore\" = \"carni\"\n                           )) %>%\n  ggplot(aes(x = sleep_total, y = vore)) +\n  geom_point(alpha = 0.2, size = 5) +\n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n                     limits = c(0, 25), \n                     breaks = 0:5 * 5) + \n  labs(title = \"Total Sleep Time of Different Mammals by Diet\",\n       x = \"Total sleep time (hours)\",\n       y = NULL) +\n  theme(axis.ticks.y = element_blank())\n```\n\n### Beeswarm Plots\n\nBeeswarm plots are a variation of strip charts that show the distribution of data without the points overlapping.\n\n```{r beeswarm}\nlibrary(ggbeeswarm)\n\ntxhousing %>%\n\tfilter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>% \n  ggplot(aes(x = median, y = city)) +\n  geom_beeswarm(alpha = 0.2, size = 5) + \n\tscale_x_continuous(labels = scales::dollar) +\n  labs(title = \"Household Sale Price by City\",\n       x = \"Sale Price\",\n       y = NULL) +\n  theme(axis.ticks.y = element_blank())\n```\n\n### Histograms\n\nHistograms divide the distribution of a variable into n equal-sized bins and then count and display the number of observations in each bin. Histograms are sensitive to bin width. 
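For example, the same variable tells different stories at 10 bins and at 100 bins (a quick unstyled sketch):\n\n``` \nggplot(diamonds, aes(x = depth)) + geom_histogram(bins = 10)   # coarse\nggplot(diamonds, aes(x = depth)) + geom_histogram(bins = 100)  # fine\n```\n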
As `?geom_histogram` notes, \"You should always override \\[the default binwidth\\] value, exploring multiple widths to find the best to illustrate the stories in your data.\"\n\n```{r histogram}\nggplot(data = diamonds, mapping = aes(x = depth)) + \n geom_histogram(bins = 100) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 100)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), labels = scales::comma) +\n labs(x = \"Depth\",\n y = \"Count\")\n```\n\n### Boxplots\n\nBoxplots were invented in the 1970s by John Tukey[^1]. Instead of showing the underlying data or binned counts of the underlying data, they focus on important values like the 25th percentile, median, and 75th percentile.\n\n[^1]: Wickham, H., & Stryjewski, L. (2011). 40 years of boxplots.\n\n```{r box-plot}\nInsectSprays %>%\n ggplot(mapping = aes(x = spray, y = count)) +\n geom_boxplot() +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Type of insect spray\",\n y = \"Number of dead insects\") +\n remove_ticks()\n```\n\n### Smoothed Kernel Density Plots\n\nContinuous variables with smooth distributions are sometimes better represented with smoothed kernel density estimates than histograms or boxplots. `geom_density()` computes and plots a kernel density estimate. Notice the lumps around integers and halves in the following distribution because of rounding.\n\n```{r kernel-density-plot}\ndiamonds %>%\n ggplot(mapping = aes(carat)) +\n geom_density(color = NA) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, NA)) +\n\tscale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Carat\",\n y = \"Density\")\n```\n\n```{r kernel-density-plot-filled}\ndiamonds %>%\n mutate(cost = ifelse(price > 5500, \"More than $5,500 +\", \"$0 to $5,500\")) %>%\n ggplot(mapping = aes(carat, fill = cost)) +\n geom_density(alpha = 0.25, color = NA) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, NA)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Carat\",\n y = \"Density\")\n```\n\n### Ridgeline Plots\n\nRidgeline plots are partially overlapping smoothed kernel density plots faceted by a categorical variable that pack a lot of information into one elegant plot.\n\n```{r ridgeline-plots}\nlibrary(ggridges)\n\nggplot(diamonds, mapping = aes(x = price, y = cut)) +\n\tgeom_density_ridges(fill = \"#1696d2\") +\n labs(x = \"Price\",\n y = \"Cut\")\n```\n\n### Violin Plots\n\nViolin plots are symmetrical displays of smooth kernel density plots.\n\n```{r violin-plot}\nInsectSprays %>%\n ggplot(mapping = aes(x = spray, y = count, fill = spray)) +\n geom_violin(color = NA) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Type of insect spray\",\n y = \"Number of dead insects\") +\n remove_ticks()\n```\n\n### Bean Plot\n\nIndividual outliers and important summary values are not visible in violin plots or smoothed kernel density plots. 
Bean plots, [created by Peter Kampstra in 2008](https://www.jstatsoft.org/article/view/v028c01), are violin plots with data shown as small lines in a one-dimensional strip plot and larger lines for the mean.\n\n```{r beanplot}\nmsleep %>%\n  filter(!is.na(vore)) %>%\n  mutate(vore = fct_recode(vore, \n                           \"Insectivore\" = \"insecti\",\n                           \"Omnivore\" = \"omni\", \n                           \"Herbivore\" = \"herbi\", \n                           \"Carnivore\" = \"carni\"\n                           )) %>%\n  ggplot(aes(x = vore, y = sleep_total, fill = vore)) +\n  stat_summary(fun = \"mean\",\n               colour = \"black\", \n               size = 30,\n               shape = 95,\n               geom = \"point\") +\n  geom_violin(color = NA) +\n  geom_jitter(width = 0,\n              height = 0.05,\n              alpha = 0.4,\n              shape = \"-\",\n              size = 10,\n  \t\t\t\t\t\tcolor = \"grey50\") +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.2))) + \n  labs(x = NULL,\n       y = \"Total sleep time (hours)\") +\n  theme(legend.position = \"none\") +\n  remove_ticks()\n```\n\n## Area Plot\n\n------------------------------------------------------------------------\n\n### Stacked Area\n\n```{r area-plot-stack}\ntxhousing %>%\n  filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>%\n  group_by(city, year) %>%\n  summarize(sales = sum(sales)) %>%\n  ggplot(aes(x = year, y = sales, fill = city)) +\n  geom_area(position = \"stack\") +\n  scale_x_continuous(expand = expansion(mult = c(0, 0)),\n                     limits = c(2000, 2015),\n                     breaks = 2000 + 0:15) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n                     labels = scales::comma) +\n  labs(x = \"Year\",\n       y = \"Home sales\")\n```\n\n### Filled Area\n\n```{r area-plot-fill}\ntxhousing %>%\n  filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>%\n  group_by(city, year) %>%\n  summarize(sales = sum(sales)) %>%\n  ggplot(aes(x = year, y = sales, fill = city)) +\n  geom_area(position = \"fill\") +\n  scale_x_continuous(expand = expansion(mult = c(0, 0)),\n                     limits = c(2000, 2015),\n                     breaks = 2000 + 0:15) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.02)),\n                     breaks = c(0, 0.25, 0.5, 0.75, 1),\n                     labels = scales::percent) +\n  labs(x = \"Year\",\n       y = \"Home sales\")\n```\n\n## Sankey Plot\n\n------------------------------------------------------------------------\n\nSankey plots visualize flows from one set of variables to another. This can be useful for showing outcomes from the start of a program to the end. You'll need to install the `ggsankey` package to create Sankey plots in R. In this example, I make a dummy data set of housing status prior to program start and at exit to show the flow of people between outcomes. A key step is to transform your data set using the `make_long` function from the package. 
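A quick sketch of that reshaping step (assuming a data frame `df` with the `entry_status` and `exit_status` columns used in the chunk below):\n\n``` \ndf %>%\n  make_long(entry_status, exit_status)\n# returns one row per node with columns x, node, next_x, and next_node\n```\n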
This creates a data frame that specifies each of the initial nodes and how they flow into the next stage.\n\n```{r}\n# install and load the ggsankey package\nremotes::install_github(\"davidsjoberg/ggsankey\")\nlibrary(ggsankey)\n\n# create a dummy dataset of housing status\ndf <- tibble(entry_status = c(rep(\"Housed\", 7), rep(\"Unhoused\", 15), rep(\"Staying w/ Family\", 8)), \n             exit_status = c(rep(\"Housed\", 15), rep(\"Unhoused\", 2), rep(\"Staying w/ Family\", 13))) %>% \n\t# transform the data frame into the proper format for the sankey plot\n  make_long(entry_status, exit_status) %>% \n\t# recode the labels to be cleaner in the plot \n  mutate(x = recode(x, entry_status = \"Prior Housing Status\", exit_status = \"Exit Housing Status\"),\n         next_x = recode(next_x, entry_status = \"Prior Housing Status\", exit_status = \"Exit Housing Status\"))\n\n# create sankey plot\nggplot(df, aes(x = x, \n               next_x = next_x, \n               node = node, \n               next_node = next_node,\n               fill = factor(node), \n               label = node)) +\n  geom_sankey(flow.alpha = 0.5, node.color = 1, show.legend = FALSE) +\n  # add labels to plot and style\n  geom_sankey_label(size = 3.5, color = 1, fill = \"white\") +\n  theme_sankey(base_size = 16) +\n  labs(x = NULL)\n```\n\n## Heat Map\n\n------------------------------------------------------------------------\n\n```{r heat-map}\nlibrary(fivethirtyeight)\n\nbad_drivers %>%\n  filter(state %in% c(\"Maine\", \"New Hampshire\", \"Vermont\", \"Massachusetts\", \"Connecticut\", \"New York\")) %>%\n  mutate(`Number of\\nDrivers` = scale(num_drivers),\n         `Percent\\nSpeeding` = scale(perc_speeding),\n         `Percent\\nAlcohol` = scale(perc_alcohol),\n         `Percent Not\\nDistracted` = scale(perc_not_distracted),\n         `Percent No\\nPrevious` = scale(perc_no_previous),\n         state = factor(state, levels = rev(state))\n         ) %>%\n  select(-insurance_premiums, -losses, -(num_drivers:losses)) %>%\n  gather(`Number of\\nDrivers`:`Percent No\\nPrevious`, key = \"variable\", value = \"SD's from Mean\") %>%\n  ggplot(aes(variable, state)) +\n  geom_tile(aes(fill = `SD's from Mean`)) +\n  labs(x = NULL,\n       y = NULL) + \n  scale_fill_gradientn() +\n  theme(legend.position = \"right\",\n        legend.direction = \"vertical\",\n        axis.line.x = element_blank(),\n        panel.grid.major.y = element_blank()) +\n  remove_ticks()\n# https://learnr.wordpress.com/2010/01/26/ggplot2-quick-heatmap-plotting/\n```\n\n## Faceting and Small Multiples\n\n------------------------------------------------------------------------\n\n### facet_wrap()\n\nR's faceting system is a powerful way to make \"small multiples\".\n\nSome edits to the theme may be necessary depending upon how many rows and columns are in the plot.\n\n```{r small-multiples, fig.height=2}\ndiamonds %>%\n  ggplot(mapping = aes(x = carat, y = price)) +\n  geom_point(alpha = 0.05) +\n  facet_wrap(~cut, ncol = 5) +\n  scale_x_continuous(expand = expansion(mult = c(0, 0)),\n                     limits = c(0, 6)) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0)),\n                     limits = c(0, 20000), \n                     labels = scales::dollar) +\n  labs(x = \"Carat\",\n       y = \"Price\") +\n  scatter_grid()\n```\n\n### facet_grid()\n\n```{r faceting, fig.height=7}\ndiamonds %>%\n  filter(color %in% c(\"D\", \"E\", \"F\", \"G\")) %>%\n  ggplot(mapping = aes(x = carat, y = price)) +\n  geom_point(alpha = 0.05) +\n  facet_grid(color ~ cut) +\n  scale_x_continuous(expand = expansion(mult = c(0, 0)),\n                     limits = c(0, 4)) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0)),\n                     limits = c(0, 20000), \n                     labels = scales::dollar) +\n  labs(x = \"Carat\",\n       y = \"Price\") +\n  
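# extra panel spacing keeps the 4 x 5 grid of facets readable\n  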
theme(panel.spacing = unit(20L, \"pt\")) +\n  scatter_grid()\n```\n\n## Smoothers\n\n------------------------------------------------------------------------\n\n`geom_smooth()` fits and plots models to data with two or more dimensions.\n\nUnderstanding and manipulating defaults is more important for `geom_smooth()` than other geoms because it contains a number of assumptions. `geom_smooth()` automatically uses loess for datasets with fewer than 1,000 observations and a generalized additive model with `formula = y ~ s(x, bs = \"cs\")` for datasets with 1,000 or more observations. Both default to displaying a 95% confidence interval.\n\nModels are chosen with `method =` and can be set to `lm()`, `glm()`, `gam()`, `loess()`, `rlm()`, and more. Formulas can be specified with `formula =` and `y ~ x` syntax. Plotting the standard error is toggled with `se = TRUE` and `se = FALSE`, and the confidence level is specified with `level =`. As always, more information can be seen in RStudio with `?geom_smooth()`.\n\n`geom_point()` adds a scatterplot to `geom_smooth()`. The order of the function calls is important. The function called second will be laid on top of the function called first.\n\n```{r geom_smooth}\ndiamonds %>%\n  ggplot(mapping = aes(x = carat, y = price)) +\n\tgeom_point(alpha = 0.05) +\n\tgeom_smooth(color = \"#ec008b\") +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t                   limits = c(0, 5),\n\t                   breaks = 0:5) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     limits = c(0, 20000), \n                     labels = scales::dollar) + \n  labs(x = \"Carat\",\n       y = \"Price\") +\n  scatter_grid()\n```\n\n`geom_smooth` can be subset by categorical and factor variables. This requires subgroups to have a decent number of observations and a fair amount of variability across the x-axis. Confidence intervals often widen at the ends, so special care is needed for the chart to be meaningful and readable.\n\nThis example uses loess to model highway MPG as a function of engine displacement.\n\n```{r subset-geom_smooth}\nggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = factor(cyl))) +\n\tgeom_point(alpha = 0.2) +\n\tgeom_smooth() +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t                   limits = c(0, 7),\n\t                   breaks = 0:7) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     limits = c(0, 60)) + \n\tlabs(x = \"Engine displacement\",\n\t\t\t y = \"Highway MPG\") +\n  scatter_grid()\n```\n\nThis example uses linear models of highway MPG as a function of engine displacement.\n\n```{r subset-geom-smooth-lm}\nggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = factor(cyl))) +\n\tgeom_point(alpha = 0.2) +\n\tgeom_smooth(method = \"lm\") +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t                   limits = c(0, 7),\n\t                   breaks = 0:7) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     limits = c(0, 60)) + \n\tlabs(x = \"Engine displacement\",\n\t\t\t y = \"Highway MPG\") +\n  scatter_grid()\n```\n\n## Highlighting\n\n------------------------------------------------------------------------\n\n[`library(gghighlight)`](https://yutannihilation.github.io/gghighlight/) enables the intuitive highlighting of ggplot2 plots. `gghighlight` modifies existing ggplot2 objects, so no other code needs to change. All of the highlighting is handled by the function `gghighlight()`, which can handle all types of geoms.\n\n*Warning:* R will throw an error if too many colors are highlighted because of the design of `urbnthemes`. 
Simply decrease the number of highlighted geoms to solve this issue.\n\nThere are two main ways to highlight.\n\n### Threshold\n\nThe first way to highlight is with a threshold. Add a logical test to `gghighlight()` to describe which lines should be highlighted. Here, lines with maximum change in per-capita Gross Domestic Product greater than \$35,000 are highlighted by `gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE)`.\n\n```{r gghighlight-threshold}\nlibrary(gghighlight)\nlibrary(gapminder)\n\ndata <- gapminder %>%\n  filter(continent %in% c(\"Europe\")) %>%\n  group_by(country) %>%\n  mutate(pcgpd_change = ifelse(year == 1952, 0, gdpPercap - lag(gdpPercap))) %>%\n  mutate(pcgpd_change = cumsum(pcgpd_change))\n  \ndata %>%\n  ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n  geom_line() +\n  gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE) + \n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n                     breaks = c(seq(1950, 2010, 10)),\n                     limits = c(1950, 2010)) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     breaks = 0:8 * 5000,\n                     labels = scales::dollar,\n                     limits = c(0, 40000)) +\n  labs(x = \"Year\",\n       y = \"Change in per-capita GDP (US dollars)\")\n```\n\n### Rank\n\nThe second way to highlight is by rank. Here, the countries with the five highest values for change in per-capita Gross Domestic Product are highlighted with `gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE)`.\n\n```{r gghighlight-rank}\ndata %>%\n  ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n  geom_line() +\n  gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE) + \n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n                     breaks = c(seq(1950, 2010, 10)),\n                     limits = c(1950, 2010)) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     breaks = 0:8 * 5000,\n                     labels = scales::dollar,\n                     limits = c(0, 40000)) +\n  labs(x = \"Year\",\n       y = \"Change in per-capita GDP (US dollars)\")\n```\n\n### Faceting\n\n`gghighlight()` works well with ggplot2's faceting system.\n\n```{r gghighlight-faceting}\ndata %>%\n  ggplot(aes(year, pcgpd_change, group = country)) +\n  geom_line() +\n  gghighlight(max(pcgpd_change), max_highlight = 4, use_direct_label = FALSE) + \n  scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n                     breaks = c(seq(1950, 2010, 10)),\n                     limits = c(1950, 2010)) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     breaks = 0:8 * 5000,\n                     labels = scales::dollar,\n                     limits = c(0, 40000)) +\n  labs(x = \"Year\",\n       y = \"Change in per-capita GDP (US dollars)\") +\n  facet_wrap(~ country) +\n  theme(panel.spacing = unit(20L, \"pt\"))\n```\n\n## Text and Annotation\n\n------------------------------------------------------------------------\n\nSeveral functions can be used to annotate, label, and highlight different parts of plots. `geom_text()` and `geom_text_repel()` both display variables from data frames. `annotate()`, which has several different uses, displays variables and values included in the function call.\n\n### geom_text()\n\n`geom_text()` turns text variables in data sets into geometric objects. This is useful for labeling data in plots. 
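At its simplest, `geom_text()` needs a `label` aesthetic on top of `x` and `y` (a minimal sketch):\n\n``` \nmtcars %>%\n  rownames_to_column(\"model\") %>%\n  ggplot(aes(x = wt, y = mpg, label = model)) +\n  geom_point() +\n  geom_text(nudge_y = 0.5)\n```\n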
Both functions need `x` values and `y` values to determine placement on the coordinate plane, and a text vector of labels.\n\nThis can be used to label `geom_bar()`.\n\n```{r bar-geom_text}\ndiamonds %>%\n  group_by(cut) %>%\n  summarize(price = mean(price)) %>%\n  ggplot(aes(cut, price)) +\n  geom_bar(stat = \"identity\") +\n  geom_text(aes(label = scales::dollar(price)), vjust = -1) +\n  scale_y_continuous(expand = expansion(mult = c(0, 0.2)),\n  \t\t\t\t\t\t\t\t\t labels = scales::dollar) +\n  labs(title = \"Average Diamond Price by Diamond Cut\",\n       x = \"Cut\",\n       y = \"Price\") +\n  remove_ticks()\n```\n\nIt can also be used to label points in a scatter plot.\n\nIt's rarely useful to label every point in a scatter plot. Use `filter()` to create a second data set that is subsetted and pass it into the labeling function.\n\n```{r scatterplot-geom_text}\nlabels <- mtcars %>%\n\trownames_to_column(\"model\") %>%\n\tfilter(model %in% c(\"Toyota Corolla\", \"Merc 240D\", \"Datsun 710\"))\n\nmtcars %>%\n\tggplot() +\n\tgeom_point(mapping = aes(x = wt, y = mpg)) +\n\tgeom_text(data = labels, mapping = aes(x = wt, y = mpg, label = model), nudge_x = 0.38) +\n  scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n  \t\t\t\t\t\t\t\t\t limits = c(0, 6)) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n  \t\t\t\t\t\t\t\t\t limits = c(0, 40)) + \n  labs(x = \"Weight (thousands of pounds)\",\n       y = \"Miles per gallon (MPG)\") +\n  scatter_grid()\n```\n\nText too often overlaps with other text or geoms when using `geom_text()`. `library(ggrepel)` is a `library(ggplot2)` add-on that automatically positions text so it doesn't overlap with geoms or other text. To add this functionality, install and load `library(ggrepel)` and then use `geom_text_repel()` with the same syntax as `geom_text()`.\n\n### geom_text_repel()\n\n```{r scatterplot-geom_text_repel}\nlibrary(ggrepel)\n\nlabels <- mtcars %>%\n\trownames_to_column(\"model\") %>%\n\ttop_n(5, mpg)\n\nmtcars %>%\n\tggplot(mapping = aes(x = wt, y = mpg)) +\n\tgeom_point() +\n\tgeom_text_repel(data = labels, \n\t                mapping = aes(label = model), \n\t                nudge_x = 0.38) +\n  scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n  \t\t\t\t\t\t\t\t\t limits = c(0, 6)) + \n  scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n  \t\t\t\t\t\t\t\t\t limits = c(0, 40)) + \n  labs(x = \"Weight (thousands of pounds)\",\n       y = \"Miles per gallon (MPG)\") +\n  scatter_grid()\n```\n\n### annotate()\n\n`annotate()` doesn't use data frames. Instead, it takes values for `x =` and `y =`. 
It can add text, rectangles, segments, and pointranges.\n\n```{r annotate-point}\nmsleep %>%\n filter(bodywt <= 1000) %>%\n ggplot(aes(bodywt, sleep_total)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(-10, 1000),\n \t\t\t\t\t\t\t\t\t labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 25)) + \n annotate(\"text\", x = 500, y = 12, label = \"These data suggest that heavy\\nanimals sleep less than light animals\") +\n labs(x = \"Body weight (kilograms)\",\n y = \"Sleep time (hours)\") +\n scatter_grid() \n```\n\n```{r annotate-rect}\nlibrary(AmesHousing)\n\names <- make_ames()\n\names %>%\n mutate(square_footage = Total_Bsmt_SF - Bsmt_Unf_SF + First_Flr_SF + Second_Flr_SF) %>%\n mutate(Sale_Price = Sale_Price / 1000) %>% \n ggplot(aes(square_footage, Sale_Price)) +\n geom_point(alpha = 0.2) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(-10, 12000),\n \t\t\t\t\t\t\t\t\t labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 800),\n \t\t\t\t\t\t\t\t\t labels = scales::dollar) + \n annotate(\"rect\", xmin = 6800, xmax = 11500, ymin = 145, ymax = 210, alpha = 0.1) +\n annotate(\"text\", x = 8750, y = 230, label = \"Unfinished homes\") +\n labs(x = \"Square footage\", \n y = \"Sale price (thousands)\") +\n scatter_grid() \n```\n\n## Layered Geoms\n\n------------------------------------------------------------------------\n\nGeoms can be layered in `ggplot2`. This is useful for design and analysis.\n\nIt is often useful to add points to line plots with a small number of values across the x-axis. This example from [R for Data Science](http://r4ds.had.co.nz/tidy-data.html) shows how changing the lines to grey makes the plot easier to read.\n\n### Design {.tabset}\n\n#### Before\n\n```{r layering-geoms-design}\ntable1 %>%\n\tggplot(aes(x = year, y = cases)) +\n\t\tgeom_line(aes(color = country)) +\n\t\tgeom_point(aes(color = country)) +\n\t\tscale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n\t\t                   labels = scales::comma) +\n\t\tscale_x_continuous(breaks = c(1999, 2000)) +\n\t\tlabs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n```\n\n#### After\n\n```{r layering-geoms-design-gray}\ntable1 %>%\n\tggplot(aes(year, cases)) +\n\t\tgeom_line(aes(group = country), color = \"grey50\") +\n\t\tgeom_point(aes(color = country)) +\n\t\tscale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n\t\t                   labels = scales::comma) +\n\t\tscale_x_continuous(breaks = c(1999, 2000)) +\n\t\tlabs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n```\n\n### Centroids\n\n```{r centroids}\nmpg_summary <- mpg %>%\n\tgroup_by(cyl) %>%\n\tsummarize(displ = mean(displ), cty = mean(cty))\n\nmpg %>%\n\tggplot() +\n\tgeom_point(aes(x = displ, y = cty, color = factor(cyl)), alpha = 0.5) +\n\tgeom_point(data = mpg_summary, aes(x = displ, y = cty), size = 5, color = \"#ec008b\") +\n\tgeom_text(data = mpg_summary, aes(x = displ, y = cty, label = cyl)) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)), \n                    limits = c(0, 8)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0)), \n                    limits = c(0, 40)) +\n\tlabs(x = \"Displacement\",\n\t     y = \"City MPG\") +\n scatter_grid()\n```\n\n## Saving Plots\n\n------------------------------------------------------------------------\n\n`ggsave()` exports ggplot2 plots. The function can be used in two ways. 
First, if `plot =` isn't specified in the function call, then `ggsave()` automatically saves the plot that was last displayed in the Viewer window. Second, if `plot =` is specified, then `ggsave()` saves the specified plot. `ggsave()` guesses the type of graphics device to use in export (.png, .pdf, .svg, etc.) from the file extension in the filename.\n\n``` \nmtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.png\")\n\nplot2 <- mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.png\", plot = plot2)\n```\n\nExported plots rarely look identical to the plots that show up in the Viewer window in RStudio because the overall size and aspect ratio of the Viewer is often different from the defaults for `ggsave()`. Specific sizes, aspect ratios, and resolutions can be controlled with arguments in `ggsave()`. RStudio has a useful [cheatsheet](https://www.rstudio.com/resources/cheatsheets/) called \"How Big is Your Graph?\" that should help with choosing the best size, aspect ratio, and resolution.\n\nFonts are not embedded in PDFs by default. To embed fonts in PDFs, include `device = cairo_pdf` in `ggsave()`.\n\n``` \nplot <- mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.pdf\", plot = plot, width = 6.5, height = 4, device = cairo_pdf)\n```\n\n## Interactive Plots\n\nWe can make any of the previous plots interactive with the powerful and easy `plotly` library. All we have to do is wrap a ggplot object in the `ggplotly` function. *Note:* You can't chain `ggplotly` onto the end of a ggplot object with `+`; save the ggplot as a variable and then wrap that variable in the function call as shown below.\n\nYou can customize the tooltip text by adding a value to `text` in `aes()` and then specifying `tooltip = \"text\"` in the `ggplotly` call.\n\n```{r}\nlibrary(plotly)\n\nstock_plot <- as_tibble(EuStockMarkets) %>% \n mutate(date = time(EuStockMarkets)) %>% \n gather(key = \"key\", value = \"value\", -date) %>% \n ggplot(mapping = aes(x = date, y = value, color = key,\n \t\t\t\t\t\t\t\t\t\t # sometimes ggplotly messes with line charts,\n \t\t\t\t\t\t\t\t\t\t # adding a group value usually helps with that\n \t\t\t\t\t\t\t\t\t\t group = key,\n \t\t\t\t\t\t\t\t\t\t # customize the tooltip with the text aes\n \t\t\t\t\t\t\t\t\t\t text = paste0(\"Value: \", round(value, 2), \"
\",\n \t\t\t\t\t\t\t\t\t\t \t\t\t\t\t\t\t\"Date: \", round(date, 3), \"
\",\n \t\t\t\t\t\t\t\t\t\t \t\t\t\t\t\t\t\"Key: \", key))\n \t\t\t\t\t\t\t\t\t\t ) +\n geom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(1991, 1999), \n breaks = c(1991, 1993, 1995, 1997, 1999)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 2500,\n labels = scales::dollar, \n limits = c(0, 10000)) + \n labs(x = \"Date\",\n y = \"Value\")\n\n# make interactive with gggplotly\n# Uncomment pipe to hide the interative toolbar in the top right \nggplotly(stock_plot, tooltip = \"text\") # %>% config(displayModeBar = FALSE)\n```\n\n## urbnthemes\n\n### Overview\n\n`urbnthemes` is a set of tools for creating Urban Institute-themed plots and maps in R. The package extends `ggplot2` with print and map themes as well as tools that make plotting easier at the Urban Institute. `urbnthemes` replaces the [urban_R_theme](https://github.com/UrbanInstitute/urban_R_theme).\n\nAlways load `library(urbnthemes)` after `library(ggplot2)` or `library(tidyverse)`.\n\n### Usage\n\nUse `set_urbn_defaults(style = \"print\")` to set the default styles. `scatter_grid()`, `remove_ticks()`, `add_axis()`, and `remove_axis()` can all be used to improve graphics.\n\n```{r example, message=FALSE}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nggplot(data = mtcars, mapping = aes(factor(cyl))) +\n geom_bar() + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Number of Cylinders\",\n y = \"Count\") +\n remove_ticks()\n```\n\n### Combining elements\n\n`library(urbnthemes)` contains functions for combining plot elements into graphics. `urbn_plot()` brings all of the elements together.\n\n- `urbn_logo_text()`\n- `remove_ticks()`\n- `remove_axis()`\n- `scatter_grid()`\n- `add_axis()`\n- `urbn_geofacet`\n\n```{r example2}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nplot <- ggplot(data = mtcars, mapping = aes(factor(cyl))) +\n geom_bar() + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Number of Cylinders\",\n y = \"Count\") +\n remove_ticks()\n\nurbn_plot(plot, urbn_logo_text(), ncol = 1, heights = c(30, 1))\n```\n\nSometimes it's important to horizontally add the y-axis title above the plot. `urbn_y_title()` can be sued for this task. The following example goes one step further and adds the title between the legend and the plot.\n\n```{r}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults()\n\nplot <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = factor(cyl))) +\n geom_point() + \n\tscale_x_continuous(expand = c(0, 0),\n\t\t\t\t\t\t\t\t\t\t limits = c(0, 8)) +\n scale_y_continuous(expand = c(0, 0),\n \t\t\t\t\t\t\t\t\t limits = c(0, 40)) +\n remove_ticks() +\n\tlabs(\"\") +\n\tscatter_grid()\n\nurbn_plot(get_legend(plot),\n\t\t\t\t\turbn_y_title(\"Miles per gallon\"),\n\t\t\t\t\tremove_legend(plot), \n\t\t\t\t\turbn_logo_text(), \n\t\t\t\t\tncol = 1, \n\t\t\t\t\theights = c(3, 1, 30, 1))\n```\n\n### Palettes\n\n`urbnthemes` contains many quick-access color palettes from the [Urban Institute Data Visualization Style Guide](http://urbaninstitute.github.io/graphics-styleguide/). 
\n\n- `palette_urbn_main` is the eight-color discrete palette of the Urban Institute with cyan, yellow, black, gray, magenta, green, space gray, and red.\n- `palette_urbn_diverging` is an eight-color diverging palette.\n- `palette_urbn_quintile` is a five-color blue palette that is good for quintiles.\n- `palette_urbn_politics` is a two-color palette with blue for Democrats and red for Republicans.\n\nThere are seven continuous palettes, one for each of the seven unique colors in the discrete Urban Institute color palette:\n\n- `palette_urbn_cyan`\n- `palette_urbn_gray`\n- `palette_urbn_yellow`\n- `palette_urbn_magenta`\n- `palette_urbn_green`\n- `palette_urbn_spacegray`\n- `palette_urbn_red`\n\nUse `view_palette()` to see the palette:\n\n```{r view-palette}\nview_palette(palette_urbn_magenta)\n```\n\nThe vectors can be subset using base R syntax. This allows for the quick selection of specific colors from a palette.\n\n```{r palette-subset1}\npalette_urbn_main[1:4]\n```\n\n```{r palette-subset2}\npalette_urbn_spacegray[1:5]\n```\n\n### Utility functions\n\n`library(urbnthemes)` contains four functions that are helpful with managing font installations:\n\n- `lato_test()`\n- `lato_install()`\n- `fontawesome_test()`\n- `fontawesome_install()`\n\n## Bibliography and Session Information\n\n------------------------------------------------------------------------\n\n*Note:* Examples present in [this document](https://awunderground.github.io/ggplot2-themes/) by Aaron Williams were created during personal time.\n\nBob Rudis and Dave Gandy (2017). waffle: Create Waffle Chart Visualizations in R. R package version 0.7.0. https://CRAN.R-project.org/package=waffle\n\nChester Ismay and Jennifer Chunn (2017). fivethirtyeight: Data and Code Behind the Stories and Interactives at 'FiveThirtyEight'. R package version 0.3.0. https://CRAN.R-project.org/package=fivethirtyeight\n\nHadley Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2009.\n\nHadley Wickham (2017). tidyverse: Easily Install and Load the 'Tidyverse'. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\n\nHadley Wickham (2017). forcats: Tools for Working with Categorical Variables (Factors). R package version 0.2.0. https://CRAN.R-project.org/package=forcats\n\nJennifer Bryan (2017). gapminder: Data from Gapminder. R package version 0.3.0. https://CRAN.R-project.org/package=gapminder\n\nKamil Slowikowski (2017). ggrepel: Repulsive Text and Label Geoms for 'ggplot2'. R package version 0.7.0. https://CRAN.R-project.org/package=ggrepel\n\nMax Kuhn (2017). AmesHousing: The Ames Iowa Housing Data. R package version 0.0.3. https://CRAN.R-project.org/package=AmesHousing\n\nPeter Kampstra (2008). Beanplot: A Boxplot Alternative for Visual Comparison of Distributions, Journal of Statistical Software, 2008. https://www.jstatsoft.org/article/view/v028c01\n\nR Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.\n\nWinston Chang, (2014). extrafont: Tools for using fonts. R package version 0.17. https://CRAN.R-project.org/package=extrafont\n\nYihui Xie (2018). knitr: A General-Purpose Package for Dynamic Report Generation in R. 
R package version 1.19.\n\n```{r System Info and Package Versioning}\nsessionInfo()\n```\n","srcMarkdownNoYaml":"\n\n\n\n::: {#header}\n\n:::\n\n# Urban Institute R Graphics Guide\n\n```{r setup, include=FALSE}\nlibrary(knitr)\nlibrary(datasets)\nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nopts_chunk$set(fig.path = \"graphics-guide/www/images/\")\nopts_chunk$set(echo = TRUE)\nopts_chunk$set(warning = FALSE)\nopts_chunk$set(message = FALSE)\nopts_chunk$set(fig.width = 6.5)\nopts_chunk$set(fig.height = 4)\nopts_chunk$set(fig.retina = 3)\noptions(scipen = 999)\n```\n\nR is a powerful, open-source programming language and environment. R excels at data management and munging, traditional statistical analysis, machine learning, and reproducible research, but it is probably best known for its graphics. This guide contains examples and instructions for popular and lesser-known plotting techniques in R. It also includes instructions for using `urbnthemes`, the Urban Institute's R package for creating near-publication-ready plots with `ggplot2`. If you have any questions, please don't hesitate to contact Aaron Williams (awilliams\\@urban.org) or Kyle Ueyama (kueyama\\@urban.org).\n\n### Background\n\n`library(urbnthemes)` makes `ggplot2` output align more closely with [the Urban Institute's Data Visualization style guide](http://urbaninstitute.github.io/graphics-styleguide/). This package does **not produce publication-ready graphics**. Visual styles must still be edited using your project/paper's normal editing workflow.\n\nExporting charts as a PDF will allow them to be more easily edited. See the Saving Plots section for more information.\n\nThe theme has been tested against `ggplot2` version 3.0.0. It will not function properly with older versions of `ggplot2`.\n\n### Using library(urbnthemes)\n\nRun the following code to install or update `urbnthemes`:\n\n``` \ninstall.packages(\"remotes\")\nremotes::install_github(\"UrbanInstitute/urbnthemes\")\n```\n\nRun the following code at the top of each script:\n\n``` \nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n```\n\n### Installing Lato {#installing_lato}\n\nYour Urban computer may not have the Lato font installed. If it is not installed, please install the free [Lato font from Google](https://www.google.com/fonts/specimen/Lato). Below are step-by-step instructions:\n\n1) Download the [Lato font](https://www.google.com/fonts/specimen/Lato) (as a zip file).\n2) Unzip the file on your computer.\n3) For each `.ttf` file in the unzipped `Lato/` folder, double click the file and click `Install` (on Windows) or `Install Font` (on Mac).\n4) Import and register Lato into R by running `urbnthemes::lato_import()` in the console once. Be patient as this may take a few minutes!\n5) To confirm installation, run `urbnthemes::lato_test()`. If this is successful, you're done and Lato will automatically be used when creating plots with `library(urbnthemes)`. You only need to install Lato once per computer.\n\nWaffle charts with glyphs require fontawesome. `fontawesome_test()` and `fontawesome_install()` are the fontawesome versions of the above functions. Be sure to install fontawesome from [here](https://github.com/hrbrmstr/waffle/tree/master/inst/fonts) first.
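\n\nIf either font is missing, the check-and-install can be scripted. A minimal sketch, assuming the `*_test()` functions return `TRUE`/`FALSE` (an assumption; run them interactively if they do not):\n\n``` \nlibrary(urbnthemes)\n\n# install each font only if its test reports that it is missing (assumed TRUE/FALSE)\nif (!lato_test()) lato_install()\nif (!fontawesome_test()) fontawesome_install()\n```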
\n\n### Grammar of Graphics and Conventions\n\nHadley Wickham's ggplot2 is based on Leland Wilkinson's [*The Grammar of Graphics*](https://www.amazon.com/Grammar-Graphics-Statistics-Computing/dp/0387245448) and Wickham's [*A Layered Grammar of Graphics*](http://vita.had.co.nz/papers/layered-grammar.html). The layered grammar of graphics is a structured way of thinking about the components of a plot, which then lend themselves to the simple structure of ggplot2.\n\n- **Data** are what are visualized in a plot and **mappings** are directions for how data are mapped in a plot in a way that can be perceived by humans.\n- **Geoms** are representations of the actual data like points, lines, and bars.\n- **Stats** are statistical transformations that represent summaries of the data like histograms.\n- **Scales** map values in the data space to values in the aesthetic space. Scales draw legends and axes.\n- **Coordinate Systems** describe how geoms are mapped to the plane of the graphic.\n- **Facets** break the data into meaningful subsets like small multiples.\n- **Themes** control the finer points of a plot such as fonts, font sizes, and background colors.\n\nMore information: [ggplot2: Elegant Graphics for Data Analysis](https://www.amazon.com/ggplot2-Elegant-Graphics-Data-Analysis/dp/0387981403)\n\n### Tips and Tricks\n\n- `ggplot2` expects data to be in data frames or tibbles. It is preferable for the data frames to be \"tidy\" with each variable as a column, each observation as a row, and each observational unit as a separate table. `dplyr` and `tidyr` contain concise and effective tools for \"tidying\" data.\n\n- R allows function arguments to be called explicitly by name and implicitly by position. The coding examples in this guide only contain named arguments for clarity.\n\n- Graphics will sometimes render differently on different operating systems. This is because anti-aliasing is activated in R on Mac and Linux but not activated in R on Windows. This won't be an issue once graphics are saved.\n\n- Continuous x-axes have ticks. Discrete x-axes do not have ticks. Use `remove_ticks()` to remove ticks.\n\n## Bar Plots\n\n------------------------------------------------------------------------\n\n### One Color\n\n```{r barplots}\nmtcars %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = factor(cyl), y = n)) +\n geom_col() +\n geom_text(mapping = aes(label = n), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) +\n remove_ticks() +\n remove_axis() \n```\n\n### One Color (Rotated)\n\nThis example introduces `coord_flip()` and `remove_axis(axis = \"x\", flip = TRUE)`. `remove_axis()` is from `library(urbnthemes)` and creates a custom theme for rotated bar plots.\n\n```{r barplot-rotated}\nmtcars %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = factor(cyl), y = n)) +\n geom_col() +\n geom_text(mapping = aes(label = n), hjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n coord_flip() +\n remove_axis(axis = \"x\", flip = TRUE)\n```\n\n### Three Colors\n\nThis is identical to the previous plot except colors and a legend are added with `fill = cyl`. Turning `x` into a factor with `factor(cyl)` skips 5 and 7 on the x-axis. Adding `fill = cyl` without `factor()` would have created a continuous color scheme and legend.
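\n\nThe pitfall looks like this (a sketch, not run; compare the continuous color bar it produces to the three discrete legend keys in the next chunk):\n\n``` \nmtcars %>%\n count(cyl) %>%\n # numeric cyl maps fill to a gradient and a continuous color bar\n ggplot(mapping = aes(x = factor(cyl), y = n, fill = cyl)) +\n geom_col()\n```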
\n\n```{r 3-color-barplot}\nmtcars %>%\n mutate(cyl = factor(cyl)) %>%\n count(cyl) %>%\n ggplot(mapping = aes(x = cyl, y = n, fill = cyl)) +\n geom_col() +\n geom_text(mapping = aes(label = n), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) +\n remove_ticks() +\n remove_axis()\n```\n\n### Stacked Bar Plot\n\nAn additional aesthetic can easily be added to bar plots by adding `fill = categorical variable` to the mapping. Here, transmission type subsets each bar showing the count of cars with different numbers of cylinders.\n\n```{r stacked-bar-plot}\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>% \n group_by(am) %>%\n count(cyl) %>%\n group_by(cyl) %>%\n arrange(desc(am)) %>%\n mutate(label_height = cumsum(n)) %>%\n ggplot() +\n geom_col(mapping = aes(x = cyl, y = n, fill = am)) +\n geom_text(aes(x = cyl, y = label_height - 0.5, label = n, color = am)) +\n scale_color_manual(values = c(\"white\", \"black\")) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n remove_axis() +\n guides(color = \"none\")\n```\n\n### Stacked Bar Plot With Position = Fill\n\nThe previous examples used `geom_col()`, which takes a y value for bar height. This example uses `geom_bar()`, which counts the observations in each group to generate bar heights. In this example, `position = \"fill\"` in `geom_bar()` changes the y-axis from count to the proportion of each bar.\n\n```{r stacked-bar-plot-fill}\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>% \n ggplot() +\n geom_bar(mapping = aes(x = cyl, fill = am), position = \"fill\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1)), labels = scales::percent) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n guides(color = \"none\")\n```\n\n### Dodged Bar Plot\n\nSubsetted bar charts in ggplot2 are stacked by default. `position = \"dodge\"` in `geom_col()` expands the bar chart so the bars appear next to each other.\n\n```{r dodged-bar-plot}\nmtcars %>%\n mutate(am = factor(am, labels = c(\"Automatic\", \"Manual\")),\n cyl = factor(cyl)) %>%\n group_by(am) %>%\n count(cyl) %>%\n ggplot(mapping = aes(cyl, y = n, fill = factor(am))) +\n geom_col(position = \"dodge\") +\n geom_text(aes(label = n), position = position_dodge(width = 0.7), vjust = -1) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Cylinders\",\n y = NULL) + \n remove_ticks() +\n remove_axis()\n```\n\n### Lollipop plot/Cleveland dot plot {.tabset}\n\nLollipop plots and Cleveland dot plots are minimalist alternatives to bar plots. The key to both plots is to order the data based on the continuous variable using `arrange()` and then turn the discrete variable into a factor with the ordered levels of the continuous variable using `mutate()`. 
This step \"stores\" the order of the data.\n\n#### Lollipop plot\n\n```{r lollipop-plot, fig.height = 5}\nmtcars %>%\n\trownames_to_column(\"model\") %>%\n\tarrange(mpg) %>%\n\tmutate(model = factor(model, levels = .$model)) %>%\n\tggplot(aes(mpg, model)) +\n\t\tgeom_segment(aes(x = 0, xend = mpg, y = model, yend = model)) +\t\n\t\tgeom_point() +\n\t\tscale_x_continuous(expand = expansion(mult = c(0, 0)), limits = c(0, 40)) +\n\t\tlabs(x = NULL, \n\t\t\t\t y = \"Miles Per Gallon\")\n```\n\n#### Cleveland dot plot\n\n```{r cleveland-dot-plot, fig.height = 5}\nmtcars %>%\n\trownames_to_column(\"model\") %>%\n\tarrange(mpg) %>%\n\tmutate(model = factor(model, levels = .$model)) %>%\n\tggplot(aes(mpg, model)) +\n\t\tgeom_point() +\n\t\tscale_x_continuous(expand = expansion(mult = c(0, 0)), limits = c(0, 40)) +\n\t\tlabs(x = NULL, \n\t\t\t\t y = \"Miles Per Gallon\")\n```\n\n### Dumbell plot\n\n## Scatter Plots\n\n------------------------------------------------------------------------\n\n### One Color Scatter Plot\n\nScatter plots are useful for showing relationships between two or more variables. Use `scatter_grid()` from `library(urbnthemes)` to easily add vertical grid lines for scatter plots.\n\n```{r one-color-scatter-plot}\nmtcars %>%\n ggplot(mapping = aes(x = wt, y = mpg)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:8 * 5) +\n labs(x = \"Weight (thousands of pounds)\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n### High-Density Scatter Plot with Transparency\n\nLarge numbers of observations can sometimes make scatter plots tough to interpret because points overlap. Adding `alpha =` with a number between 0 and 1 adds transparency to points and clarity to plots. Now it's easy to see that jewelry stores are probably rounding up but not rounding down carats!\n\n```{r alpha-scatter-plot}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 20000),\n breaks = 0:4 * 5000,\n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n```\n\n### Hex Scatter Plot\n\nSometimes transparency isn't enough to bring clarity to a scatter plot with many observations. As n increases into the hundreds of thousands and even millions, `geom_hex` can be one of the best ways to display relationships between two variables.\n\n```{r scatter-plot-hex}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_hex(mapping = aes(fill = after_stat(count))) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 20000),\n breaks = 0:4 * 5000,\n labels = scales::dollar) +\n\tscale_fill_gradientn(labels = scales::comma) + \n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid() +\n theme(legend.position = \"right\",\n legend.direction = \"vertical\")\n```\n\n### Scatter Plots With Random Noise {.tabset}\n\nSometimes scatter plots have many overlapping points but a reasonable number of observations. `geom_jitter` adds a small amount of random noise so points are less likely to overlap. `width` and `height` control the amount of noise that is added. 
In the following before-and-after, notice how many more points are visible after adding jitter.\n\n#### Before\n\n```{r before-scatter-plot}\nmpg %>%\n ggplot(mapping = aes(x = displ, y = cty)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 8),\n breaks = 0:8) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:4 * 10) +\n labs(x = \"Displacement\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n#### After\n\n```{r jitter-plot}\nset.seed(2017)\nmpg %>%\n ggplot(mapping = aes(x = displ, y = cty)) +\n geom_jitter() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 8),\n breaks = 0:8) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:4 * 10) +\n labs(x = \"Displacement\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n### Scatter Plots with Varying Point Size\n\nWeights and populations can be mapped in scatter plots to the size of the points. Here, the number of households in each state is mapped to the size of each point using `aes(size = hhpop)`. Note: `ggplot2::geom_point()` is used instead of `geom_point()`.\n\n```{r geom_point-size, fig.height = 5}\nurbnmapr::statedata %>%\n ggplot(mapping = aes(x = medhhincome, y = horate)) +\n ggplot2::geom_point(mapping = aes(size = hhpop), alpha = 0.3) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(30000, 80000),\n breaks = 3:8 * 10000,\n labels = scales::dollar) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 0.8),\n breaks = 0:4 * 0.2) +\n scale_radius(range = c(3, 15),\n breaks = c(2500000, 7500000, 12500000), \n labels = scales::comma) +\n labs(x = \"Household income\",\n y = \"Homeownership rate\") +\n scatter_grid() +\n\ttheme(plot.margin = margin(r = 20))\n```\n\n### Scatter Plots with Fill\n\nA third aesthetic can be added to scatter plots. Here, color signifies the number of cylinders in each car. 
Before `ggplot()` is called, the cylinder labels are created using `library(dplyr)` and the piping operator `%>%`.\n\n```{r filled-scatter-plot}\nmtcars %>%\n mutate(cyl = paste(cyl, \"cylinders\")) %>%\n ggplot(aes(x = wt, y = mpg, color = cyl)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 6),\n breaks = 0:6) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 40),\n breaks = 0:8 * 5) +\n labs(x = \"Weight (thousands of pounds)\",\n y = \"City MPG\") +\n scatter_grid()\n```\n\n## Line Plots\n\n------------------------------------------------------------------------\n\n```{r line-plots}\neconomics %>%\n ggplot(mapping = aes(x = date, y = unemploy)) +\n geom_line() +\n scale_x_date(expand = expansion(mult = c(0.002, 0)), \n breaks = \"10 years\",\n limits = c(as.Date(\"1961-01-01\"), as.Date(\"2020-01-01\")),\n date_labels = \"%Y\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 4000,\n limits = c(0, 16000),\n labels = scales::comma) +\n labs(x = \"Year\", \n y = \"Number Unemployed (1,000s)\")\n```\n\n### Line Plots With Multiple Lines\n\n```{r multiple-line-charts1}\nlibrary(gapminder)\n\ngapminder %>%\n filter(country %in% c(\"Australia\", \"Canada\", \"New Zealand\")) %>%\n mutate(country = factor(country, levels = c(\"Canada\", \"Australia\", \"New Zealand\"))) %>%\n ggplot(aes(year, gdpPercap, color = country)) +\n geom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n breaks = c(1952 + 0:12 * 5), \n limits = c(1952, 2007)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:8 * 5000,\n labels = scales::dollar, \n limits = c(0, 40000)) +\n labs(x = \"Year\",\n y = \"Per capita GDP (US dollars)\")\n```\n\nPlotting more than one variable can be useful for seeing the relationship of variables over time, but it takes a small amount of data munging.\n\nThis is because `ggplot2` wants data in a \"long\" format instead of a \"wide\" format for line plots with multiple lines. `gather()` and `spread()` from the `tidyr` package make switching back-and-forth between \"long\" and \"wide\" painless. Essentially, variable titles go into \"key\" and variable values go into \"value\". Then ggplot2 turns the different levels of the key variable (here, the four stock indices) into colors.\n\n```{r multiple-line-charts2}\nas_tibble(EuStockMarkets) %>%\n\tmutate(date = time(EuStockMarkets)) %>%\n\tgather(key = \"key\", value = \"value\", -date) %>%\n\tggplot(mapping = aes(x = date, y = value, color = key)) +\n\tgeom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(1991, 1999), \n breaks = c(1991, 1993, 1995, 1997, 1999)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 2500,\n labels = scales::dollar, \n limits = c(0, 10000)) + \n\tlabs(x = \"Date\",\n\t\t\t y = \"Value\")\n```\n\n### Step plot\n\n`geom_line()` connects coordinates with the shortest possible straight line. Sometimes step plots are necessary because y values don't change between coordinates. 
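\n\nThe difference is easiest to see on made-up points (a sketch; `toy` is a hypothetical data frame):\n\n``` \n# geom_line() interpolates between points; geom_step() holds each y until the next x\ntoy <- tibble(x = c(1, 2, 3), y = c(0, 1, 1))\nggplot(data = toy, mapping = aes(x = x, y = y)) + geom_line()\nggplot(data = toy, mapping = aes(x = x, y = y)) + geom_step()\n```\n\n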
For example, the upper-bound of the Federal Funds Rate is set at regular intervals and remains constant until it is changed.\n\n```{r step-plot}\n# downloaded from FRED on 2018-12-06\n# https://fred.stlouisfed.org/series/DFEDTARU\n\nfed_fund_rate <- read_csv(\n \"date, fed_funds_rate\n 2014-01-01,0.0025\n 2015-12-16,0.0050\n 2016-12-14,0.0075\n 2017-03-16,0.0100\n 2017-06-15,0.0125\n 2017-12-14,0.0150\n 2018-03-22,0.0175\n 2018-06-14,0.0200\n 2018-09-27,0.0225\n 2018-12-06,0.0225\")\n\nfed_fund_rate %>%\n ggplot(mapping = aes(x = date, y = fed_funds_rate)) + \n geom_step() +\n scale_x_date(expand = expansion(mult = c(0.002, 0)), \n breaks = \"1 year\",\n limits = c(as.Date(\"2014-01-01\"), as.Date(\"2019-01-01\")),\n date_labels = \"%Y\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = c(0, 0.01, 0.02, 0.03),\n limits = c(0, 0.03),\n labels = scales::percent) + \n\tlabs(x = \"Date\",\n\t\t\t y = \"Upper-bound of the Federal Funds Rate\")\n```\n\n### Path plot\n\nThe Beveridge curve is a macroeconomic plot that displays a relationship between the unemployment rate and the vacancy rate. Movements along the curve indicate changes in the business cycle and horizontal shifts of the curve suggest structural changes in the labor market.\n\nLines in Beveridge curves do not monotonically move from left to right. Therefore, it is necessary to use `geom_path()`.\n\n```{r, path-plot}\n# seasonally-adjusted, quarterly vacancy rate - JOLTS\n# seasonally-adjusted, quarterly unemployment rate - CPS\n\n# pulled from FRED on April 11, 2018. \n\nlibrary(ggrepel)\n\nbeveridge <- read_csv(\n\t\"quarter, vacancy_rate, unemployment_rate\n\t2006-01-01,0.0310,0.0473\n\t2006-04-01,0.0316,0.0463\n\t2006-07-01,0.0313,0.0463\n\t2006-10-01,0.0310,0.0443\n\t2007-01-01,0.0323,0.0450\n\t2007-04-01,0.0326,0.0450\n\t2007-07-01,0.0316,0.0466\n\t2007-10-01,0.0293,0.0480\n\t2008-01-01,0.0286,0.0500\n\t2008-04-01,0.0280,0.0533\n\t2008-07-01,0.0253,0.0600\n\t2008-10-01,0.0220,0.0686\n\t2009-01-01,0.0196,0.0826\n\t2009-04-01,0.0180,0.0930\n\t2009-07-01,0.0176,0.0963\n\t2009-10-01,0.0180,0.0993\n\t2010-01-01,0.0196,0.0983\n\t2010-04-01,0.0220,0.0963\n\t2010-07-01,0.0216,0.0946\n\t2010-10-01,0.0220,0.0950\n\t2011-01-01,0.0226,0.0903\n\t2011-04-01,0.0236,0.0906\n\t2011-07-01,0.0250,0.0900\n\t2011-10-01,0.0243,0.0863\n\t2012-01-01,0.0270,0.0826\n\t2012-04-01,0.0270,0.0820\n\t2012-07-01,0.0266,0.0803\n\t2012-10-01,0.0260,0.0780\n\t2013-01-01,0.0276,0.0773\n\t2013-04-01,0.0280,0.0753\n\t2013-07-01,0.0280,0.0723\n\t2013-10-01,0.0276,0.0693\n\t2014-01-01,0.0290,0.0666\n\t2014-04-01,0.0323,0.0623\n\t2014-07-01,0.0326,0.0610\n\t2014-10-01,0.0330,0.0570\n\t2015-01-01,0.0350,0.0556\n\t2015-04-01,0.0366,0.0540\n\t2015-07-01,0.0373,0.0510\n\t2015-10-01,0.0360,0.0500\n\t2016-01-01,0.0386,0.0493\n\t2016-04-01,0.0383,0.0486\n\t2016-07-01,0.0383,0.0493\n\t2016-10-01,0.0363,0.0473\n\t2017-01-01,0.0366,0.0466\n\t2017-04-01,0.0390,0.0433\n\t2017-07-01,0.0406,0.0430\n\t2017-10-01,0.0386,0.0410\")\n\nlabels <- beveridge %>%\n filter(lubridate::month(quarter) == 1)\n\nbeveridge %>%\n\tggplot() +\n\tgeom_path(mapping = aes(x = unemployment_rate, y = vacancy_rate), alpha = 0.5) +\n geom_point(data = labels, mapping = aes(x = unemployment_rate, y = vacancy_rate)) +\n geom_text_repel(data = labels, mapping = aes(x = unemployment_rate, y = vacancy_rate, label = lubridate::year(quarter))) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0.04, 0.1),\n labels = scales::percent) +\n 
scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = c(0, 0.01, 0.02, 0.03, 0.04, 0.05),\n limits = c(0, 0.05),\n labels = scales::percent) + \n\tlabs(x = \"Seasonally-adjusted unemployment rate\",\n\t\t\t y = \"Seasonally-adjusted vacancy rate\") + \n scatter_grid()\n```\n\n### Slope plots\n\n```{r slope-plot, fig.height = 5}\n# https://www.bls.gov/lau/\nlibrary(ggrepel)\n\nunemployment <- tibble(\n\ttime = c(\"October 2009\", \"October 2009\", \"October 2009\", \"August 2017\", \"August 2017\", \"August 2017\"),\n\trate = c(7.4, 7.1, 10.0, 3.9, 3.8, 6.4),\n\tstate = c(\"Maryland\", \"Virginia\", \"Washington, D.C.\", \"Maryland\", \"Virginia\", \"Washington, D.C.\")\n)\n\nlabel <- tibble(label = c(\"October 2009\", \"August 2017\"))\noctober <- filter(unemployment, time == \"October 2009\")\naugust <- filter(unemployment, time == \"August 2017\")\n\nunemployment %>%\n\tmutate(time = factor(time, levels = c(\"October 2009\", \"August 2017\")),\n\t state = factor(state, levels = c(\"Washington, D.C.\", \"Maryland\", \"Virginia\"))) %>%\n\tggplot() + \n\tgeom_line(aes(time, rate, group = state, color = state), show.legend = FALSE) +\n\tgeom_point(aes(x = time, y = rate, color = state)) +\n\tlabs(subtitle = \"Unemployment Rate\") +\n\ttheme(axis.ticks.x = element_blank(),\n\t\t\t\taxis.title.x = element_blank(),\n\t\t\t\taxis.ticks.y = element_blank(),\n axis.title.y = element_blank(), \n axis.text.y = element_blank(),\n\t\t\t\tpanel.grid.major.y = element_blank(),\n panel.grid.minor.y = element_blank(),\n panel.grid.major.x = element_blank(),\n\t\t\t\taxis.line = element_blank()) +\n\tgeom_text_repel(data = october, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = -0.06) + \n\tgeom_text_repel(data = august, mapping = aes(x = time, y = rate, label = as.character(rate)), nudge_x = 0.06)\n```\n\n## Univariate\n\n------------------------------------------------------------------------\n\nThere are a number of ways to explore the distributions of univariate data in R. Some methods, like strip charts, show all data points. Other methods, like the box and whisker plot, show selected data points that communicate key values like the median and 25th percentile. Finally, some methods don't show any of the underlying data but calculate density estimates. Each method has advantages and disadvantages, so it is worthwhile to understand the different forms. For more information, read [40 years of boxplots](http://vita.had.co.nz/papers/boxplots.pdf) by Hadley Wickham and Lisa Stryjewski.\n\n### Strip Chart\n\nStrip charts, the simplest univariate plot, show the distribution of values along one axis. Strip charts work best with variables that have plenty of variation. If not, the points tend to cluster on top of each other. 
Even if the variable has plenty of variation, it is often important to add transparency to the points with `alpha =` so overlapping values are visible.\n\n```{r stripchart, fig.height=2}\nmsleep %>%\n ggplot(aes(x = sleep_total, y = factor(1))) +\n geom_point(alpha = 0.2, size = 5) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) +\n scale_y_discrete(labels = NULL) +\n labs(title = \"Total Sleep Time of Different Mammals\",\n x = \"Total sleep time (hours)\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n```\n\n### Strip Chart with Highlighting\n\nBecause strip charts show all values, they are useful for showing where selected points lie in the distribution of a variable. The clearest way to do this is by adding `geom_point()` twice with `filter()` in the data argument. This way, the highlighted values show up on top of unhighlighted values.\n\n```{r stripchart-with-highlighting, fig.height=2}\nggplot() +\n geom_point(data = filter(msleep, name != \"Red fox\"), \n aes(x = sleep_total, \n y = factor(1)),\n alpha = 0.2, \n size = 5,\n \t\t\t\t\t color = \"grey50\") +\n geom_point(data = filter(msleep, name == \"Red fox\"),\n aes(x = sleep_total, \n y = factor(1), \n color = name),\n alpha = 0.8,\n size = 5) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) + \n scale_y_discrete(labels = NULL) +\n labs(title = \"Total Sleep Time of Different Mammals\",\n x = \"Total sleep time (hours)\",\n y = NULL) +\n guides(color = guide_legend(title = NULL)) +\n theme(axis.ticks.y = element_blank())\n```\n\n### Subsetted Strip Chart\n\nAdd a y variable to see the distributions of the continuous variable in subsets of a categorical variable.\n\n```{r subsetted-stripchart, fig.height=3}\nlibrary(forcats)\n\nmsleep %>%\n filter(!is.na(vore)) %>%\n mutate(vore = fct_recode(vore, \n \"Insectivore\" = \"insecti\",\n \"Omnivore\" = \"omni\", \n \"Herbivore\" = \"herbi\", \n \"Carnivore\" = \"carni\"\n )) %>%\n ggplot(aes(x = sleep_total, y = vore)) +\n geom_point(alpha = 0.2, size = 5) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 25), \n breaks = 0:5 * 5) + \n labs(title = \"Total Sleep Time of Different Mammals by Diet\",\n x = \"Total sleep time (hours)\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n```\n\n### Beeswarm Plots\n\nBeeswarm plots are a variation of strip charts that show the distribution of data without the points overlapping.\n\n```{r beeswarm}\nlibrary(ggbeeswarm)\n\ntxhousing %>%\n\tfilter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>% \n ggplot(aes(x = median, y = city)) +\n geom_beeswarm(alpha = 0.2, size = 5) + \n\tscale_x_continuous(labels = scales::dollar) +\n labs(title = \"Household Sale Price by City\",\n x = \"Sale Price\",\n y = NULL) +\n theme(axis.ticks.y = element_blank())\n```\n\n### Histograms\n\nHistograms divide the distribution of a variable into n equal-sized bins and then count and display the number of observations in each bin. Histograms are sensitive to bin width. 
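\n\nTrying a coarse and a fine width on the same variable shows how much the picture can change (a sketch; the widths are arbitrary):\n\n``` \n# coarse bins smooth away detail\nggplot(data = diamonds, mapping = aes(x = depth)) +\n geom_histogram(binwidth = 5)\n\n# narrow bins reveal where values concentrate\nggplot(data = diamonds, mapping = aes(x = depth)) +\n geom_histogram(binwidth = 0.2)\n```\n\n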
As `?geom_histogram` notes, \"You should always override \\[the default binwidth\\] value, exploring multiple widths to find the best to illustrate the stories in your data.\"\n\n```{r histogram}\nggplot(data = diamonds, mapping = aes(x = depth)) + \n geom_histogram(bins = 100) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, 100)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), labels = scales::comma) +\n labs(x = \"Depth\",\n y = \"Count\")\n```\n\n### Boxplots\n\nBoxplots were invented in the 1970s by John Tukey[^1]. Instead of showing the underlying data or binned counts of the underlying data, they focus on important values like the 25th percentile, median, and 75th percentile.\n\n[^1]: Wickham, H., & Stryjewski, L. (2011). 40 years of boxplots.\n\n```{r box-plot}\nInsectSprays %>%\n ggplot(mapping = aes(x = spray, y = count)) +\n geom_boxplot() +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Type of insect spray\",\n y = \"Number of dead insects\") +\n remove_ticks()\n```\n\n### Smoothed Kernel Density Plots\n\nContinuous variables with smooth distributions are sometimes better represented with smoothed kernel density estimates than histograms or boxplots. `geom_density()` computes and plots a kernel density estimate. Notice the lumps around integers and halves in the following distribution because of rounding.\n\n```{r kernel-density-plot}\ndiamonds %>%\n ggplot(mapping = aes(carat)) +\n geom_density(color = NA) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, NA)) +\n\tscale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Carat\",\n y = \"Density\")\n```\n\n```{r kernel-density-plot-filled}\ndiamonds %>%\n mutate(cost = ifelse(price > 5500, \"More than $5,500 +\", \"$0 to $5,500\")) %>%\n ggplot(mapping = aes(carat, fill = cost)) +\n geom_density(alpha = 0.25, color = NA) +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(0, NA)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Carat\",\n y = \"Density\")\n```\n\n### Ridgeline Plots\n\nRidgeline plots are partially overlapping smoothed kernel density plots faceted by a categorical variable that pack a lot of information into one elegant plot.\n\n```{r ridgeline-plots}\nlibrary(ggridges)\n\nggplot(diamonds, mapping = aes(x = price, y = cut)) +\n\tgeom_density_ridges(fill = \"#1696d2\") +\n labs(x = \"Price\",\n y = \"Cut\")\n```\n\n### Violin Plots\n\nViolin plots are symmetrical displays of smooth kernel density plots.\n\n```{r violin-plot}\nInsectSprays %>%\n ggplot(mapping = aes(x = spray, y = count, fill = spray)) +\n geom_violin(color = NA) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +\n labs(x = \"Type of insect spray\",\n y = \"Number of dead insects\") +\n remove_ticks()\n```\n\n### Bean Plot\n\nIndividual outliers and important summary values are not visible in violin plots or smoothed kernel density plots. 
Bean plots, [created by Peter Kampstra in 2008](https://www.jstatsoft.org/article/view/v028c01), are violin plots with data shown as small lines in a one-dimensional strip plot and larger lines for the mean.\n\n```{r beanplot}\nmsleep %>%\n filter(!is.na(vore)) %>%\n mutate(vore = fct_recode(vore, \n \"Insectivore\" = \"insecti\",\n \"Omnivore\" = \"omni\", \n \"Herbivore\" = \"herbi\", \n \"Carnivore\" = \"carni\"\n )) %>%\n ggplot(aes(x = vore, y = sleep_total, fill = vore)) +\n stat_summary(fun = \"mean\",\n colour = \"black\", \n size = 30,\n shape = 95,\n geom = \"point\") +\n geom_violin(color = NA) +\n geom_jitter(width = 0,\n height = 0.05,\n alpha = 0.4,\n shape = \"-\",\n size = 10,\n \t\t\t\t\t\tcolor = \"grey50\") +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2))) + \n labs(x = NULL,\n y = \"Total sleep time (hours)\") +\n theme(legend.position = \"none\") +\n remove_ticks()\n```\n\n## Area Plot\n\n------------------------------------------------------------------------\n\n### Stacked Area\n\n```{r area-plot-stack}\ntxhousing %>%\n filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>%\n group_by(city, year) %>%\n summarize(sales = sum(sales)) %>%\n ggplot(aes(x = year, y = sales, fill = city)) +\n geom_area(position = \"stack\") +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(2000, 2015),\n breaks = 2000 + 0:15) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n labels = scales::comma) +\n labs(x = \"Year\",\n y = \"Home sales\")\n```\n\n### Filled Area\n\n```{r area-plot-fill}\ntxhousing %>%\n filter(city %in% c(\"Austin\",\"Houston\",\"Dallas\",\"San Antonio\",\"Fort Worth\")) %>%\n group_by(city, year) %>%\n summarize(sales = sum(sales)) %>%\n ggplot(aes(x = year, y = sales, fill = city)) +\n geom_area(position = \"fill\") +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(2000, 2015),\n breaks = 2000 + 0:15) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.02)),\n breaks = c(0, 0.25, 0.5, 0.75, 1),\n labels = scales::percent) +\n labs(x = \"Year\",\n y = \"Home sales\")\n```\n\n## Sankey Plot\n\n------------------------------------------------------------------------\n\nSankey plots visualize flows from one set of variables to another. This can be useful for showing outcomes from the start of a program to the end. You'll need to install the `ggsankey` package to create Sankey plots in R. In this example, I make a dummy data set of housing status prior to program start and at exit to show the flow of people between outcomes. A key step is to transform your data set using the `make_long` function from the package. This creates a data frame that specifies each of the initial nodes and how they flow into the next stage.
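\n\nRoughly, the reshaped data has one row per node per stage (a sketch of the structure, not real output; the column names `x`, `node`, `next_x`, and `next_node` are the ones the plotting code below maps):\n\n``` \n# x             node         next_x         next_node\n# entry_status  \"Housed\"     exit_status    \"Housed\"\n# entry_status  \"Unhoused\"   exit_status    \"Housed\"\n# exit_status   \"Housed\"     NA             NA\n```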
\n\n```{r}\n# install (once) and load the ggsankey package\nremotes::install_github(\"davidsjoberg/ggsankey\")\nlibrary(ggsankey)\n\n# create a dummy dataset of housing status\ndf <- tibble(entry_status = c(rep(\"Housed\", 7), rep(\"Unhoused\", 15), rep(\"Staying w/ Family\", 8)), \n exit_status = c(rep(\"Housed\", 15), rep(\"Unhoused\", 2), rep(\"Staying w/ Family\", 13))) %>% \n\t# transform the data frame into the proper format for the sankey plot\n make_long(entry_status, exit_status) %>% \n\t# recode the labels to be cleaner in the plot \n mutate(x = recode(x, entry_status = \"Prior Housing Status\", exit_status = \"Exit Housing Status\"),\n next_x = recode(next_x, entry_status = \"Prior Housing Status\", exit_status = \"Exit Housing Status\"))\n\n# create sankey plot\nggplot(df, aes(x = x, \n next_x = next_x, \n node = node, \n next_node = next_node,\n fill = factor(node), \n label = node)) +\n geom_sankey(flow.alpha = 0.5, node.color = 1, show.legend = FALSE) +\n # add labels to plot and style\n geom_sankey_label(size = 3.5, color = 1, fill = \"white\") +\n theme_sankey(base_size = 16) +\n labs(x = NULL)\n```\n\n## Heat Map\n\n------------------------------------------------------------------------\n\n```{r heat-map}\nlibrary(fivethirtyeight)\n\nbad_drivers %>%\n filter(state %in% c(\"Maine\", \"New Hampshire\", \"Vermont\", \"Massachusetts\", \"Connecticut\", \"New York\")) %>%\n mutate(`Number of\\nDrivers` = scale(num_drivers),\n `Percent\\nSpeeding` = scale(perc_speeding),\n `Percent\\nAlcohol` = scale(perc_alcohol),\n `Percent Not\\nDistracted` = scale(perc_not_distracted),\n `Percent No\\nPrevious` = scale(perc_no_previous),\n state = factor(state, levels = rev(state))\n ) %>%\n select(-insurance_premiums, -losses, -(num_drivers:losses)) %>%\n gather(`Number of\\nDrivers`:`Percent No\\nPrevious`, key = \"variable\", value = \"SD's from Mean\") %>%\n ggplot(aes(variable, state)) +\n geom_tile(aes(fill = `SD's from Mean`)) +\n labs(x = NULL,\n y = NULL) + \n scale_fill_gradientn() +\n theme(legend.position = \"right\",\n legend.direction = \"vertical\",\n axis.line.x = element_blank(),\n panel.grid.major.y = element_blank()) +\n remove_ticks()\n\n# https://learnr.wordpress.com/2010/01/26/ggplot2-quick-heatmap-plotting/\n```\n\n## Faceting and Small Multiples\n\n------------------------------------------------------------------------\n\n### facet_wrap()\n\nR's faceting system is a powerful way to make \"small multiples\".\n\nSome edits to the theme may be necessary depending upon how many rows and columns are in the plot.\n\n```{r small-multiples, fig.height=2}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n facet_wrap(~cut, ncol = 5) +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 6)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 20000), \n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n```\n\n### facet_grid()\n\n```{r faceting, fig.height=7}\ndiamonds %>%\n filter(color %in% c(\"D\", \"E\", \"F\", \"G\")) %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n geom_point(alpha = 0.05) +\n facet_grid(color ~ cut) +\n scale_x_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 4)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0)),\n limits = c(0, 20000), \n labels = scales::dollar) +\n labs(x = \"Carat\",\n y = \"Price\") +\n 
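# widen the gap between facet panels so the dense grid stays readable\n 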
theme(panel.spacing = unit(20L, \"pt\")) +\n scatter_grid()\n```\n\n## Smoothers\n\n------------------------------------------------------------------------\n\n`geom_smooth()` fits and plots models to data with two or more dimensions.\n\nUnderstanding and manipulating defaults is more important for `geom_smooth()` than for other geoms because it makes a number of assumptions. `geom_smooth()` automatically uses loess for datasets with fewer than 1,000 observations and a generalized additive model with `formula = y ~ s(x, bs = \"cs\")` for datasets with 1,000 or more observations. Both default to displaying a 95% confidence interval.\n\nModels are chosen with `method =` and can be set to `lm()`, `glm()`, `gam()`, `loess()`, `rlm()`, and more. Formulas can be specified with `formula =` and `y ~ x` syntax. Plotting the standard error is toggled with `se = TRUE` and `se = FALSE`, and the confidence level is specified with `level =`. As always, more information can be seen in RStudio with `?geom_smooth()`.\n\n`geom_point()` adds a scatterplot to `geom_smooth()`. The order of the function calls is important. The function called second will be laid on top of the function called first.\n\n```{r geom_smooth}\ndiamonds %>%\n ggplot(mapping = aes(x = carat, y = price)) +\n\tgeom_point(alpha = 0.05) +\n\tgeom_smooth(color = \"#ec008b\") +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t                   limits = c(0, 5),\n\t                   breaks = 0:5) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                    limits = c(0, 20000), \n                    labels = scales::dollar) + \n labs(x = \"Carat\",\n y = \"Price\") +\n scatter_grid()\n```\n\n`geom_smooth` can be subset by categorical and factor variables. This requires subgroups to have a decent number of observations and a fair amount of variability across the x-axis. Confidence intervals often widen at the ends, so special care is needed for the chart to be meaningful and readable.\n\nThis example uses loess to model highway MPG as a function of engine displacement.\n\n```{r subset-geom_smooth}\nggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = factor(cyl))) +\n\tgeom_point(alpha = 0.2) +\n\tgeom_smooth() +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t                   limits = c(0, 7),\n\t                   breaks = 0:7) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                    limits = c(0, 60)) + \n\tlabs(x = \"Engine displacement\",\n\t\t\t y = \"Highway MPG\") +\n scatter_grid()\n```\n\nThis example uses linear models to model highway MPG as a function of engine displacement.\n\n```{r subset-geom-smooth-lm}\nggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = factor(cyl))) +\n\tgeom_point(alpha = 0.2) +\n\tgeom_smooth(method = \"lm\") +\n\tscale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n\t                   limits = c(0, 7),\n\t                   breaks = 0:7) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                    limits = c(0, 60)) + \n\tlabs(x = \"Engine displacement\",\n\t\t\t y = \"Highway MPG\") +\n scatter_grid()\n```\n\n## Highlighting\n\n------------------------------------------------------------------------\n\n[`library(gghighlight)`](https://yutannihilation.github.io/gghighlight/) enables the intuitive highlighting of ggplot2 plots. `gghighlight` modifies existing ggplot2 objects, so no other code should change. All of the highlighting is handled by the function `gghighlight()`, which can handle all types of geoms.\n\n*Warning:* R will throw an error if too many colors are highlighted because of the design of `urbnthemes`. 
Simply decrease the number of highlighted geoms to solve this issue.\n\nThere are two main ways to highlight.\n\n### Threshold\n\nThe first way to highlight is with a threshold. Add a logical test to `gghighlight()` to describe which lines should be highlighted. Here, lines with maximum change in per-capita Gross Domestic Product greater than \$35,000 are highlighted by `gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE)`.\n\n```{r gghighlight-threshold}\nlibrary(gghighlight)\nlibrary(gapminder)\n\ndata <- gapminder %>%\n filter(continent %in% c(\"Europe\")) %>%\n group_by(country) %>%\n mutate(pcgpd_change = ifelse(year == 1952, 0, gdpPercap - lag(gdpPercap))) %>%\n mutate(pcgpd_change = cumsum(pcgpd_change))\n \ndata %>%\n ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change) > 35000, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n                     breaks = c(seq(1950, 2010, 10)),\n                     limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     breaks = 0:8 * 5000,\n                     labels = scales::dollar,\n                     limits = c(0, 40000)) +\n labs(x = \"Year\",\n      y = \"Change in per-capita GDP (US dollars)\")\n```\n\n### Rank\n\nThe second way to highlight is by rank. Here, the countries with the five highest values for change in per-capita Gross Domestic Product are highlighted with `gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE)`.\n\n```{r gghighlight-rank}\ndata %>%\n ggplot(aes(year, pcgpd_change, group = country, color = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change), max_highlight = 5, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n                     breaks = c(seq(1950, 2010, 10)),\n                     limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     breaks = 0:8 * 5000,\n                     labels = scales::dollar,\n                     limits = c(0, 40000)) +\n labs(x = \"Year\",\n      y = \"Change in per-capita GDP (US dollars)\")\n```\n\n### Faceting\n\n`gghighlight()` works well with ggplot2's faceting system.\n\n```{r gghighlight-faceting}\ndata %>%\n ggplot(aes(year, pcgpd_change, group = country)) +\n geom_line() +\n gghighlight(max(pcgpd_change), max_highlight = 4, use_direct_label = FALSE) + \n scale_x_continuous(expand = expansion(mult = c(0.002, 0)),\n                     breaks = c(seq(1950, 2010, 10)),\n                     limits = c(1950, 2010)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n                     breaks = 0:8 * 5000,\n                     labels = scales::dollar,\n                     limits = c(0, 40000)) +\n labs(x = \"Year\",\n      y = \"Change in per-capita GDP (US dollars)\") +\n facet_wrap(~ country) +\n theme(panel.spacing = unit(20L, \"pt\"))\n```\n\n## Text and Annotation\n\n------------------------------------------------------------------------\n\nSeveral functions can be used to annotate, label, and highlight different parts of plots. `geom_text()` and `geom_text_repel()` both display variables from data frames. `annotate()`, which has several different uses, displays variables and values included in the function call.\n\n### geom_text()\n\n`geom_text()` turns text variables in data sets into geometric objects. This is useful for labeling data in plots. 
Both functions need `x` values and `y` values to determine placement on the coordinate plane, and a text vector of labels.\n\nThis can be used to label `geom_bar()`.\n\n```{r bar-geom_text}\ndiamonds %>%\n group_by(cut) %>%\n summarize(price = mean(price)) %>%\n ggplot(aes(cut, price)) +\n geom_bar(stat = \"identity\") +\n geom_text(aes(label = scales::dollar(price)), vjust = -1) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.2)),\n \t\t\t\t\t\t\t\t\t labels = scales::dollar) +\n labs(title = \"Average Diamond Price by Diamond Cut\",\n x = \"Cut\",\n y = \"Price\") +\n remove_ticks()\n```\n\nIt can also be used to label points in a scatter plot.\n\nIt's rarely useful to label every point in a scatter plot. Use `filter()` to create a second, subsetted data set and pass it to the labeling function.\n\n```{r scatterplot-geom_text}\nlabels <- mtcars %>%\n\trownames_to_column(\"model\") %>%\n\tfilter(model %in% c(\"Toyota Corolla\", \"Merc 240D\", \"Datsun 710\"))\n\nmtcars %>%\n\tggplot() +\n\tgeom_point(mapping = aes(x = wt, y = mpg)) +\n\tgeom_text(data = labels, mapping = aes(x = wt, y = mpg, label = model), nudge_x = 0.38) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 6)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 40)) + \n labs(x = \"Weight (thousands of pounds)\",\n y = \"Miles per gallon (MPG)\") +\n scatter_grid()\n```\n\nText often overlaps with other text or geoms when using `geom_text()`. `library(ggrepel)` is a `library(ggplot2)` add-on that automatically positions text so it doesn't overlap with geoms or other text. To add this functionality, install and load `library(ggrepel)` and then use `geom_text_repel()` with the same syntax as `geom_text()`.\n\n### geom_text_repel()\n\n```{r scatterplot-geom_text_repel}\nlibrary(ggrepel)\n\nlabels <- mtcars %>%\n\trownames_to_column(\"model\") %>%\n\ttop_n(5, mpg)\n\nmtcars %>%\n\tggplot(mapping = aes(x = wt, y = mpg)) +\n\tgeom_point() +\n\tgeom_text_repel(data = labels, \n\t                mapping = aes(label = model), \n\t                nudge_x = 0.38) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 6)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 40)) + \n labs(x = \"Weight (thousands of pounds)\",\n y = \"Miles per gallon (MPG)\") +\n scatter_grid()\n```\n\n### annotate()\n\n`annotate()` doesn't use data frames. Instead, it takes values for `x =` and `y =`. 
It can add text, rectangles, segments, and pointranges; the first two are shown in the examples below, and a sketch of segments and pointranges follows them.\n\n```{r annotate-point}\nmsleep %>%\n filter(bodywt <= 1000) %>%\n ggplot(aes(bodywt, sleep_total)) +\n geom_point() +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(-10, 1000),\n \t\t\t\t\t\t\t\t\t labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 25)) + \n annotate(\"text\", x = 500, y = 12, label = \"These data suggest that heavy \\n animals sleep less than light animals\") +\n labs(x = \"Body weight (kilograms)\",\n y = \"Sleep time (hours)\") +\n scatter_grid() \n```\n\n```{r annotate-rect}\nlibrary(AmesHousing)\n\names <- make_ames()\n\names %>%\n mutate(square_footage = Total_Bsmt_SF - Bsmt_Unf_SF + First_Flr_SF + Second_Flr_SF) %>%\n mutate(Sale_Price = Sale_Price / 1000) %>% \n ggplot(aes(square_footage, Sale_Price)) +\n geom_point(alpha = 0.2) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(-10, 12000),\n \t\t\t\t\t\t\t\t\t labels = scales::comma) + \n scale_y_continuous(expand = expansion(mult = c(0, 0.002)),\n \t\t\t\t\t\t\t\t\t limits = c(0, 800),\n \t\t\t\t\t\t\t\t\t labels = scales::dollar) + \n annotate(\"rect\", xmin = 6800, xmax = 11500, ymin = 145, ymax = 210, alpha = 0.1) +\n annotate(\"text\", x = 8750, y = 230, label = \"Unfinished homes\") +\n labs(x = \"Square footage\", \n y = \"Sale price (thousands)\") +\n scatter_grid() \n```\n
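\nSegments and pointranges follow the same pattern. Here is a minimal sketch with arbitrary values chosen only for illustration:\n\n```{r annotate-segment}\nmtcars %>%\n ggplot(aes(wt, mpg)) +\n geom_point() +\n # a horizontal reference segment\n annotate(\"segment\", x = 2, xend = 4, y = 30, yend = 30) +\n # a point estimate with a vertical range around it\n annotate(\"pointrange\", x = 4.5, y = 15, ymin = 12, ymax = 18) +\n labs(x = \"Weight (1,000 pounds)\",\n y = \"Miles per gallon (MPG)\") +\n scatter_grid()\n```\n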
\n## Layered Geoms\n\n------------------------------------------------------------------------\n\nGeoms can be layered in `ggplot2`. This is useful for design and analysis.\n\nIt is often useful to add points to line plots with a small number of values across the x-axis. This example from [R for Data Science](http://r4ds.had.co.nz/tidy-data.html) shows how changing the lines to grey can make a plot more appealing.\n\n### Design {.tabset}\n\n#### Before\n\n```{r layering-geoms-design}\ntable1 %>%\n\tggplot(aes(x = year, y = cases)) +\n\t\tgeom_line(aes(color = country)) +\n\t\tgeom_point(aes(color = country)) +\n\t\tscale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n\t\t labels = scales::comma) +\n\t\tscale_x_continuous(breaks = c(1999, 2000)) +\n\t\tlabs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n```\n\n#### After\n\n```{r layering-geoms-design-gray}\ntable1 %>%\n\tggplot(aes(year, cases)) +\n\t\tgeom_line(aes(group = country), color = \"grey50\") +\n\t\tgeom_point(aes(color = country)) +\n\t\tscale_y_continuous(expand = expansion(mult = c(0, 0.2)), \n\t\t labels = scales::comma) +\n\t\tscale_x_continuous(breaks = c(1999, 2000)) +\n\t\tlabs(title = \"Changes in Tuberculosis Cases in Three Countries\")\n```\n\n### Centroids\n\n```{r centroids}\nmpg_summary <- mpg %>%\n\tgroup_by(cyl) %>%\n\tsummarize(displ = mean(displ), cty = mean(cty))\n\nmpg %>%\n\tggplot() +\n\tgeom_point(aes(x = displ, y = cty, color = factor(cyl)), alpha = 0.5) +\n\tgeom_point(data = mpg_summary, aes(x = displ, y = cty), size = 5, color = \"#ec008b\") +\n\tgeom_text(data = mpg_summary, aes(x = displ, y = cty, label = cyl)) +\n scale_x_continuous(expand = expansion(mult = c(0, 0.002)), \n limits = c(0, 8)) + \n scale_y_continuous(expand = expansion(mult = c(0, 0)), \n limits = c(0, 40)) +\n\tlabs(x = \"Displacement\",\n\t y = \"City MPG\") +\n scatter_grid()\n```\n\n## Saving Plots\n\n------------------------------------------------------------------------\n\n`ggsave()` exports ggplot2 plots. The function can be used in two ways. First, if `plot =` isn't specified in the function call, then `ggsave()` automatically saves the plot that was last displayed in the Viewer window. Second, if `plot =` is specified, then `ggsave()` saves the specified plot. `ggsave()` guesses the type of graphics device to use in export (.png, .pdf, .svg, etc.) from the file extension in the filename.\n\n``` \nmtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.png\")\n\nplot2 <- mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.png\", plot = plot2)\n```\n\nExported plots rarely look identical to the plots that show up in the Viewer window in RStudio because the overall size and aspect ratio of the Viewer are often different from the defaults in `ggsave()`. Specific sizes, aspect ratios, and resolutions can be controlled with arguments in `ggsave()`, as sketched below. RStudio has a useful [cheatsheet](https://www.rstudio.com/resources/cheatsheets/) called \"How Big is Your Graph?\" that should help with choosing the best size, aspect ratio, and resolution.\n\nFonts are not embedded in PDFs by default. To embed fonts in PDFs, include `device = cairo_pdf` in `ggsave()`.\n\n``` \nplot <- mtcars %>%\n ggplot(aes(x = wt, y = mpg)) +\n geom_point()\n\nggsave(filename = \"cars.pdf\", plot = plot, width = 6.5, height = 4, device = cairo_pdf)\n```\n
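\nAnd here is a minimal sketch of the size and resolution arguments mentioned above, using `plot2` from the first example; the file name and values are placeholders:\n\n``` \nggsave(filename = \"cars.png\", plot = plot2, width = 6.5, height = 4, units = \"in\", dpi = 300)\n```\n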
\",\n \t\t\t\t\t\t\t\t\t\t \t\t\t\t\t\t\t\"Date: \", round(date, 3), \"
\",\n \t\t\t\t\t\t\t\t\t\t \t\t\t\t\t\t\t\"Key: \", key))\n \t\t\t\t\t\t\t\t\t\t ) +\n geom_line() +\n scale_x_continuous(expand = expansion(mult = c(0.002, 0)), \n limits = c(1991, 1999), \n breaks = c(1991, 1993, 1995, 1997, 1999)) +\n scale_y_continuous(expand = expansion(mult = c(0, 0.002)), \n breaks = 0:4 * 2500,\n labels = scales::dollar, \n limits = c(0, 10000)) + \n labs(x = \"Date\",\n y = \"Value\")\n\n# make interactive with gggplotly\n# Uncomment pipe to hide the interative toolbar in the top right \nggplotly(stock_plot, tooltip = \"text\") # %>% config(displayModeBar = FALSE)\n```\n\n## urbnthemes\n\n### Overview\n\n`urbnthemes` is a set of tools for creating Urban Institute-themed plots and maps in R. The package extends `ggplot2` with print and map themes as well as tools that make plotting easier at the Urban Institute. `urbnthemes` replaces the [urban_R_theme](https://github.com/UrbanInstitute/urban_R_theme).\n\nAlways load `library(urbnthemes)` after `library(ggplot2)` or `library(tidyverse)`.\n\n### Usage\n\nUse `set_urbn_defaults(style = \"print\")` to set the default styles. `scatter_grid()`, `remove_ticks()`, `add_axis()`, and `remove_axis()` can all be used to improve graphics.\n\n```{r example, message=FALSE}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nggplot(data = mtcars, mapping = aes(factor(cyl))) +\n geom_bar() + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Number of Cylinders\",\n y = \"Count\") +\n remove_ticks()\n```\n\n### Combining elements\n\n`library(urbnthemes)` contains functions for combining plot elements into graphics. `urbn_plot()` brings all of the elements together.\n\n- `urbn_logo_text()`\n- `remove_ticks()`\n- `remove_axis()`\n- `scatter_grid()`\n- `add_axis()`\n- `urbn_geofacet`\n\n```{r example2}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nplot <- ggplot(data = mtcars, mapping = aes(factor(cyl))) +\n geom_bar() + \n scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +\n labs(x = \"Number of Cylinders\",\n y = \"Count\") +\n remove_ticks()\n\nurbn_plot(plot, urbn_logo_text(), ncol = 1, heights = c(30, 1))\n```\n\nSometimes it's important to horizontally add the y-axis title above the plot. `urbn_y_title()` can be sued for this task. The following example goes one step further and adds the title between the legend and the plot.\n\n```{r}\nlibrary(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults()\n\nplot <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = factor(cyl))) +\n geom_point() + \n\tscale_x_continuous(expand = c(0, 0),\n\t\t\t\t\t\t\t\t\t\t limits = c(0, 8)) +\n scale_y_continuous(expand = c(0, 0),\n \t\t\t\t\t\t\t\t\t limits = c(0, 40)) +\n remove_ticks() +\n\tlabs(\"\") +\n\tscatter_grid()\n\nurbn_plot(get_legend(plot),\n\t\t\t\t\turbn_y_title(\"Miles per gallon\"),\n\t\t\t\t\tremove_legend(plot), \n\t\t\t\t\turbn_logo_text(), \n\t\t\t\t\tncol = 1, \n\t\t\t\t\theights = c(3, 1, 30, 1))\n```\n\n### Palettes\n\n`urbnthemes` contains many quick-access color palettes from the [Urban Institute Data Visualization Style Guide](http://urbaninstitute.github.io/graphics-styleguide/). 
\n### Utility functions\n\n`library(urbnthemes)` contains four functions that are helpful with managing font installations:\n\n- `lato_test()`\n- `lato_install()`\n- `fontawesome_test()`\n- `fontawesome_install()`\n\n## Bibliography and Session Information\n\n------------------------------------------------------------------------\n\n*Note:* Examples presented in [this document](https://awunderground.github.io/ggplot2-themes/) by Aaron Williams were created during personal time.\n\nBob Rudis and Dave Gandy (2017). waffle: Create Waffle Chart Visualizations in R. R package version 0.7.0. https://CRAN.R-project.org/package=waffle\n\nChester Ismay and Jennifer Chunn (2017). fivethirtyeight: Data and Code Behind the Stories and Interactives at 'FiveThirtyEight'. R package version 0.3.0. https://CRAN.R-project.org/package=fivethirtyeight\n\nHadley Wickham (2009). ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York.\n\nHadley Wickham (2017). tidyverse: Easily Install and Load the 'Tidyverse'. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\n\nHadley Wickham (2017). forcats: Tools for Working with Categorical Variables (Factors). R package version 0.2.0. https://CRAN.R-project.org/package=forcats\n\nJennifer Bryan (2017). gapminder: Data from Gapminder. R package version 0.3.0. https://CRAN.R-project.org/package=gapminder\n\nKamil Slowikowski (2017). ggrepel: Repulsive Text and Label Geoms for 'ggplot2'. R package version 0.7.0. https://CRAN.R-project.org/package=ggrepel\n\nMax Kuhn (2017). AmesHousing: The Ames Iowa Housing Data. R package version 0.0.3. https://CRAN.R-project.org/package=AmesHousing\n\nPeter Kampstra (2008). Beanplot: A Boxplot Alternative for Visual Comparison of Distributions. Journal of Statistical Software. https://www.jstatsoft.org/article/view/v028c01\n\nR Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.\n\nWinston Chang (2014). extrafont: Tools for using fonts. R package version 0.17. https://CRAN.R-project.org/package=extrafont\n\nYihui Xie (2018). knitr: A General-Purpose Package for Dynamic Report Generation in R. 
R package version 1.19.\n\n```{r System Info and Package Versioning}\nsessionInfo()\n```\n"},"formats":{"html":{"identifier":{"display-name":"HTML","target-format":"html","base-format":"html"},"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"inline-includes":false,"preserve-yaml":false,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[],"notebook-links":true,"format-links":true},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"graphics-guide.html"},"language":{"toc-title-document":"Table of contents","toc-title-website":"On this page","related-formats-title":"Other Formats","related-notebooks-title":"Notebooks","source-notebooks-prefix":"Source","section-title-abstract":"Abstract","section-title-appendices":"Appendices","section-title-footnotes":"Footnotes","section-title-references":"References","section-title-reuse":"Reuse","section-title-copyright":"Copyright","section-title-citation":"Citation","appendix-attribution-cite-as":"For attribution, please cite this work as:","appendix-attribution-bibtex":"BibTeX citation:","title-block-author-single":"Author","title-block-author-plural":"Authors","title-block-affiliation-single":"Affiliation","title-block-affiliation-plural":"Affiliations","title-block-published":"Published","title-block-modified":"Modified","callout-tip-title":"Tip","callout-note-title":"Note","callout-warning-title":"Warning","callout-important-title":"Important","callout-caution-title":"Caution","code-summary":"Code","code-tools-menu-caption":"Code","code-tools-show-all-code":"Show All Code","code-tools-hide-all-code":"Hide All Code","code-tools-view-source":"View Source","code-tools-source-code":"Source Code","code-line":"Line","code-lines":"Lines","copy-button-tooltip":"Copy to Clipboard","copy-button-tooltip-success":"Copied!","repo-action-links-edit":"Edit this page","repo-action-links-source":"View source","repo-action-links-issue":"Report an issue","back-to-top":"Back to top","search-no-results-text":"No results","search-matching-documents-text":"matching documents","search-copy-link-title":"Copy link to search","search-hide-matches-text":"Hide additional matches","search-more-match-text":"more match in this document","search-more-matches-text":"more matches in this 
document","search-clear-button-title":"Clear","search-detached-cancel-button-title":"Cancel","search-submit-button-title":"Submit","search-label":"Search","toggle-section":"Toggle section","toggle-sidebar":"Toggle sidebar navigation","toggle-dark-mode":"Toggle dark mode","toggle-reader-mode":"Toggle reader mode","toggle-navigation":"Toggle navigation","crossref-fig-title":"Figure","crossref-tbl-title":"Table","crossref-lst-title":"Listing","crossref-thm-title":"Theorem","crossref-lem-title":"Lemma","crossref-cor-title":"Corollary","crossref-prp-title":"Proposition","crossref-cnj-title":"Conjecture","crossref-def-title":"Definition","crossref-exm-title":"Example","crossref-exr-title":"Exercise","crossref-ch-prefix":"Chapter","crossref-apx-prefix":"Appendix","crossref-sec-prefix":"Section","crossref-eq-prefix":"Equation","crossref-lof-title":"List of Figures","crossref-lot-title":"List of Tables","crossref-lol-title":"List of Listings","environment-proof-title":"Proof","environment-remark-title":"Remark","environment-solution-title":"Solution","listing-page-order-by":"Order By","listing-page-order-by-default":"Default","listing-page-order-by-date-asc":"Oldest","listing-page-order-by-date-desc":"Newest","listing-page-order-by-number-desc":"High to Low","listing-page-order-by-number-asc":"Low to High","listing-page-field-date":"Date","listing-page-field-title":"Title","listing-page-field-description":"Description","listing-page-field-author":"Author","listing-page-field-filename":"File Name","listing-page-field-filemodified":"Modified","listing-page-field-subtitle":"Subtitle","listing-page-field-readingtime":"Reading Time","listing-page-field-categories":"Categories","listing-page-minutes-compact":"{0} min","listing-page-category-all":"All","listing-page-no-matches":"No matching items"},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.3.433","editor_options":{"chunk_output_type":"console"}},"extensions":{"book":{"multiFile":true}}}},"projectFormats":["html"]} \ No newline at end of file diff --git a/.quarto/idx/index.qmd.json b/.quarto/idx/index.qmd.json index 626c16b..d1d8383 100644 --- a/.quarto/idx/index.qmd.json +++ b/.quarto/idx/index.qmd.json @@ -1 +1 @@ -{"title":"R Users Group","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"output_dir":"."}}},"headingText":"R Users Group","containsRefs":false,"markdown":"\n\n\n\n
\n\n
\n\n\nThis website contains resources for using R at the Urban Institute for analysis, visualization, mapping, and more. Click on the links above to get started learning about R!\n\n*The Urban Institute R Users Group is committed to exposing researchers to the joy and power of R; developing beginner, intermediate, and advanced R skills; encouraging and supporting novel applications of R to public policy research; and building a diverse and mutually supportive community of R Users.*\n\n\n\n\"gif\n\ngif credits: [Allison Horst](https://twitter.com/allison_horst)\n\n## Sign up for List Serv!\n\nPlease fill out the following form to receive email updates about upcoming RUG events and trainings. We promise not to spam your inbox:\n\n###\n\n###\n\n
\n\n
\n\n
\n\nFill out [this Smartsheet form](https://app.smartsheet.com/b/form/0e9d04ced47b489b8d14971ae6c2fb15) to unsubscribe from the RUG List Serv.\n\n
\n\n## Contact Info\n\nPlease don't hesitate to contact Aaron Williams (awilliams@urban.org) or Amy Rogin (arogin@urban.org) with any thoughts or questions about R at the Urban Institute. \n\n## R Lunch Labs\n\nThe Urban Institute R Users Group hosts weekly lunch labs. R Lunch Labs are hands-on trainings for R users of all skill levels and soon-to-be R users. Each meeting begins with a 5-10 minute quick tip. Afterwards, attendees break into small groups and work on a range of topics including introduction to R, data management and plotting, mapping, and machine learning. Most users bring laptops, but there are a few extras for users without laptops. \n\nWe have currently paused R Lunch Labs, but they will be back soon! If you have an idea for a topic you want to present informally at a lunch lab, please let us know!\n\n"},"formats":{"html":{"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"output_dir":"."}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"markdown"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[]},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"index.html"},"language":{},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.2.269"},"extensions":{"book":{"multiFile":true}}}}} \ No newline at end of file +{"title":"R Users Group","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"output_dir":"."}}},"headingText":"R Users Group","containsRefs":false,"markdown":"\n\n\n\n
\n\n
\n\n\nThis website contains resources for using R at the Urban Institute for analysis, visualization, mapping, and more. Click on the links above to get started learning about R!\n\n*The Urban Institute R Users Group is committed to exposing researchers to the joy and power of R; developing beginner, intermediate, and advanced R skills; encouraging and supporting novel applications of R to public policy research; and building a diverse and mutually supportive community of R Users.*\n\n\n\n\"gif\n\ngif credits: [Allison Horst](https://twitter.com/allison_horst)\n\n## Sign up for List Serv!\n\nPlease fill out the following form to receive email updates about upcoming RUG events and trainings. We promise not to spam your inbox:\n\n###\n\n###\n\n
\n\n
\n\n
\n\nFill out [this Smartsheet form](https://app.smartsheet.com/b/form/0e9d04ced47b489b8d14971ae6c2fb15) to unsubscribe from the RUG List Serv.\n\n
\n\n## Contact Info\n\nPlease don't hesitate to contact Aaron Williams (awilliams@urban.org) or Amy Rogin (arogin@urban.org) with any thoughts or questions about R at the Urban Institute. \n\n## R Lunch Labs\n\nThe Urban Institute R Users Group hosts weekly lunch labs. R Lunch Labs are hands-on trainings for R users of all skill levels and soon-to-be R users. Each meeting begins with a 5-10 minute quick tip. Afterwards, attendees break into small groups and work on a range of topics including introduction to R, data management and plotting, mapping, and machine learning. Most users bring laptops, but there are a few extras for users without laptops. \n\nWe have currently paused R Lunch Labs, but they will be back soon! If you have an idea for a topic you want to present informally at a lunch lab, please let us know!\n\n","srcMarkdownNoYaml":"\n\n\n\n
\n\n
\n\n## R Users Group\n\nThis website contains resources for using R at the Urban Institute for analysis, visualization, mapping, and more. Click on the links above to get started learning about R!\n\n*The Urban Institute R Users Group is committed to exposing researchers to the joy and power of R; developing beginner, intermediate, and advanced R skills; encouraging and supporting novel applications of R to public policy research; and building a diverse and mutually supportive community of R Users.*\n\n\n\n\"gif\n\ngif credits: [Allison Horst](https://twitter.com/allison_horst)\n\n## Sign up for List Serv!\n\nPlease fill out the following form to receive email updates about upcoming RUG events and trainings. We promise not to spam your inbox:\n\n###\n\n###\n\n
\n\n
\n\n
\n\nFill out [this Smartsheet form](https://app.smartsheet.com/b/form/0e9d04ced47b489b8d14971ae6c2fb15) to unsubscribe from the RUG List Serv.\n\n
\n\n## Contact Info\n\nPlease don't hesitate to contact Aaron Williams (awilliams@urban.org) or Amy Rogin (arogin@urban.org) with any thoughts or questions about R at the Urban Institute. \n\n## R Lunch Labs\n\nThe Urban Institute R Users Group hosts weekly lunch labs. R Lunch Labs are hands-on trainings for R users of all skill levels and soon-to-be R users. Each meeting begins with a 5-10 minute quick tip. Afterwards, attendees break into small groups and work on a range of topics including introduction to R, data management and plotting, mapping, and machine learning. Most users bring laptops, but there are a few extras for users without laptops. \n\nWe have currently paused R Lunch Labs, but they will be back soon! If you have an idea for a topic you want to present informally at a lunch lab, please let us know!\n\n"},"formats":{"html":{"identifier":{"display-name":"HTML","target-format":"html","base-format":"html"},"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"output_dir":"."}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"markdown"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"inline-includes":false,"preserve-yaml":false,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[],"notebook-links":true,"format-links":true},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"index.html"},"language":{"toc-title-document":"Table of contents","toc-title-website":"On this page","related-formats-title":"Other Formats","related-notebooks-title":"Notebooks","source-notebooks-prefix":"Source","section-title-abstract":"Abstract","section-title-appendices":"Appendices","section-title-footnotes":"Footnotes","section-title-references":"References","section-title-reuse":"Reuse","section-title-copyright":"Copyright","section-title-citation":"Citation","appendix-attribution-cite-as":"For attribution, please cite this work as:","appendix-attribution-bibtex":"BibTeX citation:","title-block-author-single":"Author","title-block-author-plural":"Authors","title-block-affiliation-single":"Affiliation","title-block-affiliation-plural":"Affiliations","title-block-published":"Published","title-block-modified":"Modified","callout-tip-title":"Tip","callout-note-title":"Note","callout-warning-title":"Warning","callout-important-title":"Important","callout-caution-title":"Caution","code-summary":"Code","code-tools-menu-caption":"Code","code-tools-show-all-code":"Show All Code","code-tools-hide-all-code":"Hide All Code","code-tools-view-source":"View Source","code-tools-source-code":"Source 
Code","code-line":"Line","code-lines":"Lines","copy-button-tooltip":"Copy to Clipboard","copy-button-tooltip-success":"Copied!","repo-action-links-edit":"Edit this page","repo-action-links-source":"View source","repo-action-links-issue":"Report an issue","back-to-top":"Back to top","search-no-results-text":"No results","search-matching-documents-text":"matching documents","search-copy-link-title":"Copy link to search","search-hide-matches-text":"Hide additional matches","search-more-match-text":"more match in this document","search-more-matches-text":"more matches in this document","search-clear-button-title":"Clear","search-detached-cancel-button-title":"Cancel","search-submit-button-title":"Submit","search-label":"Search","toggle-section":"Toggle section","toggle-sidebar":"Toggle sidebar navigation","toggle-dark-mode":"Toggle dark mode","toggle-reader-mode":"Toggle reader mode","toggle-navigation":"Toggle navigation","crossref-fig-title":"Figure","crossref-tbl-title":"Table","crossref-lst-title":"Listing","crossref-thm-title":"Theorem","crossref-lem-title":"Lemma","crossref-cor-title":"Corollary","crossref-prp-title":"Proposition","crossref-cnj-title":"Conjecture","crossref-def-title":"Definition","crossref-exm-title":"Example","crossref-exr-title":"Exercise","crossref-ch-prefix":"Chapter","crossref-apx-prefix":"Appendix","crossref-sec-prefix":"Section","crossref-eq-prefix":"Equation","crossref-lof-title":"List of Figures","crossref-lot-title":"List of Tables","crossref-lol-title":"List of Listings","environment-proof-title":"Proof","environment-remark-title":"Remark","environment-solution-title":"Solution","listing-page-order-by":"Order By","listing-page-order-by-default":"Default","listing-page-order-by-date-asc":"Oldest","listing-page-order-by-date-desc":"Newest","listing-page-order-by-number-desc":"High to Low","listing-page-order-by-number-asc":"Low to High","listing-page-field-date":"Date","listing-page-field-title":"Title","listing-page-field-description":"Description","listing-page-field-author":"Author","listing-page-field-filename":"File Name","listing-page-field-filemodified":"Modified","listing-page-field-subtitle":"Subtitle","listing-page-field-readingtime":"Reading Time","listing-page-field-categories":"Categories","listing-page-minutes-compact":"{0} min","listing-page-category-all":"All","listing-page-no-matches":"No matching items"},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.3.433"},"extensions":{"book":{"multiFile":true}}}},"projectFormats":["html"]} \ No newline at end of file diff --git a/.quarto/idx/intro-to-r.qmd.json b/.quarto/idx/intro-to-r.qmd.json index d30fb0c..c1ba4b1 100644 --- a/.quarto/idx/intro-to-r.qmd.json +++ b/.quarto/idx/intro-to-r.qmd.json @@ -1 +1 @@ -{"title":"Introduction","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}}},"headingText":"Introduction","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n```{r markdown setup, include=FALSE}\n\nknitr::opts_chunk$set(fig.path = \"intro-to-r/www/images/\")\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\nknitr::opts_chunk$set(warning = FALSE)\n\noptions(scipen = 999)\n```\n\n\nR is one of two premier programming languages for data science and one of the [fastest growing programming languages.](https://stackoverflow.blog/2017/10/10/impressive-growth-r/) Created by researchers for researchers (with some 
help from software engineers), R offers rich, intuitive tools that make it perfect for visualization, public policy analysis, econometrics, geospatial analysis, and statistics.\n\nR doesn't come in a box. R was never wrapped in cellophane and it definitely isn't purchased at a store. R's pricelessness and open-source development are two of its greatest strengths, but they can leave new users without the anchor of the box and booklet provided with proprietary software.\n\nThis guide is meant to be an on-ramp for soon-to-be R Users and a fill-in-the-gap guide for existing R Users. It starts with the most basic question, \"what is R?\" and progresses to advanced topics like organizing analyses. Along the way it even demonstrates how to read XKCD comics in R.\n\nR boasts a strong community around the world and inside the Urban Institute. Please don't hesitate to contact Aaron Williams (awilliams\\@urban.org) or Amy Rogin (arogin\\@urban.org) with thoughts or questions about R.\n\n## What is R?\n\n![](intro-to-r/images/r-logo.png){width=\"15%\"}\n\n[Source](https://www.r-project.org/logo/)\n\nR is free, open-source software for statistical computing. It is known for intuitive, crisp graphics and an extensive, growing library of statistical and analytic methods. Above all, R boasts an enthusiastic community of developers, instructors, and users.\n\nThe copyright and documentation for R are held by a not-for-profit organization called [The R Foundation](https://www.r-project.org/foundation/).\n\n![](intro-to-r/images/r-studio-logo.png){width=\"15%\"}\n\n[Source, Fair use](https://en.wikipedia.org/w/index.php?curid=48590482)\n\nRStudio is a free, open-source integrated development environment (IDE) that runs on top of R. In practice, R users almost exclusively open RStudio and rarely directly open R.\n\nRStudio is developed by a for-profit company called [RStudio](https://www.rstudio.com/). RStudio, the company, employs some of the R community's most prolific, open-source developers and creates many open-source tools and trainings.\n\nWhile R code can be written in any text editor, the RStudio IDE is a powerful tool with a console, syntax-highlighting, and debugging tools. [This cheatsheet](https://github.com/rstudio/cheatsheets/raw/master/rstudio-ide.pdf) outlines the power of RStudio.\n\n## Installation and Updates\n\n------------------------------------------------------------------------\n\n### When should you update?\n\nAll Urban computers should come pre-installed with R and RStudio. However, your R version may be out of date and require updating. We recommend R version 3.6.0 or higher. You can check which version of R you have installed by opening RStudio and submitting the following line of code to the console: `R.Version()$version.string`.\n\nIf you're working on a personal computer, you may not have R or RStudio installed, so follow this guide to install both on your computer.\n\n### Updating/Installing R\n\n1) Visit https://cran.r-project.org/bin/windows/base/. The latest R version will be the downloadable link at the top. As of 1/1/2020, that R version is 3.6.2. Click on the link at the top and download the `R-x.x.x-win.exe` file.\n\n2) Open the `R-x.x.x-win.exe` file. Click Next, accept all the defaults, and install R. After R has been installed, click the Finish button. You should not need admin privileges for this.\n\n3) Check that your version of R has been updated in RStudio. If RStudio is already open, first close it. 
Then open RStudio and resubmit `R.Version()$version.string`. You should see an updated version number printed out on the console.\n\n4) Test that R packages are loading as expected. Packages you already had installed should continue to work with newer versions of R. But in some cases, you may need to re-install the packages to work properly with new versions of R.\n\n### Updating/Installing RStudio\n\n1) Open RStudio and go to Help \\> Check for Updates to see if RStudio is up-to-date.\n\n2) If it is out-of-date, download the [appropriate update](https://rstudio.com/products/rstudio/download/#download).\n\n3) Before you run the installer, contact IT at helpdesk\\@urban.org for administrative approval, as the program requires admin access.\n\n4) Run the installer and accept all defaults.\n\nMoving forward, RStudio will automatically and regularly update on Windows computers at the Urban Institute.\n\n## Learning R\n\n------------------------------------------------------------------------\n\n### What to Learn\n\nThere is often more than one way to accomplish a goal in R because of the language's flexibility. At first, this flexibility can be overwhelming. That's why it is useful to pick and master one set of tools in R before branching out and learning everything in R.\n\nFortunately, [Hadley Wickham's tidyverse](https://www.tidyverse.org/) offers a comprehensive set of tools for data analysis that are good for both beginners and experts. The tidyverse is self-described as \"an opinionated collection of R packages designed for data science.\" The tidyverse consists of almost two dozen clear and concise tools for every part of an analysis workflow. At first, focus on the function `read_csv()` for loading data, the package `dplyr` for manipulating data, and the package `ggplot2` for plotting.\n\nHere's a quick example that reads a .csv, filters the data, and creates a publishable column plot in just fifteen lines of code:\n\n```{r quick example}\n# load packages and source the Urban Institute ggplot2 theme\nlibrary(tidyverse) # contains read_csv, library(dplyr), and library(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\n# read bankdata.csv\nbank <- read_csv(\"intro-to-r/data/bankdata.csv\") \n\nbank_subset <- bank %>%\n\t# filter to observations of unmarried mothers less than age 30\n\tfilter(married == \"NO\" & age < 30) %>%\t\n\t# drop all variables except children and income\n\tselect(children, income)\t\t\t\t\t\t\t\t\n\n# plot!\nbank_subset %>%\n\tggplot(mapping = aes(x = children, y = income)) +\n\tgeom_bar(stat = \"summary\", fun.y = \"mean\") +\n\tscale_y_continuous(expand = c(0, 0), labels = scales::dollar) +\n\tlabs(title = \"Mean income\",\n\t\t\t subtitle = \"Unmarried mothers less than age 30\",\n\t\t\t caption = \"Urban Institute analysis of bank data\",\n\t\t\t x = \"Number of children\",\n\t\t\t y = \"Income\")\n```\n\n### Resources for Learning\n\n*R for Data Science* by Hadley Wickham and Garrett Grolemund is the best print resource for learning R and the tidyverse. The book is available [online](http://r4ds.had.co.nz/index.html) for free and *begins* with visualization, which is motivating and practical. *R for Data Science* contains dozens of worthwhile exercises but no solutions guide. 
Please check your solutions against the [Urban Institute r4ds solutions guide on GitHub](https://github.com/UI-Research/r4ds-exercises.git) and contribute if the exercise isn't already in the guide!\n\nRStudio publishes a number of cheat sheets that cover the tidyverse. The main cheat sheets can be accessed in RStudio at Help \\> Cheat Sheets. Additional cheat sheets are accessible here on the [RStudio website](https://www.rstudio.com/resources/cheatsheets/).\n\nDavid Robinson, a data scientist from DataCamp, has a new [video course](https://www.datacamp.com/instructors/drobinson) about the tidyverse. Few people know as much about R and communicate as effectively as David Robinson.\n\n*Advanced R* by Hadley Wickham is a good resource for new R users who have experience with other programming languages and computer science. It is available [online](http://adv-r.had.co.nz/) for free.\n\n### Library\n\nIt's easy to feel overwhelmed by the frenetic development of the extended R universe. Books are an invaluable resource for slowing down and focusing on fully-formed ideas.\n\nAaron Williams (awilliams\\@urban.org) has a number of books that can be checked out:\n\n- [The Art of R Programming](https://www.nostarch.com/artofr.htm)\n- [ggplot2](http://www.springer.com/us/book/9780387981413)\n- [Efficient R Programming](http://shop.oreilly.com/product/0636920047995.do) ([Online!](https://csgillespie.github.io/efficientR/))\n- [Text Mining with R](http://shop.oreilly.com/product/0636920067153.do) ([Online!](https://www.tidytextmining.com/))\n- [Reasoning with Data](https://www.guilford.com/books/Reasoning-with-Data/Jeffrey-Stanton/9781462530267/reviews)\n- [Practical Statistics for Data Scientists](http://shop.oreilly.com/product/0636920048992.do)\n\n### Built-in Data Sets\n\nR has many built-in data sets that are useful for practice, and even more data sets are accessible through R packages.\n\nSubmitting `data()` shows a list of all available data sets. `cars` and `iris` are two classic sets that are used in many examples.\n\n`library(tidyverse)` loads many more \"tidy\" data sets including `diamonds` and `starwars`.\n\n```{r tidyverse}\nlibrary(tidyverse)\nstarwars %>%\n\tcount(species) %>%\n\tarrange(desc(n)) %>%\n\thead()\n```\n\n`library(dslabs)` by [Rafael Irizarry](https://simplystatistics.org/2018/01/22/the-dslabs-package-provides-datasets-for-teaching-data-science/) includes varied, intentionally imperfect data sets that are useful for practice. Students of econometrics will enjoy `library(wooldridge)`. It loads 105 data sets from *Introductory Econometrics: A Modern Approach* by Jeffrey Wooldridge. Now you can practice estimating your hedonic pricing models in R!\n\n```{r psid}\nlibrary(wooldridge)\nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nas_tibble(hprice1) %>%\n\tggplot(aes(x = sqrft, y = price)) +\n\tgeom_point() +\n\tscale_y_continuous(expand = c(0, 0), lim = c(0, 800)) +\n\tlabs(title = '\"hprice1\" data from Wooldridge') \n```\n\n### Getting Help\n\nEven the best R programmers spend hours each week searching the Internet for answers. Here are some of the best ways to find answers:\n\nSubmit `?` and any function name without parentheses (ex. `?mean`) to see the function documentation in RStudio.\n\nWhen Googling, set the search range to the last year to avoid out-of-date solutions and to focus on up-to-date practices.\n\n[Stack Overflow](https://stackoverflow.com/) contains numerous solutions. 
Add `[r]` to any search to limit results to R. If a problem is particularly perplexing, it is simple to submit questions. Exercise caution when submitting questions because the Stack Overflow community has strict norms about questions and loose norms about respecting novices.\n\n[RStudio Community](https://community.rstudio.com/) is a new forum for R Users. It has a smaller back catalog than Stack Overflow, but its users are friendlier.\n\nFinally, Aaron Williams (awilliams\\@urban.org) from IBP and Amy Rogin (arogin\\@urban.org) from METRO are available to solve problems, offer guidance, and share R enthusiasm.\n\n### CRAN Task Views\n\nR has sub-communities, frameworks, and tools focused on different subject-matter and methodological areas. [CRAN Task Views](https://cran.r-project.org/web/views/) is invaluable for understanding these communities and finding the best frameworks and tools for different disciplines in R.\n\nCRAN Task Views has 35 pages focused on subcategories of R ranging from [econometrics](https://cran.r-project.org/web/views/Econometrics.html) to natural language processing. Each page is maintained by a subject-matter expert and contains methods, packages, books, and mailing lists that are useful for researchers.\n\nThe econometrics page alone contains detailed information on basic linear regression, microeconometrics, instrumental variables, panel data models, further regression models, time series data and models, data sets, CRAN packages, articles, books, and more.\n\n## R Code\n\n------------------------------------------------------------------------\n\nIt's time to start writing R code. Remember, most R users never open R and exclusively use RStudio. Go ahead and open R once to admire its dated text editor. Then, close R and never directly open it again. Now, open RStudio.\n\n### Submitting Code\n\nRStudio has four main panels: code editor (top left by default), R console (bottom left by default), environment and history (top right by default), and files, plots, packages, help, and viewer pane (bottom right by default).\n\nThere are two main ways to submit code:\n\n1) Type code to the right of ![](intro-to-r/images/code-console.png) in the R console and hit Enter. **Note:** R won't create a long-term record of this code.\n2) Click ![](intro-to-r/images/new-script.png) in the top left to create a new R script in the code editor panel. Type code in the script. Highlight desired code and either click Run in the top right of the code editor panel or type Ctrl/command-enter to run code. Scripts can be saved, so they are the best way to write code that will be used again.\n\nFor practice, submit `state.name` in the R console to create a vector with all fifty state names (sorry statehood advocates, no Washington, D.C.). Next, create a script, paste `state.name`, highlight the text, and click Run at the top right of the code editor. You should get the same output both times.\n\n```{r state names}\nstate.name\n```\n\n### Syntax\n\nThere are five fundamental pieces of syntax in R.\n\n- `<-` is the assignment operator. An object created on the right side of an assignment operator is assigned to a name on the left side of an assignment operator. Assignment operators are important for saving the consequences of operations and functions. Operations without assignment operators will typically be printed to the console but not saved.\n- `#` begins a comment. Comments are useful for explaining decisions in scripts. 
As Hadley Wickham notes in the [tidyverse style guide](http://style.tidyverse.org/), 'In code, use comments to explain the \"why\" not the \"what\" or \"how\".'\n- `c()` combines similar vectors into larger vectors. For example, `c(1, 2, 3)` is a numeric vector of length three made up of three numeric vectors of length one.\n- `?` in front of any function name without parentheses returns function documentation. For example, `?mean`.\n- `%>%` from `library(magrittr)` and `library(tidyverse)` is the \"pipe operator\". It passes the output from one function to another function. This is useful because strings of operations can be \"piped\" together instead of each individual operation needing to be assigned to an object.\n\n### Vectors\n\nVectors are the fundamental piece of data in R. R has six vector types (a single vector can't mix types): logical, integer, double, character, complex, and raw. You can check the type of a vector with `typeof()` and the length with `length()`.\n\n### Data frames\n\nData frames are combinations of equal length vectors. Data analysis in R is built around data frames. As a guiding principle when working with data frames, aim for \"tidy data\" whenever possible. A tidy data frame means that:\n\n1. Each variable has its own column.\n\n2. Each observation has its own row.\n\n3. Each value has its own cell.\n\n[![\\[Source\\](https://r4ds.had.co.nz/tidy-data.html)](intro-to-r/images/tidy-data.png)](https://r4ds.had.co.nz/tidy-data.html)\n\nHaving data in a tidy format allows R's vectorized nature to shine and many of the `tidyverse` functions are designed for tidy data.\n\n### Missing values\n\nR stores missing values as `NA`. A single `NA` in a calculation can cause the entire result to return as `NA`.\n\n```{r}\nsum(c(2, 2, NA))\n```\n\nThe contagiousness of `NA` is good: it makes users explicitly acknowledge dropping missing values with `na.rm = TRUE`.\n\n```{r}\nsum(c(2, 2, NA), na.rm = TRUE)\n```\n\n`== NA` does not test for missing values. Instead, use `is.na()`. Because booleans are treated as 0s and 1s in arithmetic, `sum(is.na(x))` counts the missing values in a vector, and `complete.cases()` flags the rows of a data frame that have no missing values.\n
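\nHere is a minimal sketch of both tools; the values are arbitrary:\n\n```{r missing-value-tools}\nx <- c(2, 2, NA)\n\n# is.na() tests each element for missingness\nis.na(x)\n\n# booleans are treated as 0s and 1s, so the sum counts the missing values\nsum(is.na(x))\n\n# complete.cases() flags rows with no missing values\ntoy_data <- data.frame(a = c(1, NA, 3), b = c(\"x\", \"y\", NA))\ncomplete.cases(toy_data)\n```\n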
\n### Functions\n\nFunctions in R are collections of code that, when called, perform certain actions. R contains hundreds of functions, and thousands more can be accessed through packages.\n\nMost functions take arguments. For example, the function `mean()` has arguments `x`, `trim`, `na.rm`, and `...`. The first argument in most functions, in this case `x`, is an input object. Arguments can be passed to functions by name or position. `mean(c(1, 2, 3))` is equivalent to `mean(x = c(1, 2, 3))`.\n\nNotice how the other three arguments were skipped. Most arguments in functions have default values. The best way to see default values is to submit the function name with a question mark, like `?mean`. In this case, `trim = 0`, `na.rm = FALSE`, and no further arguments were passed through with `...`.\n\nIn the previous example, the `c()` function was nested inside of the `mean()` function. It is also possible to assign a vector of 1, 2, and 3 to a name and pass the name to the mean function.\n\n```{r mean, eval = FALSE}\napples <- c(1, 2, 3)\n\nmean(apples)\n```\n\nR is a [functional programming language](http://adv-r.had.co.nz/Functional-programming.html). In addition to having many pre-made functions like `mean()`, R has powerful tools for creating and manipulating custom functions. This is useful because:\n\n- It avoids tedious and error-prone copying-and-pasting and makes iterating processes simple;\n- It is a powerful way to organize sets of operations;\n- It is a standardized way to save code for later and to share operations with others.\n\nThis last bullet is key to the package system in R.\n\n### Packages\n\nOpening RStudio automatically loads \"base R\", a fundamental collection of code and functions that handles simple operations like math and system management. R can be extended with collections of code and functions developed by the R community called packages. This sounds wild, but most packages are created and maintained by some of the best statisticians and developers in the world.\n\nMost packages can be installed with `install.packages(\"dplyr\")`, where the string between the quotation marks is the name of the package. Packages installed with `install.packages()` come from CRAN and must pass certain checks for performance and documentation. Popular packages on CRAN, like dplyr, have as much support, standardization, and quality as code in proprietary software packages like Stata or SAS, if not more.\n\nIt is possible, but less common, to install packages from places like GitHub. This is less secure and the functionality of the packages is more likely to change over time. `install.packages()` need only be run once per package version per machine and should rarely be included in .R scripts.\n\nPackages are loaded once per R session with the function `library()`. It is a good idea to include `library(package-name)` at the top of scripts for each package used in the script. This way it is obvious at the top of the script which packages are installed and loaded.\n\n**Note:** `install.packages()` uses quoted package names and `library()` uses unquoted package names.\n\nFor practice, submit the following three lines of code to install `RXKCD`, load `library(RXKCD)`, and get a random [XKCD comic](https://www.xkcd.com/).\n\n```{r xkcd, eval=FALSE}\ninstall.packages(\"RXKCD\")\nlibrary(RXKCD)\ngetXKCD(\"random\")\n```\n\n```{r xkcd run, echo=FALSE}\nlibrary(RXKCD)\n# assignment to hide text output\ncomic <- getXKCD(539)\n```\n\nPackages are frequently updated, especially around the time R versions change. The easiest way to update packages is Tools \\> Check for Package Updates in RStudio.\n\nOccasionally, two loaded packages will have functions with identical names. Any conflicts will be announced when loading packages. See how `filter()` and `lag()` from `library(tidyverse)` and `library(stats)` conflict:\n\n![](intro-to-r/images/load-tidyverse.png) In this case, the tidyverse functions are usually favored. If there is ever a conflict or any doubt about which function is used, use the package name and `::` to directly call the function. For example, `dplyr::select(apples)`. `::` can also be used to call a function without loading the entire package.\n\n### CRAN\n\nThe [Comprehensive R Archive Network](https://cran.r-project.org/index.html) (CRAN) contains almost 12,000 packages contributed over the last two decades by a range of developers. New packages are added to CRAN almost every day.\n\nCRAN enables R to have all of the benefits of open-source development and the security and predictability of proprietary statistical packages like SAS and Stata. CRAN weds the benefits of broad-based, real-time package development with certain [standards](https://cran.r-project.org/index.html) for functionality and documentation. 
Methods and tools often make it to R before SAS or Stata, if they ever make it to SAS or Stata, and R's package standards generally exceed those of Python and other open-source languages. (See: [Malicious Libraries Found on Python Package Index (PyPI)](https://www.blog.pythonlibrary.org/2017/09/15/malicious-libraries-found-on-python-package-index-pypi/))\n\nBecause of CRAN's long history and R's place in the statistics community, CRAN contains many methods that can't be accessed, much less duplicated, using proprietary software. In addition to being useful now, this also ensures that R isn't a temporary fad and will have staying power because of the challenge of replicating or besting CRAN.\n\nR's extensible design is important, but most tasks can be accomplished with a handful of packages:\n\n- `ggplot2` data visualization\n- `dplyr` data management\n- `tidyr` data tidying\n- `readr` data import\n- `purrr` functional programming\n- `tibble` data frames\n- `hms` times\n- `stringr` character strings\n- `lubridate` dates/times\n- `forcats` factors\n- `DBI` databases\n- `haven` SPSS, SAS, and Stata files\n- `readxl` .xls and .xlsx\n- `modelr` simple modeling within a pipeline\n- `broom` turning models into tidy data\n- `tidyverse` loads all of the packages listed up to this point; see Hadley Wickham's \"[tidyverse](https://blog.rstudio.org/2016/09/15/tidyverse-1-0-0/)\"\n\n## Organizing Analyses\n\n------------------------------------------------------------------------\n\nThis section outlines how to organize an analysis to get the most out of R. Newer users may want to skip this section and work through [R for Data Science](http://r4ds.had.co.nz/) until they understand `library(readr)`, `library(dplyr)`, and `library(ggplot2)`.\n\n### Projects\n\nOrganizing scripts, files, and data is one of the most important steps to creating a clear and reproducible analysis.\n\nR Projects, proper noun, are the best way to organize an analysis. They have several advantages:\n\n- They make it possible to concurrently run multiple RStudio sessions.\n- They allow for project-specific RStudio settings.\n- They integrate well with Git version control.\n- They are the \"node\" of relative file paths. (more on this in a second)\n\nBefore setting up an R Project, go to Tools \\> Global Options and uncheck \"Restore most recently opened project at startup\".\n\n![](intro-to-r/images/restore.png){width=\"50%\"}\n\nEvery new analysis in R should start with an R Project. First, create a directory that holds all data, scripts, and files for the analysis. Storing files and data in sub-directories is encouraged. For example, data can be stored in a folder called data/.\n\nNext, click \"New Project...\" in the top right corner.\n\n![](intro-to-r/images/new-project.png){width=\"50%\"}\n\nWhen prompted, turn your recently created \"Existing Directory\" into a project.\n\n![](intro-to-r/images/existing-directory.png){width=\"50%\"}\n\nUpon completion, the name of the R Project should now be displayed in the top right corner of RStudio where it previously displayed \"Project: (None)\". Once opened, .RProj files do not need to be saved. Double-clicking .Rproj files in the directory is now the best way to open RStudio. This will allow for the concurrent use of multiple R sessions and ensure the portability of file paths. 
Once an RStudio project is open, scripts can be opened by double-clicking individual files in the computer directory or clicking files in the \"Files\" tab in the bottom right of RStudio.\n\nR Projects make code highly portable because of the way they handle file paths. Here are a few rules:\n\n#### Filepaths\n\nNever use `\\` in file paths in R. `\\` is an escape character in R and will complicate an analysis. Fortunately, RStudio understands `/` in file paths regardless of operating system.\n\nNever use `setwd()` in R. It is unnecessary, it makes code unreproducible across machines, and it is rude to collaborators. R Projects create a better framework for file paths. Simply treat the directory where the R Project lives as the working directory and directories inside of that directory as sub-directories.\n\nFor example, say there's a `.Rproj` called `starwars-analysis.Rproj` in a directory called `starwars-analysis`. If there is a .csv in that folder called `jedi.csv`, the file can be loaded with `read_csv(\"jedi.csv\")` instead of `read_csv(\"H:/ibp/analyses/starwars-analysis/jedi.csv\")`. If that file is in a sub-directory of `starwars-analysis` called `data`, it can be loaded with `read_csv(\"data/jedi.csv\")`. The same concepts hold for writing data and graphics.\n\nThis simplifies code and makes it portable because all relative filepaths will be identical on all computers. To share an analysis, simply send the entire directory to a collaborator or share it with GitHub.\n\nHere's an example directory:\n\n
![](intro-to-r/images/directory.png){width=\"50%\"}
\n\nIt isn't always possible to avoid absolute file paths because of the many different ways the Urban Institute stores data. Avoid absolute paths when possible and be deliberate about where analyses live in relation to where data live.\n\nFinally, it's good practice to include a README in the same directory as the .Rproj. The README should outline the purpose and the directories and can include information about how to contribute, licenses, dependencies, and acknowledgements. This [GitHub page](https://gist.github.com/PurpleBooth/109311bb0361f32d87a2) is a good README template.\n\nCheck out [R for Data Science](http://r4ds.had.co.nz/workflow-projects.html) by Hadley Wickham and Garrett Grolemund for a more thorough explanation of this workflow. Jenny Bryan also has a good [blogpost](https://www.tidyverse.org/articles/2017/12/workflow-vs-script/) about avoiding `setwd()`.\n\n### Naming Conventions\n\nNaming functions, objects, variables, files, and scripts is one of the toughest and least-taught dimensions of computer programming. Better names can add clarity to code, save time and effort, and minimize errors caused by accidentally overwriting existing functions or other objects.\n\n> There are only two hard things in Computer Science: cache invalidation and naming things. \\~ [Phil Karlton](http://www.meerkat.com/2017/12/naming-things-hard/)\n\n#### Functions and Other Objects\n\nR is case-sensitive.\n\nObjects in R can be named anything - [even unicode characters](https://www.r-bloggers.com/rules-for-naming-objects-in-r/). But just because something *can* be named anything doesn't mean it should.\n\nMost functions and objects in R are lowerCamelCase, period.separated, or underscore_separated. As an individual or team, it's important to pick a style and stick with it, but as [this article](https://journal.r-project.org/archive/2012-2/RJournal_2012-2_Baaaath.pdf) from 2012 shows, there isn't much consistency across the R community. Hadley Wickham's tidyverse uses underscores, so expect to see some consolidation into this style.\n\nIn general, it's good practice to name functions with verbs and other objects with nouns.\n\nVariable and object names that start with numbers, have spaces, or use peculiar syntax require back-ticks.\n\n> select(urban, \\`R Users Group\\`)\n\n> urban\\$\\`R Users Group\\`)\n\nFinally, it's possible to overwrite existing functions and other objects in R with the assignment operator. Don't give vectors or data frames the same names as exisiting functions and don't overwrite existing functions with custom functions.\n\n#### Files\n\nNaming conventions for scripts and files is probably the most overlooked dimension in programming and analysis. The first three bullets from this section come from this [rich slide deck](http://www2.stat.duke.edu/~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf) by Jenny Bryan. This may seem pedantic, but picking a file naming convention now can save a bunch of time and headaches in the future.\n\n**1) Machine readable**\n\nCreate file names that are easily machine readable. Use all lower case letters and skip punctuation other than delimiters. Use underscores as characters for splitting the file name. For example, `stringr::str_split_fixed(\"2018-01-10_r-introduction_machine-readable-example_01.csv\", \"[_\\\\.]\", 5)` splits the file name on underscores and periods and returns date, project, file name, file number, and file type. 
This information can then be stored and sorted in a data frame.\n\n**2) Human readable**\n\nCreate file names that are human readable. The example from above is informative without any machine interpretation.\n\n**3) Plays well with default ordering**\n\nIt is often useful to include date or sequence numbers in script and file names. For example, include 2018-10-01 for data collected on January 10th, 2018 or include 3 for the third script a sequence of five `.R` programs. Starting file names with the date or sequence numbers means files will show up in a logical order by default. Be sure to use ISO 8601 standard for dates (YYYY-MM-DD).\n\n**4) Don't Use File Names for Version Control**\n\nVersion control with file names is unwieldy and usually results in names that are barely human readable and definitely not machine readable.\n\n> \"2018-01-10_r-introduction_machine-readable-example_01_v2_for-aaron_after-review_before-submission.R\"\n\nIterations usually don't iterate sensibly. For example, what was \"v1\", \"v2\" abandoned for \"for-aaron\", \"after-review\", \"before-submission\". Furthermore, version control with file names is poor for concurrent work and merging.\n\nThe next section will outline the optimal tool for version control.\n\n### Version Control\n\nThe workflow outlined above integrates perfectly with version control like Git and distributed version control repository hosting services like GitHub.\n\nVersion control is a system for recording changes to files over time. Version control is built around repositories. In this case, the folder containing the `.Rproj` is the perfect directory to use as a repository. A handful of simple commands are used to track and commit changes to text files (.R, .Rmd, etc.) and data. This record is valuable for testing alternatives, communicating with others and your future self, and documenting progress on projects.\n\nGitHub is a distributed repository system built on top of Git. GitHub has a number of valuable tools for collaboration and project management. In particular, it makes concurrent collaboration on code simpler with branches and has a slick system for issues. Here are the [branches](https://github.com/UrbanInstitute/urban_R_theme/branches) and [issues](https://github.com/UrbanInstitute/urban_R_theme/issues) for the Urban Institute R Graphics Guide. It also has free web hosting for websites like the website you are reading right now. [GitHub has a quick guide that is a good place to start learning Git](https://try.github.io/levels/1/challenges/1).\n\nThe Urban Institute has a number of legacy models and code bases that span years and have been touched by scores of brilliant researchers. The future value of a record of all code changes and development is borderline unthinkable.\n\n### Coding Style\n\n> \"Good coding style is like using correct punctuation. You can manage without it, but it sure makes things easier to read.\" \\~Hadley Wickham (2014)\n\ngood coding style is like using correct punctuation you can manage without it but it sure makes thing easier to read\n\nThe details of a coding style are less important than consistently sticking to that style. 
Be flexible when working with collaborators so the style doesn't change inside an analysis.\n\nHere are three good sources for inspiration:\n\n- [Tidyverse Style Guide](http://style.tidyverse.org/)\n- [Google's R Style Guide](https://google.github.io/styleguide/Rguide.xml)\n- [Hadley Wickham's R Style Guide](http://adv-r.had.co.nz/Style.html)\n\n## Putting it All Together\n\n------------------------------------------------------------------------\n\nR can augment or replace a traditional proprietary statistical packages like SAS or Stata with a few extra bells and whistles, but hopefully this guide and other resources show a fuller vision for developing reproducible, accurate, and collaborative analyses.[^1]\n\n[^1]: The language \"reproducible, accurate, and collaborative analyses\" comes from [Hilary S. Parker's talk](https://www.rstudio.com/resources/videos/opinionated-analysis-development/) at rstudio::conf 2017 about opinionated analysis development.\n\nThis research pipeline, to use the phrase by Roger Peng, Jeff Leek, and Brian Caffo, combines the best of traditional economic and social policy research, computer science/software development, and statistics.[^2] Here are the rules:\n\n[^2]: The basis for [this section](https://www.coursera.org/learn/reproducible-research/lecture/abevs/reproducible-research-concepts-and-ideas-part-2) comes from this Coursera talk by Roger Peng.\n\n#### 1) No steps in an analysis are done by hand and all steps are recorded with executable scripts.\n\nIt is common to use executable scripts to estimate a regression equation or to tabulate weighted summary statistics. But for some reason, other steps like file management, data munging, and visualization are often done \"by hand\". Good science demands that every step of an analysis is recorded - and if possible - with executable scripts.\n\nFortunately, it is possible to script most steps in R from downloading data from the Internet and accessing APIs to visualizations and drafting manuscripts. This may be challenging at first, but it will save time and result in better research in the long run.\n\n#### 2) All code is entirely reproducible and portable.\n\nExecutable scripts are for communicating with other researchers and our future selves. Scripts lose value if they aren't portable and can't be reproduced in the future or by others. Recording every step with execuatble scripts is a start, but scripts aren't valuable if they require expensive proprietary software,or if researchers have to significantly alter scripts to run an analysis.\n\nOpen source software, like R, promotes accessibility, portability, and reproducibility. Also, be sure to avoid `setwd()` and use relative filepaths.\n\n#### 3) Local and collaborative version control is used and all repositories include all code and a README.\n\nUse local version control like Git and a distributed version control repository hosting service like GitHub to track changes and share analyses. The version control should include all scripts and meta information about the analysis in a README.\n\n#### 4) Raw data and tidy analytic data are stored in a collaborative location with a code book.\n\nMany raw data are already stored in collaborative locations like BLS.gov and don't need to be duplicated. Tidy analytic data, like the data used to estimate a regression equation, should be stored in a collaborative location. This is good practice, but is less essential if executable scripts are flawless and reproducible. 
Researcher-entered data and data from less-stable sources should be stored in raw and analytic forms.\n\nSmall data sets can be stored on GitHub without issue. Larger data sets should be stored in collaborative locations accessible by scripting languages. This is only possible for public data and best-practices for private data are less established.\n\nSave codebooks for data sets as text files or PDFs in repositories. Creating codebooks for user-entered data or variables created in executable scripts is often worth the time.\n\n#### 5) Code review and issue tracking are used to improve accuracy and computational efficiency.\n\nGetting stronger programmers and/or methodologists to review code is valuable for limiting programming and analytic mistakes, improving computational efficiency, and learning.\n\n[GitHub issues](https://guides.github.com/features/issues/) is a powerful tool for managing, discussing, and collaborating on code.\n\n#### 6) Projects rely heavily on literate statistical programming and standard means of distribution for execution, validation, and publishing.\n\nLiterate statistical programming is the combination of natural language explanations for humans and executable code in one document. The idea was created by Donald Knuth and is embodied by R Markdown.\n\nR Markdown combines text chunks, code chunks, and output chunks in one script that can be \"knitted\" using `library(knitr)` to created PDFs, books, .htmls, and websites like the website where this guide lives.\n\nThis workflow combines the analytic and narrative process in a tool that is flexible, scalable, reproducible, and less error-prone. R Markdown documents can be used for executing programs, validating models and analyses, and publishing. These documents can be submitted to many academic journals and shared easily with [GitHub pages](https://pages.github.com/).\n\n#### 7) Software versions and dependencies are recorded and all software is cited in publications.\n\n`sessionInfo()` reports the R version, locale, packages used, and other important information about an R session. `citation()` creates a text and BibTex entry of the citation for R. `citation()` creates a text and BibTex entry for R packages. `library(packrat)` (outlined [here](https://rstudio.github.io/packrat/)) is a tool for saving R dependencies.\n\n## Bibliography and References\n\n------------------------------------------------------------------------\n\nHadley Wickham (2017). tidyverse: Easily Install and Load the 'Tidyverse'. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\n\nHadley Wickham and Garrett Grolemund (2017). R For Data Science http://r4ds.had.co.nz/\n\nHadley Wickham (2014). Advanced R http://adv-r.had.co.nz/Style.html\n\nHilary S. Parker (2017. Opinionated Analysis Development https://www.rstudio.com/resources/videos/opinionated-analysis-development/\n\nJenny Bryan (2017).\\\nProject-oriented workflow https://www.tidyverse.org/articles/2017/12/workflow-vs-script/\n\nJenny Bryan (2015). naming things. http://www2.stat.duke.edu/\\~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf\n\nJJ Allaire, Yihui Xie, Jonathan McPherson, Javier Luraschi, Kevin Ushey, Aron Atkins, Hadley Wickham, Joe Cheng and Winston Chang (2017). rmarkdown: Dynamic Documents for R. R package version 1.8. https://CRAN.R-project.org/package=rmarkdown\n\nJustin M. Shea (2017). wooldridge: 105 Data Sets from \"Introductory Econometrics: A Modern Approach\" by Jeffrey M. Wooldridge. R package version 1.2.0. 
\ No newline at end of file
+{"title":"Introduction","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}}},"headingText":"Introduction","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n```{r markdown setup, include=FALSE}\n\nknitr::opts_chunk$set(fig.path = \"intro-to-r/www/images/\")\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\nknitr::opts_chunk$set(warning = FALSE)\n\noptions(scipen = 999)\n```\n\n\nR is one of two premier programming languages for data science and one of the [fastest growing programming languages.](https://stackoverflow.blog/2017/10/10/impressive-growth-r/) Created by researchers for researchers (with some help from software engineers), R offers rich, intuitive tools that make it perfect for visualization, public policy analysis, econometrics, geospatial analysis, and statistics.\n\nR doesn't come in a box. R was never wrapped in cellophane and it definitely isn't purchased at a store. R's pricelessness and open-source development are two of its greatest strengths, but they can often leave new users without the anchor of the box and booklet often provided with proprietary software.\n\nThis guide is meant to be an on-ramp for soon-to-be R Users and a fill-in-the-gaps guide for existing R Users. It starts with the most basic question, \"what is R?\", and progresses to advanced topics like organizing analyses. Along the way it even demonstrates how to read XKCD comics in R.\n\nR boasts a strong community around the world and inside the Urban Institute.
Please don't hesitate to contact Aaron Williams (awilliams\@urban.org) or Amy Rogin (arogin\@urban.org) with thoughts or questions about R.\n\n## What is R?\n\n![](intro-to-r/images/r-logo.png){width=\"15%\"}\n\n[Source](https://www.r-project.org/logo/)\n\nR is free, open-source software for statistical computing. It is known for intuitive, crisp graphics and an extensive, growing library of statistical and analytic methods. Above all, R boasts an enthusiastic community of developers, instructors, and users.\n\nThe copyright and documentation for R are held by a not-for-profit organization called [The R Foundation](https://www.r-project.org/foundation/).\n\n![](intro-to-r/images/r-studio-logo.png){width=\"15%\"}\n\n[Source, Fair use](https://en.wikipedia.org/w/index.php?curid=48590482)\n\nRStudio is a free, open-source integrated development environment (IDE) that runs on top of R. In practice, R users almost exclusively open RStudio and rarely open R directly.\n\nRStudio is developed by a for-profit company called [RStudio](https://www.rstudio.com/). RStudio, the company, employs some of the R community's most prolific open-source developers and creates many open-source tools and trainings.\n\nWhile R code can be written in any text editor, the RStudio IDE is a powerful tool with a console, syntax highlighting, and debugging tools. [This cheatsheet](https://github.com/rstudio/cheatsheets/raw/master/rstudio-ide.pdf) outlines the power of RStudio.\n\n## Installation and Updates\n\n------------------------------------------------------------------------\n\n### When should you update?\n\nAll Urban computers should come pre-installed with R and RStudio. However, your R version may be out of date and require updating. We recommend R version 3.6.0 or higher. You can check which version of R you have installed by opening RStudio and submitting the following line of code to the console: `R.Version()$version.string`.\n\nIf you're working on a personal computer, you may not have R or RStudio installed. Follow this guide to install both on your computer.\n\n### Updating/Installing R\n\n1) Visit https://cran.r-project.org/bin/windows/base/. The latest R version will be the downloadable link at the top. As of 1/1/2020, that R version is 3.6.2. Click on the link at the top and download the `R-x.x.x-win.exe` file.\n\n2) Open the `R-x.x.x-win.exe` file. Click next, accept all the defaults, and install R. After R has been installed, click the Finish button. You should not need admin privileges for this.\n\n3) Check that your version of R has been updated in RStudio. If RStudio is already open, first close it. Then open RStudio and resubmit `R.Version()$version.string`. You should see the updated version number printed to the console.\n\n4) Test that R packages are loading as expected. Packages you already had installed should continue to work with newer versions of R. In some cases, however, you may need to reinstall packages so they work properly with the new version of R.
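\n\nAs a quick check before and after updating, the lines below print the installed R version and refresh out-of-date packages. This is a minimal sketch using base R functions; run it from a fresh RStudio session.\n\n```{r check-version, eval=FALSE}\n# print the installed R version\nR.Version()$version.string\n\n# update any out-of-date packages from CRAN\n# (ask = FALSE skips the per-package prompts)\nupdate.packages(ask = FALSE)\n```\n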
### Updating/Installing RStudio\n\n1) Open RStudio and go to Help \> Check for Updates to see if RStudio is up-to-date.\n\n2) If it is out-of-date, download the [appropriate update](https://rstudio.com/products/rstudio/download/#download).\n\n3) Before you run the installer, contact IT at helpdesk\@urban.org for administrative approval as the program requires admin access.\n\n4) Run the installer and accept all defaults.\n\nMoving forward, RStudio will automatically and regularly update on Windows computers at the Urban Institute.\n\n## Learning R\n\n------------------------------------------------------------------------\n\n### What to Learn\n\nThere is often more than one way to accomplish a goal in R because of the language's flexibility. At first, this flexibility can be overwhelming. That's why it is useful to pick and master one set of tools in R before branching out and exploring everything R has to offer.\n\nFortunately, [Hadley Wickham's tidyverse](https://www.tidyverse.org/) offers a comprehensive set of tools for data analysis that are good for both beginners and experts. The tidyverse is self-described as \"an opinionated collection of R packages designed for data science.\" The tidyverse consists of almost two dozen clear and concise tools for every part of an analysis workflow. At first, focus on the function `read_csv()` for loading data, the package `dplyr` for manipulating data, and the package `ggplot2` for plotting.\n\nHere's a quick example that reads a .csv, filters the data, and creates a publishable column plot in just fifteen lines of code:\n\n```{r quick example}\n# load packages and source the Urban Institute ggplot2 theme\nlibrary(tidyverse) # contains read_csv, library(dplyr), and library(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\n# read bankdata.csv\nbank <- read_csv(\"intro-to-r/data/bankdata.csv\")\n\nbank_subset <- bank %>%\n\t# filter to observations of unmarried mothers less than age 30\n\tfilter(married == \"NO\" & age < 30) %>%\n\t# drop all variables except children and income\n\tselect(children, income)\n\n# plot! (fun replaces the deprecated fun.y argument in ggplot2 >= 3.3.0)\nbank_subset %>%\n\tggplot(mapping = aes(x = children, y = income)) +\n\tgeom_bar(stat = \"summary\", fun = \"mean\") +\n\tscale_y_continuous(expand = c(0, 0), labels = scales::dollar) +\n\tlabs(title = \"Mean income\",\n\t\t\t subtitle = \"Unmarried mothers less than age 30\",\n\t\t\t caption = \"Urban Institute analysis of bank data\",\n\t\t\t x = \"Number of children\",\n\t\t\t y = \"Income\")\n```\n\n### Resources for Learning\n\n*R for Data Science* by Hadley Wickham and Garrett Grolemund is the best print resource for learning R and the tidyverse. The book is available [online](http://r4ds.had.co.nz/index.html) for free and *begins* with visualization, which is motivating and practical. *R for Data Science* contains dozens of worthwhile exercises but no solutions guide. Please check your solutions against the [Urban Institute r4ds solutions guide on GitHub](https://github.com/UI-Research/r4ds-exercises.git) and please contribute if the exercise isn't already in the guide!\n\nRStudio publishes a number of cheat sheets that cover the tidyverse. The main cheat sheets can be accessed in RStudio at Help \> Cheat Sheets.
Additional cheat sheets are accessible on the [RStudio website](https://www.rstudio.com/resources/cheatsheets/).\n\nDavid Robinson, a data scientist from DataCamp, has a [video course](https://www.datacamp.com/instructors/drobinson) about the tidyverse. Few people know as much about R and communicate as effectively as David Robinson.\n\n*Advanced R* by Hadley Wickham is a good resource for new R users who have experience with other programming languages and computer science. It is available [online](http://adv-r.had.co.nz/) for free.\n\n### Library\n\nIt's easy to feel overwhelmed by the frenetic development of the extended R universe. Books are an invaluable resource for slowing down and focusing on fully-formed ideas.\n\nAaron Williams (awilliams\@urban.org) has a number of books that can be checked out:\n\n- [The Art of R Programming](https://www.nostarch.com/artofr.htm)\n- [ggplot2](http://www.springer.com/us/book/9780387981413)\n- [Efficient R Programming](http://shop.oreilly.com/product/0636920047995.do) ([Online!](https://csgillespie.github.io/efficientR/))\n- [Text Mining with R](http://shop.oreilly.com/product/0636920067153.do) ([Online!](https://www.tidytextmining.com/))\n- [Reasoning with Data](https://www.guilford.com/books/Reasoning-with-Data/Jeffrey-Stanton/9781462530267/reviews)\n- [Practical Statistics for Data Scientists](http://shop.oreilly.com/product/0636920048992.do)\n\n### Built-in Data Sets\n\nR has many built-in data sets that are useful for practice, and even more data sets are accessible through R packages.\n\nSubmitting `data()` shows a list of all available data sets. `cars` and `iris` are two classic sets that are used in many examples.\n\n`library(tidyverse)` loads many more \"tidy\" data sets including `diamonds` and `starwars`.\n\n```{r tidyverse}\nlibrary(tidyverse)\nstarwars %>%\n\tcount(species) %>%\n\tarrange(desc(n)) %>%\n\thead()\n```\n\n`library(dslabs)` by [Rafael Irizarry](https://simplystatistics.org/2018/01/22/the-dslabs-package-provides-datasets-for-teaching-data-science/) includes varied, intentionally imperfect data sets that are useful for practice. Students of econometrics will enjoy `library(wooldridge)`. It loads 105 data sets from *Introductory Econometrics: A Modern Approach* by Jeffrey Wooldridge. Now you can practice estimating your hedonic pricing models in R!\n\n```{r hprice1}\nlibrary(wooldridge)\nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nas_tibble(hprice1) %>%\n\tggplot(aes(x = sqrft, y = price)) +\n\tgeom_point() +\n\tscale_y_continuous(expand = c(0, 0), lim = c(0, 800)) +\n\tlabs(title = '\"hprice1\" data from Wooldridge')\n```\n\n### Getting Help\n\nEven the best R programmers spend hours each week searching the Internet for answers. Here are some of the best ways to find answers:\n\nSubmit `?` and any function name without parentheses (ex. `?mean`) to see the function documentation in RStudio.\n\nWhen Googling, set the search range to the last year to avoid out-of-date solutions and to focus on up-to-date practices.\n\n[Stack Overflow](https://stackoverflow.com/) contains numerous solutions. Add `[r]` to any search to limit results to R. If a problem is particularly perplexing, it is simple to submit questions. Exercise caution when submitting questions because the Stack Overflow community has strict norms about questions and loose norms about respecting novices.\n\n[RStudio Community](https://community.rstudio.com/) is a new forum for R Users.
It has a smaller back catalog than Stack Overflow, but its users are friendlier.\n\nFinally, Aaron Williams (awilliams\@urban.org) from IBP and Amy Rogin (arogin\@urban.org) from METRO are available to solve problems, offer guidance, and share R enthusiasm.\n\n### CRAN Task Views\n\nR has sub-communities, frameworks, and tools focused on different subject-matter and methodological areas. [CRAN Task Views](https://cran.r-project.org/web/views/) is invaluable for understanding these communities and finding the best frameworks and tools for different disciplines in R.\n\nCRAN Task Views has 35 pages focused on subcategories of R ranging from [econometrics](https://cran.r-project.org/web/views/Econometrics.html) to natural language processing. Each page is maintained by a subject-matter expert and contains methods, packages, books, and mailing lists that are useful for researchers.\n\nThe econometrics page alone contains detailed information on basic linear regression, microeconometrics, instrumental variables, panel data models, further regression models, time series data and models, data sets, CRAN packages, articles, books, and more.\n\n## R Code\n\n------------------------------------------------------------------------\n\nIt's time to start writing R code. Remember, most R users never open R and exclusively use RStudio. Go ahead and open R once to admire its dated text editor. Then, close R and never directly open it again. Now, open RStudio.\n\n### Submitting Code\n\nRStudio has four main panels: code editor (top left by default), R console (bottom left by default), environment and history (top right by default), and files, plots, packages, help, and viewer pane (bottom right by default).\n\nThere are two main ways to submit code:\n\n1) Type code to the right of ![](intro-to-r/images/code-console.png) in the R console and hit enter. **Note:** R won't create a long-term record of this code.\n2) Click ![](intro-to-r/images/new-script.png) in the top left to create a new R script in the code editor panel. Type code in the script. Highlight the desired code and either click Run in the top right of the code editor panel or press Ctrl/Command + Enter to run the code. Scripts can be saved, so they are the best way to write code that will be used again.\n\nFor practice, submit `state.name` in the R console to create a vector with all fifty state names (sorry statehood advocates, no Washington, D.C.). Next, create a script, paste `state.name`, highlight the text, and click Run at the top right of the code editor. You should get the same output both times.\n\n```{r state names}\nstate.name\n```\n\n### Syntax\n\nThere are five fundamental pieces of syntax in R; a short demonstration follows the list.\n\n- `<-` is the assignment operator. An object created on the right side of an assignment operator is assigned to a name on the left side of an assignment operator. Assignment operators are important for saving the consequences of operations and functions. Operations without assignment operators will typically be printed to the console but not saved.\n- `#` begins a comment. Comments are useful for explaining decisions in scripts. As Hadley Wickham notes in the [tidyverse style guide](http://style.tidyverse.org/), 'In code, use comments to explain the \"why\" not the \"what\" or \"how\".'\n- `c()` combines similar vectors into larger vectors. For example, `c(1, 2, 3)` is a numeric vector of length three made up of three numeric vectors of length one.\n- `?` in front of any function name without parentheses returns function documentation. For example, `?mean`.\n- `%>%` from `library(magrittr)` and `library(tidyverse)` is the \"pipe operator\". It passes the output from one function to another function. This is useful because strings of operations can be \"piped\" together instead of each individual operation needing to be assigned to an object.
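\n\nHere's a minimal sketch that exercises all five pieces of syntax at once (the object name is made up for illustration):\n\n```{r syntax-demo, eval=FALSE}\nlibrary(tidyverse)\n\n# combine three values into a vector and assign it to a name\napple_weights <- c(1.2, 0.9, 1.1)\n\n# pipe the vector into a function instead of nesting the call\napple_weights %>%\n\tmean()\n\n# look up the documentation for mean()\n?mean\n```\n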
### Vectors\n\nVectors are the fundamental piece of data in R. R has six vector types: logical, integer, double, character, complex, and raw. A vector can only contain one type. You can check the type of a vector with `typeof()` and the length with `length()`.\n\n### Data frames\n\nData frames are combinations of equal-length vectors. Data analysis in R is built around data frames. As a guiding principle when working with data frames, you want to have \"tidy data\" whenever possible. A tidy data frame means that:\n\n1. Each variable has its own column.\n\n2. Each observation has its own row.\n\n3. Each value has its own cell.\n\n[![\[Source\](https://r4ds.had.co.nz/tidy-data.html)](intro-to-r/images/tidy-data.png)](https://r4ds.had.co.nz/tidy-data.html)\n\nHaving data in a tidy format allows R's vectorized nature to shine, and many of the `tidyverse` functions are designed for tidy data.\n\n### Missing values\n\nR stores missing values as `NA`. A single `NA` in a calculation can cause the entire result to return as `NA`.\n\n```{r}\nsum(c(2, 2, NA))\n```\n\nThe contagiousness of `NA` is good: it makes users explicitly acknowledge dropping missing values with `na.rm = TRUE`.\n\n```{r}\nsum(c(2, 2, NA), na.rm = TRUE)\n```\n\n`== NA` does not test for missing values. Instead, use `is.na()`. Because `is.na()` returns `TRUE`s and `FALSE`s, and R treats `TRUE` as 1, expressions like `sum(is.na(x))` count missing values and `mean(is.na(x))` gives the share of values that are missing. `complete.cases()` identifies the rows of a data frame that contain no missing values.\n\n### Functions\n\nFunctions in R are collections of code that, when called, perform certain actions. R contains hundreds of functions, and thousands more can be accessed through packages.\n\nMost functions take arguments. For example, the function `mean()` has arguments `x`, `trim`, `na.rm`, and `...`. The first argument in most functions, in this case `x`, is an input object. Arguments can be passed to functions by name or position. `mean(c(1, 2, 3))` is equivalent to `mean(x = c(1, 2, 3))`.\n\nNotice how the other three arguments were skipped. Most arguments in functions have default values. The best way to see default values is to submit the function name with a question mark, like `?mean`. In this case, `trim = 0`, `na.rm = FALSE`, and no further arguments were passed through with `...`.\n\nIn the previous example, the `c()` function was nested inside the `mean()` function. It is also possible to assign a vector of 1, 2, and 3 to a name and pass the name to the mean function.\n\n```{r mean, eval = FALSE}\napples <- c(1, 2, 3)\n\nmean(apples)\n```\n\nR is a [functional programming language](http://adv-r.had.co.nz/Functional-programming.html). In addition to having many pre-made functions like `mean()`, R has powerful tools for creating and manipulating custom functions (a short sketch follows below). This is useful because:\n\n- It avoids tedious and error-prone copying-and-pasting and makes iterating processes simple;\n- It is a powerful way to organize sets of operations;\n- It is a standardized way to save code for later and to share operations with others.\n\nThis last bullet is key to the package system in R.
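\n\nTo make that concrete, here is a minimal sketch of a custom function (the function name and inputs are hypothetical):\n\n```{r custom-function, eval=FALSE}\n# convert a vector of dollar amounts into thousands of dollars\nto_thousands <- function(dollars) {\n\tdollars / 1000\n}\n\nincomes <- c(42000, 55000, 61000)\n\nto_thousands(incomes)\n```\n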
### Packages\n\nOpening RStudio automatically loads \"base R\", a fundamental collection of code and functions that handles simple operations like math and system management. R can be extended with collections of code and functions developed by the R community called packages. This sounds wild, but most packages are created and maintained by some of the best statisticians and developers in the world.\n\nMost packages can be installed with `install.packages(\"dplyr\")`, where the string between the quotation marks is the name of the package. Packages installed with `install.packages()` come from CRAN and must pass certain checks for performance and documentation. Popular packages on CRAN, like `dplyr`, have as much, if not more, support, standardization, and quality as code in proprietary software packages like Stata or SAS.\n\nIt is possible, but less common, to install packages from places like GitHub. This is less secure, and the functionality of the packages is more likely to change over time. `install.packages()` need only be run once per package version per machine and should rarely be included in .R scripts.\n\nPackages are loaded once per R session with the function `library()`. It is a good idea to include `library(package-name)` at the top of scripts for each package used in the script. This way it is obvious at the top of the script which packages are needed and loaded.\n\n**Note:** `install.packages()` uses quoted package names and `library()` uses unquoted package names.\n\nFor practice, submit the following three lines of code to install `RXKCD`, load `library(RXKCD)`, and get a random [XKCD comic](https://www.xkcd.com/).\n\n```{r xkcd, eval=FALSE}\ninstall.packages(\"RXKCD\")\nlibrary(RXKCD)\ngetXKCD(\"random\")\n```\n\n```{r xkcd run, echo=FALSE}\nlibrary(RXKCD)\n# assignment to hide text output\ncomic <- getXKCD(539)\n```\n\nPackages are frequently updated, especially around the time R versions change. The easiest way to update packages is Tools \> Check for Package Updates in RStudio.\n\nOccasionally, two loaded packages will have functions with identical names. Any conflicts will be announced when loading packages. See how `filter()` and `lag()` from `library(tidyverse)` and `library(stats)` conflict:\n\n![](intro-to-r/images/load-tidyverse.png) In this case, the tidyverse functions are usually favored. If there is ever a conflict or any doubt about which function is used, use the package name and `::` to directly call the function. For example, `dplyr::select(apples)`. `::` can also be used to call a function without loading the entire package.
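\n\nA short sketch of resolving a conflict explicitly (the data frame name is hypothetical):\n\n```{r namespace-demo, eval=FALSE}\nlibrary(tidyverse)\n\n# dplyr::filter() subsets rows of a data frame;\n# stats::filter() applies linear filtering to a time series\nunemployment_data %>%\n\tdplyr::filter(year >= 2010)\n\n# call a function without loading the entire package\nlubridate::ymd(\"2018-01-10\")\n```\n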
### CRAN\n\nThe [Comprehensive R Archive Network](https://cran.r-project.org/index.html) (CRAN) contains almost 12,000 packages contributed over the last two decades by a range of developers. New packages are added to CRAN almost every day.\n\nCRAN enables R to have all of the benefits of open-source development and the security and predictability of proprietary statistical packages like SAS and Stata. CRAN weds the benefits of broad-based, real-time package development with certain [standards](https://cran.r-project.org/index.html) for functionality and documentation. Methods and tools often make it to R before SAS or Stata, if they ever make it to SAS or Stata at all, and CRAN's standards generally exceed those of Python and other open-source languages. (See: [Malicious Libraries Found on Python Package Index (PyPI)](https://www.blog.pythonlibrary.org/2017/09/15/malicious-libraries-found-on-python-package-index-pypi/))\n\nBecause of CRAN's long history and R's place in the statistics community, CRAN contains many methods that can't be accessed, much less duplicated, using proprietary software. In addition to being useful now, this ensures that R isn't a temporary fad and will have staying power because of the challenge of replicating or besting CRAN.\n\nR's extensible design is important, but most tasks can be accomplished with a handful of packages:\n\n- `ggplot2` data visualization\n- `dplyr` data management\n- `tidyr` data tidying\n- `readr` data import\n- `purrr` functional programming\n- `tibble` data frames\n- `hms` times\n- `stringr` character strings\n- `lubridate` dates/times\n- `forcats` factors\n- `DBI` databases\n- `haven` SPSS, SAS, and Stata files\n- `readxl` .xls and .xlsx\n- `modelr` simple modeling within a pipeline\n- `broom` turning models into tidy data\n- `tidyverse` loads all of the packages listed up to this point; see Hadley Wickham's \"[tidyverse](https://blog.rstudio.org/2016/09/15/tidyverse-1-0-0/)\"\n\n## Organizing Analyses\n\n------------------------------------------------------------------------\n\nThis section outlines how to organize an analysis to get the most out of R. Newer users may want to skip this section and work through [R for Data Science](http://r4ds.had.co.nz/) until they understand `library(readr)`, `library(dplyr)`, and `library(ggplot2)`.\n\n### Projects\n\nOrganizing scripts, files, and data is one of the most important steps to creating a clear and reproducible analysis.\n\nR Projects, proper noun, are the best way to organize an analysis. They have several advantages:\n\n- They make it possible to concurrently run multiple RStudio sessions.\n- They allow for project-specific RStudio settings.\n- They integrate well with Git version control.\n- They are the \"node\" of relative file paths. (more on this in a second)\n\nBefore setting up an R Project, go to Tools \> Global Options and uncheck \"Restore most recently opened project at startup\".\n\n![](intro-to-r/images/restore.png){width=\"50%\"}\n\nEvery new analysis in R should start with an R Project. First, create a directory that holds all data, scripts, and files for the analysis. Storing files and data in sub-directories is encouraged. For example, data can be stored in a folder called data/.\n\nNext, click \"New Project...\" in the top right corner.\n\n![](intro-to-r/images/new-project.png){width=\"50%\"}\n\nWhen prompted, turn your recently created \"Existing Directory\" into a project.\n\n![](intro-to-r/images/existing-directory.png){width=\"50%\"}\n\nUpon completion, the name of the R Project should now be displayed in the top right corner of RStudio where it previously displayed \"Project: (None)\". Once opened, .Rproj files do not need to be saved. Double-clicking .Rproj files in the directory is now the best way to open RStudio. This will allow for the concurrent use of multiple R sessions and ensure the portability of file paths.
Once an RStudio project is open, scripts can be opened by double-clicking individual files in the computer directory or clicking files in the \"Files\" tab in the top right of RStudio.\n\nR Projects make code highly portable because of the way they handle file paths. Here are a few rules:\n\n#### Filepaths\n\nNever use `\\` in file paths in R. `\\` is an escape character in R and will complicate an analysis. Fortunately, RStudio understands `/` in file paths regardless of operating system.\n\nNever use `setwd()` in R. It is unnecessary, it makes code unreproducible across machines, and it is rude to collaborators. R Projects create a better framework for file paths. Simply treat the directory where the R Project lives as the working directory and directories inside of that directory as sub-directories.\n\nFor example, say there's a `.Rproj` called `starwars-analysis.Rproj` in a directory called `starwars-analysis`. If there is a .csv in that folder called `jedi.csv`, the file can be loaded with `read_csv(\"jedi.csv\")` instead of `read_csv(\"H:/ibp/analyses/starwars-analysis/jedi.csv\")`. If that file is in a sub-directory of `starwars-analysis` called `data`, it can be loaded with `read_csv(\"data/jedi.csv\")`. The same concepts hold for writing data and graphics.\n\nThis simplifies code and makes it portable because all relative filepaths will be identical on all computers. To share an analysis, simply send the entire directory to a collaborator or share it with GitHub.\n\nHere's an example directory:\n\n
![](intro-to-r/images/directory.png){width=\"50%\"}
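\n\nGiven a layout like this, reading and writing stay entirely relative to the project directory. A minimal sketch (the file names are hypothetical):\n\n```{r relative-paths, eval=FALSE}\nlibrary(tidyverse)\n\n# read raw data from the data/ sub-directory of the project\njedi <- read_csv(\"data/jedi.csv\")\n\n# write results to another sub-directory of the project\nwrite_csv(jedi, \"results/jedi-summary.csv\")\n\n# ggsave() saves the most recently printed plot, also with a relative path\nggsave(\"figures/jedi-plot.png\")\n```\n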
\n\nIt isn't always possible to avoid absolute file paths because of the many different ways the Urban Institute stores data. Avoid absolute paths when possible and be deliberate about where analyses live in relation to where data live.\n\nFinally, it's good practice to include a README in the same directory as the .Rproj. The README should outline the purpose of the analysis and the contents of the directories, and it can include information about how to contribute, licenses, dependencies, and acknowledgements. This [GitHub page](https://gist.github.com/PurpleBooth/109311bb0361f32d87a2) is a good README template.\n\nCheck out [R for Data Science](http://r4ds.had.co.nz/workflow-projects.html) by Hadley Wickham and Garrett Grolemund for a more thorough explanation of this workflow. Jenny Bryan also has a good [blogpost](https://www.tidyverse.org/articles/2017/12/workflow-vs-script/) about avoiding `setwd()`.\n\n### Naming Conventions\n\nNaming functions, objects, variables, files, and scripts is one of the toughest and least-taught dimensions of computer programming. Better names can add clarity to code, save time and effort, and minimize errors caused by accidentally overwriting existing functions or other objects.\n\n> There are only two hard things in Computer Science: cache invalidation and naming things. \~ [Phil Karlton](http://www.meerkat.com/2017/12/naming-things-hard/)\n\n#### Functions and Other Objects\n\nR is case-sensitive.\n\nObjects in R can be named anything - [even unicode characters](https://www.r-bloggers.com/rules-for-naming-objects-in-r/). But just because something *can* be named anything doesn't mean it should.\n\nMost functions and objects in R are lowerCamelCase, period.separated, or underscore_separated. As an individual or team, it's important to pick a style and stick with it, but as [this article](https://journal.r-project.org/archive/2012-2/RJournal_2012-2_Baaaath.pdf) from 2012 shows, there isn't much consistency across the R community. Hadley Wickham's tidyverse uses underscores, so expect to see some consolidation into this style.\n\nIn general, it's good practice to name functions with verbs and other objects with nouns.\n\nVariable and object names that start with numbers, have spaces, or use peculiar syntax require back-ticks.\n\n> select(urban, \`R Users Group\`)\n\n> urban\$\`R Users Group\`\n\nFinally, it's possible to overwrite existing functions and other objects in R with the assignment operator. Don't give vectors or data frames the same names as existing functions and don't overwrite existing functions with custom functions.\n\n#### Files\n\nNaming conventions for scripts and files are probably the most overlooked dimension in programming and analysis. The first three bullets in this section come from this [rich slide deck](http://www2.stat.duke.edu/~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf) by Jenny Bryan. This may seem pedantic, but picking a file naming convention now can save a bunch of time and headaches in the future.\n\n**1) Machine readable**\n\nCreate file names that are easily machine readable. Use all lower case letters and skip punctuation other than delimiters. Use underscores as characters for splitting the file name. For example, `stringr::str_split_fixed(\"2018-01-10_r-introduction_machine-readable-example_01.csv\", \"[_\\\\.]\", 5)` splits the file name on underscores and periods and returns date, project, file name, file number, and file type, as the sketch below shows.
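\n\nA quick sketch of that call in action:\n\n```{r file-name-split, eval=FALSE}\nlibrary(stringr)\n\nfile_name <- \"2018-01-10_r-introduction_machine-readable-example_01.csv\"\n\n# split on underscores and the final period into five pieces\nstr_split_fixed(file_name, \"[_\\\\.]\", 5)\n# returns a character matrix: date, project, description, number, extension\n```\n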
This information can then be stored and sorted in a data frame.\n\n**2) Human readable**\n\nCreate file names that are human readable. The example from above is informative without any machine interpretation.\n\n**3) Plays well with default ordering**\n\nIt is often useful to include date or sequence numbers in script and file names. For example, include 2018-01-10 for data collected on January 10th, 2018 or include 3 for the third script in a sequence of five `.R` programs. Starting file names with the date or sequence numbers means files will show up in a logical order by default. Be sure to use the ISO 8601 standard for dates (YYYY-MM-DD).\n\n**4) Don't Use File Names for Version Control**\n\nVersion control with file names is unwieldy and usually results in names that are barely human readable and definitely not machine readable.\n\n> \"2018-01-10_r-introduction_machine-readable-example_01_v2_for-aaron_after-review_before-submission.R\"\n\nIterations usually don't iterate sensibly. For example, why were \"v1\" and \"v2\" abandoned for \"for-aaron\", \"after-review\", and \"before-submission\"? Furthermore, version control with file names is poor for concurrent work and merging.\n\nThe next section will outline the optimal tool for version control.\n\n### Version Control\n\nThe workflow outlined above integrates perfectly with version control systems like Git and distributed version control repository hosting services like GitHub.\n\nVersion control is a system for recording changes to files over time. Version control is built around repositories. In this case, the folder containing the `.Rproj` is the perfect directory to use as a repository. A handful of simple commands are used to track and commit changes to text files (.R, .Rmd, etc.) and data. This record is valuable for testing alternatives, communicating with others and your future self, and documenting progress on projects.\n\nGitHub is a distributed repository system built on top of Git. GitHub has a number of valuable tools for collaboration and project management. In particular, it makes concurrent collaboration on code simpler with branches and has a slick system for issues. Here are the [branches](https://github.com/UrbanInstitute/urban_R_theme/branches) and [issues](https://github.com/UrbanInstitute/urban_R_theme/issues) for the Urban Institute R Graphics Guide. It also has free web hosting for websites like the website you are reading right now. [GitHub has a quick guide that is a good place to start learning Git](https://try.github.io/levels/1/challenges/1).\n\nThe Urban Institute has a number of legacy models and code bases that span years and have been touched by scores of brilliant researchers. The future value of a record of all code changes and development is hard to overstate.\n\n### Coding Style\n\n> \"Good coding style is like using correct punctuation. You can manage without it, but it sure makes things easier to read.\" \~Hadley Wickham (2014)\n\ngood coding style is like using correct punctuation you can manage without it but it sure makes things easier to read\n\nThe details of a coding style are less important than consistently sticking to that style. Be flexible when working with collaborators so the style doesn't change inside an analysis.
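\n\nTo illustrate, a small sketch of the same line written carelessly and then in tidyverse style:\n\n```{r style-demo, eval=FALSE}\n# hard to read: no spaces, cryptic name\nx<-mean(c(1,2,3),na.rm=TRUE)\n\n# easier to read: spaces around operators and a descriptive snake_case name\nmean_apples <- mean(c(1, 2, 3), na.rm = TRUE)\n```\n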
Here are three good sources for inspiration:\n\n- [Tidyverse Style Guide](http://style.tidyverse.org/)\n- [Google's R Style Guide](https://google.github.io/styleguide/Rguide.xml)\n- [Hadley Wickham's R Style Guide](http://adv-r.had.co.nz/Style.html)\n\n## Putting it All Together\n\n------------------------------------------------------------------------\n\nR can augment or replace traditional proprietary statistical packages like SAS or Stata with a few extra bells and whistles, but hopefully this guide and other resources show a fuller vision for developing reproducible, accurate, and collaborative analyses.[^1]\n\n[^1]: The language \"reproducible, accurate, and collaborative analyses\" comes from [Hilary S. Parker's talk](https://www.rstudio.com/resources/videos/opinionated-analysis-development/) at rstudio::conf 2017 about opinionated analysis development.\n\nThis research pipeline, to use the phrase by Roger Peng, Jeff Leek, and Brian Caffo, combines the best of traditional economic and social policy research, computer science/software development, and statistics.[^2] Here are the rules:\n\n[^2]: The basis for [this section](https://www.coursera.org/learn/reproducible-research/lecture/abevs/reproducible-research-concepts-and-ideas-part-2) comes from this Coursera talk by Roger Peng.\n\n#### 1) No steps in an analysis are done by hand and all steps are recorded with executable scripts.\n\nIt is common to use executable scripts to estimate a regression equation or to tabulate weighted summary statistics. But for some reason, other steps like file management, data munging, and visualization are often done \"by hand\". Good science demands that every step of an analysis is recorded - and if possible - with executable scripts.\n\nFortunately, it is possible to script most steps in R, from downloading data from the Internet and accessing APIs to creating visualizations and drafting manuscripts. This may be challenging at first, but it will save time and result in better research in the long run.\n\n#### 2) All code is entirely reproducible and portable.\n\nExecutable scripts are for communicating with other researchers and our future selves. Scripts lose value if they aren't portable and can't be reproduced in the future or by others. Recording every step with executable scripts is a start, but scripts aren't valuable if they require expensive proprietary software, or if researchers have to significantly alter scripts to run an analysis.\n\nOpen-source software, like R, promotes accessibility, portability, and reproducibility. Also, be sure to avoid `setwd()` and use relative filepaths.\n\n#### 3) Local and collaborative version control is used and all repositories include all code and a README.\n\nUse local version control like Git and a distributed version control repository hosting service like GitHub to track changes and share analyses. The repository should include all scripts and meta information about the analysis in a README.\n\n#### 4) Raw data and tidy analytic data are stored in a collaborative location with a code book.\n\nMany raw data are already stored in collaborative locations like BLS.gov and don't need to be duplicated. Tidy analytic data, like the data used to estimate a regression equation, should be stored in a collaborative location. This is good practice, but is less essential if executable scripts are flawless and reproducible.
Researcher-entered data and data from less-stable sources should be stored in raw and analytic forms.\n\nSmall data sets can be stored on GitHub without issue. Larger data sets should be stored in collaborative locations accessible by scripting languages. This is only possible for public data, and best practices for private data are less established.\n\nSave codebooks for data sets as text files or PDFs in repositories. Creating codebooks for user-entered data or variables created in executable scripts is often worth the time.\n\n#### 5) Code review and issue tracking are used to improve accuracy and computational efficiency.\n\nGetting stronger programmers and/or methodologists to review code is valuable for limiting programming and analytic mistakes, improving computational efficiency, and learning.\n\n[GitHub issues](https://guides.github.com/features/issues/) is a powerful tool for managing, discussing, and collaborating on code.\n\n#### 6) Projects rely heavily on literate statistical programming and standard means of distribution for execution, validation, and publishing.\n\nLiterate statistical programming is the combination of natural language explanations for humans and executable code in one document. The idea was created by Donald Knuth and is embodied by R Markdown.\n\nR Markdown combines text chunks, code chunks, and output chunks in one script that can be \"knitted\" using `library(knitr)` to create PDFs, books, .html files, and websites like the website where this guide lives.\n\nThis workflow combines the analytic and narrative process in a tool that is flexible, scalable, reproducible, and less error-prone. R Markdown documents can be used for executing programs, validating models and analyses, and publishing. These documents can be submitted to many academic journals and shared easily with [GitHub pages](https://pages.github.com/).\n\n#### 7) Software versions and dependencies are recorded and all software is cited in publications.\n\n`sessionInfo()` reports the R version, locale, packages used, and other important information about an R session. `citation()` creates a text and BibTeX entry of the citation for R, and `citation(\"package-name\")` creates a text and BibTeX entry for an individual R package. `library(packrat)` (outlined [here](https://rstudio.github.io/packrat/)) is a tool for saving R dependencies.
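\n\nA minimal sketch of recording versions and citations at the end of an analysis:\n\n```{r record-session, eval=FALSE}\n# record the R version, locale, and loaded packages\nsessionInfo()\n\n# generate citations for R and for an individual package\ncitation()\ncitation(\"tidyverse\")\n```\n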
## Bibliography and References\n\n------------------------------------------------------------------------\n\nHadley Wickham (2017). tidyverse: Easily Install and Load the 'Tidyverse'. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\n\nHadley Wickham and Garrett Grolemund (2017). R for Data Science. http://r4ds.had.co.nz/\n\nHadley Wickham (2014). Advanced R. http://adv-r.had.co.nz/Style.html\n\nHilary S. Parker (2017). Opinionated Analysis Development. https://www.rstudio.com/resources/videos/opinionated-analysis-development/\n\nJenny Bryan (2017). Project-oriented workflow. https://www.tidyverse.org/articles/2017/12/workflow-vs-script/\n\nJenny Bryan (2015). naming things. http://www2.stat.duke.edu/\~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf\n\nJJ Allaire, Yihui Xie, Jonathan McPherson, Javier Luraschi, Kevin Ushey, Aron Atkins, Hadley Wickham, Joe Cheng and Winston Chang (2017). rmarkdown: Dynamic Documents for R. R package version 1.8. https://CRAN.R-project.org/package=rmarkdown\n\nJustin M. Shea (2017). wooldridge: 105 Data Sets from \"Introductory Econometrics: A Modern Approach\" by Jeffrey M. Wooldridge. R package version 1.2.0. https://CRAN.R-project.org/package=wooldridge\n\nRoger Peng. Reproducible Research: Concepts and Ideas, Part 2. https://www.coursera.org/learn/reproducible-research/lecture/abevs/reproducible-research-concepts-and-ideas-part-2\n\nYihui Xie (2017). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.18.\n\n```{r session info}\nsessionInfo()\n```\n"}
We recommend R version 3.6.0 or higher. You can check what version of R you have installed by opening RStudio and submitting the following line of code to the console: `R.Version()$version.string`.\n\nIf you're working on a personal computer, you may not have R or RStudio installed. Follow this guide to install both on your computer.\n\n### Updating/Installing R\n\n1) Visit https://cran.r-project.org/bin/windows/base/. The latest R version will be the downloadable link at the top. As of 1/1/2020, that R version is 3.6.2. Click on the link at the top and download the `R-x.x.x-win.exe` file.\n\n2) Open the `R-x.x.x-win.exe` file. Click next, accept all the defaults, and install R. After R has been installed, click the Finish button. You should not need admin privileges for this.\n\n3) Check that your version of R has been updated in RStudio. If RStudio is already open, first close it. Then open RStudio and resubmit `R.Version()$version.string`. You should see an updated version number printed out on the console.\n\n4) Test that R packages are loading as expected. Packages you already had installed should continue to work with newer versions of R. But in some cases, you may need to re-install the packages to work properly with new versions of R.\n\n### Updating/Installing RStudio\n\n1) Open RStudio and go to Help \\> Check for Updates to see if RStudio is up-to-date.\n\n2) If it is out-of-date, download the [appropriate update](https://rstudio.com/products/rstudio/download/#download).\n\n3) Before you run the installer, contact IT at helpdesk\\@urban.org for administrative approval as the program requires admin access.\n\n4) Run the installer and accept all defaults.\n\nMoving forward, RStudio will automatically and regularly update on Windows computers at the Urban Institute.\n\n## Learning R\n\n------------------------------------------------------------------------\n\n### What to Learn\n\nThere is often more than one way to accomplish a goal in R because of the language's flexibility. At first, this flexibility can be overwhelming. That's why it is useful to pick and master one set of tools in R before branching out and trying to learn everything R can do.\n\nFortunately, [Hadley Wickham's tidyverse](https://www.tidyverse.org/) offers a comprehensive set of tools for data analysis that are good for both beginners and experts. The tidyverse is self-described as \"an opinionated collection of R packages designed for data science.\" The tidyverse consists of almost two dozen clear and concise tools for every part of an analysis workflow. 
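\n\nIf the tidyverse isn't installed on your machine yet, a one-time install followed by loading looks like this (a minimal sketch; `install.packages()` only needs to be run once per machine):\n\n```{r install tidyverse, eval = FALSE}\ninstall.packages(\"tidyverse\")\nlibrary(tidyverse)\n```\n\n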
At first, focus on the function `read_csv()` for loading data, the package `dplyr` for manipulating data, and the package `ggplot2` for plotting.\n\nHere's a quick example that reads a .csv, filters the data, and creates a publishable column plot in just fifteen lines of code:\n\n```{r quick example}\n# load packages and source the Urban Institute ggplot2 theme\nlibrary(tidyverse) # contains read_csv, library(dplyr), and library(ggplot2)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\n# read bankdata.csv\nbank <- read_csv(\"intro-to-r/data/bankdata.csv\") \n\nbank_subset <- bank %>%\n\t# filter to observations of unmarried mothers less than age 30\n\tfilter(married == \"NO\" & age < 30) %>%\n\t# drop all variables except children and income\n\tselect(children, income)\n\n# plot!\nbank_subset %>%\n\tggplot(mapping = aes(x = children, y = income)) +\n\tgeom_bar(stat = \"summary\", fun = \"mean\") +\n\tscale_y_continuous(expand = c(0, 0), labels = scales::dollar) +\n\tlabs(title = \"Mean income\",\n\t\t\t subtitle = \"Unmarried mothers less than age 30\",\n\t\t\t caption = \"Urban Institute analysis of bank data\",\n\t\t\t x = \"Number of children\",\n\t\t\t y = \"Income\")\n```\n\n### Resources for Learning\n\n*R for Data Science* by Hadley Wickham and Garrett Grolemund is the best print resource for learning R and the tidyverse. The book is available [online](http://r4ds.had.co.nz/index.html) for free and *begins* with visualization which is motivating and practical. *R for Data Science* contains dozens of worthwhile exercises but no solutions guide. Please check your solutions against the [Urban Institute r4ds solutions guide on GitHub](https://github.com/UI-Research/r4ds-exercises.git) and please contribute if the exercise isn't already in the guide!\n\nRStudio publishes a number of cheat sheets that cover the tidyverse. The main cheat sheets can be accessed in RStudio at Help \\> Cheat Sheets. Additional cheat sheets are accessible here on the [RStudio website](https://www.rstudio.com/resources/cheatsheets/).\n\nDavid Robinson, a data scientist from DataCamp, has a new [video course](https://www.datacamp.com/instructors/drobinson) about the tidyverse. Few people know as much about R and communicate as effectively as David Robinson.\n\n*Advanced R* by Hadley Wickham is a good resource for new R users who have experience with other programming languages and computer science. It is available [online](http://adv-r.had.co.nz/) for free.\n\n### Library\n\nIt's easy to feel overwhelmed by the frenetic development of the extended R universe. 
Books are an invaluable resource for slowing down and focusing on fully-formed ideas.\n\nAaron Williams (awilliams\\@urban.org) has a number of books that can be checked out:\n\n- [The Art of R Programming](https://www.nostarch.com/artofr.htm)\n- [ggplot2](http://www.springer.com/us/book/9780387981413)\n- [Efficient R Programming](http://shop.oreilly.com/product/0636920047995.do) ([Online!](https://csgillespie.github.io/efficientR/))\n- [Text Mining with R](http://shop.oreilly.com/product/0636920067153.do) ([Online!](https://www.tidytextmining.com/))\n- [Reasoning with Data](https://www.guilford.com/books/Reasoning-with-Data/Jeffrey-Stanton/9781462530267/reviews)\n- [Practical Statistics for Data Scientists](http://shop.oreilly.com/product/0636920048992.do)\n\n### Built-in Data Sets\n\nR has many built-in data sets that are useful for practice and even more data sets are accessible through R packages.\n\nSubmitting `data()` shows a list of all available data sets. `cars` and `iris` are two classic sets that are used in many examples.\n\n`library(tidyverse)` loads many more \"tidy\" data sets including `diamonds` and `starwars`.\n\n```{r tidyverse}\nlibrary(tidyverse)\nstarwars %>%\n\tcount(species) %>%\n\tarrange(desc(n)) %>%\n\thead()\n```\n\n`library(dslabs)` by [Rafael Irizarry](https://simplystatistics.org/2018/01/22/the-dslabs-package-provides-datasets-for-teaching-data-science/) includes varied, intentionally imperfect data sets that are useful for practice. Students of econometrics will enjoy `library(wooldridge)`. It loads 105 data sets from *Introductory Econometrics: A Modern Approach* by Jeffrey Wooldridge. Now you can practice estimating your hedonic pricing models in R!\n\n```{r psid}\nlibrary(wooldridge)\nlibrary(tidyverse)\nlibrary(urbnthemes)\n\nset_urbn_defaults(style = \"print\")\n\nas_tibble(hprice1) %>%\n\tggplot(aes(x = sqrft, y = price)) +\n\tgeom_point() +\n\tscale_y_continuous(expand = c(0, 0), lim = c(0, 800)) +\n\tlabs(title = '\"hprice1\" data from Wooldridge') \n```\n\n### Getting Help\n\nEven the best R programmers spend hours each week searching the Internet for answers. Here are some of the best ways to find answers:\n\nSubmit `?` and any function name without parentheses (ex. `?mean`) to see the function documentation in RStudio.\n\nWhen Googling, set the search range to the last year to avoid out-of-date solutions and to focus on up-to-date practices.\n\n[Stack Overflow](https://stackoverflow.com/) contains numerous solutions. Add `[r]` to any search to limit results to R. If a problem is particularly perplexing, it is simple to submit questions. Exercise caution when submitting questions because the Stack Overflow community has strict norms about questions and loose norms about respecting novices.\n\n[RStudio Community](https://community.rstudio.com/) is a new forum for R Users. It has a smaller back catalog than Stack Overflow, but its users are friendlier.\n\nFinally, Aaron Williams (awilliams\\@urban.org) from IBP and Amy Rogin (arogin\\@urban.org) from METRO are available to solve problems, offer guidance, and share R enthusiasm.\n\n### CRAN Task Views\n\nR has sub-communities, frameworks, and tools focused on different subject-matter and methodological areas. 
[CRAN Task Views](https://cran.r-project.org/web/views/) is invaluable for understanding these communities and finding the best frameworks and tools for different disciplines in R.\n\nCRAN Task Views has 35 pages focused on subcategories of R ranging from [econometrics](https://cran.r-project.org/web/views/Econometrics.html) to natural language processing. Each page is maintained by a subject-matter expert and contains methods, packages, books, and mailing lists that are useful for researchers.\n\nThe econometrics page alone contains detailed information on basic linear regression, microeconometrics, instrumental variables, panel data models, further regression models, time series data and models, data sets, CRAN packages, articles, books, and more.\n\n## R Code\n\n------------------------------------------------------------------------\n\nIt's time to start writing R code. Remember, most R users never open R and exclusively use RStudio. Go ahead and open R once to admire its dated text editor. Then, close R and never directly open it again. Now, open RStudio.\n\n### Submitting Code\n\nRStudio has four main panels: code editor (top left by default), R console (bottom left by default), environment and history (top right by default), and files, plots, packages, help, and viewer pane (bottom right by default).\n\nThere are two main ways to submit code:\n\n1) Type code to the right of ![](intro-to-r/images/code-console.png) in the R console and hit enter. **Note:** R won't create a long-term record of this code.\n2) Click ![](intro-to-r/images/new-script.png) in the top left to create a new R script in the code editor panel. Type code in the script. Highlight desired code and either click Run in the top right of the code editor panel or type Ctrl/command-enter to run code. Scripts can be saved, so they are the best way to write code that will be used again.\n\nFor practice, submit `state.name` in the R console to create a vector with all fifty state names (sorry statehood advocates, no Washington, D.C.). Next, create a script, paste `state.name`, highlight the text, and click run at the top right of the code editor. You should get the same output both times.\n\n```{r state names}\nstate.name\n```\n\n### Syntax\n\nThere are five fundamental pieces of syntax in R.\n\n- `<-` is the assignment operator. An object created on the right side of an assignment operator is assigned to a name on the left side of an assignment operator. Assignment operators are important for saving the consequences of operations and functions. Operations without assignment operators will typically be printed to the console but not saved.\n- `#` begins a comment. Comments are useful for explaining decisions in scripts. As Hadley Wickham notes in the [Tidyverse styleguide](http://style.tidyverse.org/), 'In code, use comments to explain the \"why\" not the \"what\" or \"how\".'\n- `c()` combines similar vectors into larger vectors. For example, `c(1, 2, 3)` is a numeric vector of length three made up of three numeric vectors of length one.\n- `?` in front of any function name without parentheses returns function documentation. For example, `?mean`.\n- `%>%` from `library(magrittr)` and `library(tidyverse)` is the \"pipe operator\". It passes the output from one function to another function. This is useful because strings of operations can be \"piped\" together instead of each individual operation needing to be assigned to an object.
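\n\nHere is a minimal sketch that combines several of these pieces of syntax (the object name and values are arbitrary):\n\n```{r syntax example, eval = FALSE}\n# combine three values into a vector and assign it to a name\nincomes <- c(40000, 52000, 61000)\n\n# pipe the vector into a function instead of nesting the call\nincomes %>%\n\tmean()\n```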
\n\n### Vectors\n\nVectors are the fundamental piece of data in R. R has six vector types (you can't mix types within a vector): logical, integer, double, character, complex, and raw. You can check the type of a vector with `typeof()` and the length with `length()`.\n\n### Data frames\n\nData frames are combinations of equal-length vectors. Data analysis in R is built around data frames. As a guiding principle when working with data frames, you want to have \"tidy data\" whenever possible. A tidy data frame means that:\n\n1. Each variable has its own column.\n\n2. Each observation has its own row.\n\n3. Each value has its own cell.\n\n[![\\[Source\\](https://r4ds.had.co.nz/tidy-data.html)](intro-to-r/images/tidy-data.png)](https://r4ds.had.co.nz/tidy-data.html)\n\nHaving data in a tidy format allows R's vectorized nature to shine and many of the `tidyverse` functions are designed for tidy data.\n\n### Missing values\n\nR stores missing values as `NA`. A single `NA` in a calculation can cause the entire result to return as `NA`.\n\n```{r}\nsum(c(2, 2, NA))\n```\n\nThe contagiousness of `NA` is good: it makes users explicitly acknowledge dropping missing values with `na.rm = TRUE`.\n\n```{r}\nsum(c(2, 2, NA), na.rm = TRUE)\n```\n\n`== NA` does not test for missing values. Instead, use `is.na()`, which returns a logical vector that is `TRUE` wherever a value is missing. Because `TRUE` counts as 1 in arithmetic, `sum(is.na(x))` tallies the missing values in a vector. Relatedly, `complete.cases()` returns `TRUE` for the rows of a data frame that contain no missing values.\n\n### Functions\n\nFunctions in R are collections of code that perform actions when called. R contains hundreds of functions and thousands more functions can be accessed through packages.\n\nMost functions take arguments. For example, the function `mean()` has arguments `x`, `trim`, `na.rm`, and `...`. The first argument in most functions, in this case `x`, is an input object. Arguments can be passed to functions by name or position. `mean(c(1, 2, 3))` is equivalent to `mean(x = c(1, 2, 3))`.\n\nNotice how the other three arguments were skipped. Most arguments in functions have default values. The best way to see default values is to submit the function name with a question mark, like `?mean`. In this case, `trim = 0`, `na.rm = FALSE`, and no further arguments were passed through with `...`.\n\nIn the previous example, the `c()` function was nested inside of the `mean()` function. It is also possible to assign a vector of 1, 2, and 3 to a name and pass the name to the mean function.\n\n```{r mean, eval = FALSE}\napples <- c(1, 2, 3)\n\nmean(apples)\n```\n\nR is a [functional programming language](http://adv-r.had.co.nz/Functional-programming.html). In addition to having many pre-made functions like `mean()`, R has powerful tools for creating and manipulating custom functions. This is useful because:\n\n- It avoids tedious and error-prone copying-and-pasting and makes iterating processes simple;\n- It is a powerful way to organize sets of operations;\n- It is a standardized way to save code for later and to share operations with others.\n\nThis last bullet is key to the package system in R.
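\n\nHere is a minimal sketch of a custom function (this is the classic rescaling example from *R for Data Science*, not code from this guide):\n\n```{r custom function, eval = FALSE}\n# rescale a numeric vector so its values run from 0 to 1\nrescale01 <- function(x) {\n\t(x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))\n}\n\nrescale01(c(1, 5, 10))\n```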
\n\n### Packages\n\nOpening RStudio automatically loads \"base R\", a fundamental collection of code and functions that handles simple operations like math and system management. R can be extended with collections of code and functions developed by the R community called packages. This sounds wild, but most packages are created and maintained by some of the best statisticians and developers in the world.\n\nMost packages can be installed with `install.packages(\"dplyr\")`, where the string between the quotation marks is the name of the package. Packages installed with `install.packages()` come from CRAN and must pass certain checks for performance and documentation. Popular packages on CRAN, like dplyr, have support, standards, and quality that match, if not exceed, code in proprietary software packages like Stata or SAS.\n\nIt is possible, but less common, to install packages from places like GitHub. This is less secure and the functionality of the packages is more likely to change over time. `install.packages()` need only be run once per version of a package per machine and should rarely be included in .R scripts.\n\nPackages are loaded once per R session with the function `library()`. It is a good idea to include `library(package-name)` at the top of scripts for each package used in the script. This way it is obvious at the top of the script which packages are installed and loaded.\n\n**Note:** `install.packages()` uses quoted package names and `library()` uses unquoted package names.\n\nFor practice, submit the following three lines of code to install `RXKCD`, load `library(RXKCD)`, and get a random [XKCD comic](https://www.xkcd.com/).\n\n```{r xkcd, eval=FALSE}\ninstall.packages(\"RXKCD\")\nlibrary(RXKCD)\ngetXKCD(\"random\")\n```\n\n```{r xkcd run, echo=FALSE}\nlibrary(RXKCD)\n# assignment to hide text output\ncomic <- getXKCD(539)\n```\n\nPackages are frequently updated, especially around the time R versions change. The easiest way to update packages is Tools \\> Check for Package Updates in RStudio.\n\nOccasionally, two loaded packages will have functions with identical names. Any conflicts will be announced when loading packages. See how `filter()` and `lag()` from `library(tidyverse)` and `library(stats)` conflict:\n\n![](intro-to-r/images/load-tidyverse.png) In this case, the tidyverse functions are usually favored. If there is ever a conflict or any doubt about which function is used, use the package name and `::` to directly call the function. For example, `dplyr::select(apples)`. `::` can also be used to call a function without loading the entire package.\n\n### CRAN\n\nThe [Comprehensive R Archive Network](https://cran.r-project.org/index.html) (CRAN) contains almost 12,000 packages contributed over the last two decades by a range of developers. New packages are added to CRAN almost every day.\n\nCRAN enables R to have all of the benefits of open-source development and the security and predictability of proprietary statistical packages like SAS and Stata. CRAN weds the benefits of broad-based, real-time package development with certain [standards](https://cran.r-project.org/index.html) for functionality and documentation. New methods and tools often make it to R before SAS or Stata, if they ever make it to SAS or Stata, and CRAN's standards generally exceed those of Python and other open-source languages. (See: [Malicious Libraries Found on Python Package Index (PyPI)](https://www.blog.pythonlibrary.org/2017/09/15/malicious-libraries-found-on-python-package-index-pypi/))\n\nBecause of CRAN's long history and R's place in the statistics community, CRAN contains many methods that can't be accessed, much less duplicated, using proprietary software. 
In addition to being useful now, this also ensures that R isn't a temporary fad and will have staying power because of the challenge of replicating or besting CRAN.\n\nR's extensible design is important, but most tasks can be accomplished with a handful of packages:\n\n- `ggplot2` data visualization\n- `dplyr` data management\n- `tidyr` data tidying\n- `readr` data import\n- `purrr` functional programming\n- `tibble` data frames\n- `hms` times\n- `stringr` character strings\n- `lubridate` dates/times\n- `forcats` factors\n- `DBI` databases\n- `haven` SPSS, SAS, and Stata files\n- `readxl` .xls and .xlsx\n- `modelr` simple modeling within a pipeline\n- `broom` turning models into tidy data\n- `tidyverse` loads all of the packages listed up to this point; see Hadley Wickham's \"[tidyverse](https://blog.rstudio.org/2016/09/15/tidyverse-1-0-0/)\"\n\n## Organizing Analyses\n\n------------------------------------------------------------------------\n\nThis section outlines how to organize an analysis to get the most out of R. Newer users may want to skip this section and work through [R for Data Science](http://r4ds.had.co.nz/) until they understand `library(readr)`, `library(dplyr)`, and `library(ggplot2)`.\n\n### Projects\n\nOrganizing scripts, files, and data is one of the most important steps to creating a clear and reproducible analysis.\n\nR Projects, proper noun, are the best way to organize an analysis. They have several advantages:\n\n- They make it possible to concurrently run multiple RStudio sessions.\n- They allow for project-specific RStudio settings.\n- They integrate well with Git version control.\n- They are the \"node\" of relative file paths. (more on this in a second)\n\nBefore setting up an R Project, go to Tools \\> Global Options and uncheck \"Restore most recently opened project at startup\".\n\n![](intro-to-r/images/restore.png){width=\"50%\"}\n\nEvery new analysis in R should start with an R Project. First, create a directory that holds all data, scripts, and files for the analysis. Storing files and data in sub-directories is encouraged. For example, data can be stored in a folder called data/.\n\nNext, click \"New Project...\" in the top right corner.\n\n![](intro-to-r/images/new-project.png){width=\"50%\"}\n\nWhen prompted, turn your recently created \"Existing Directory\" into a project.\n\n![](intro-to-r/images/existing-directory.png){width=\"50%\"}\n\nUpon completion, the name of the R Project should now be displayed in the top right corner of RStudio where it previously displayed \"Project: (None)\". Once opened, .RProj files do not need to be saved. Double-clicking .Rproj files in the directory is now the best way to open RStudio. This will allow for the concurrent use of multiple R sessions and ensure the portability of file paths. Once an RStudio project is open, scripts can be opened by double-clicking individual files in the computer directory or clicking files in the \"Files\" tab in the bottom right of RStudio.\n\nR Projects make code highly portable because of the way they handle file paths. Here are a few rules:\n\n#### Filepaths\n\nNever use `\\` in file paths in R. `\\` is the escape character in R and will complicate an analysis. Fortunately, RStudio understands `/` in file paths regardless of operating system.\n\nNever use `setwd()` in R. It is unnecessary, it makes code unreproducible across machines, and it is rude to collaborators. R Projects create a better framework for file paths. 
Simply treat the directory where the R Project lives as the working directory and directories inside of that directory as sub-directories.\n\nFor example, say there's a `.Rproj` called `starwars-analysis.Rproj` in a directory called `starwars-analysis`. If there is a .csv in that folder called `jedi.csv`, the file can be loaded with `read_csv(\"jedi.csv\")` instead of `read_csv(\"H:/ibp/analyses/starwars-analysis/jedi.csv\")`. If that file is in a sub-directory of `starwars-analysis` called `data`, it can be loaded with `read_csv(\"data/jedi.csv\")`. The same concepts hold for writing data and graphics.\n\nThis simplifies code and makes it portable because all relative filepaths will be identical on all computers. To share an analysis, simply send the entire directory to a collaborator or share it with GitHub.\n\nHere's an example directory:\n\n
![](intro-to-r/images/directory.png){width=\"50%\"}
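\n\nHere is a minimal sketch of reading and writing with relative paths, assuming hypothetical `data/` and `output/` sub-directories inside the project (the file and column names are also hypothetical):\n\n```{r relative paths, eval = FALSE}\nlibrary(tidyverse)\n\n# relative paths start at the directory that contains the .Rproj\njedi <- read_csv(\"data/jedi.csv\")\n\n# writing data and graphics uses the same relative paths\nwrite_csv(jedi, \"data/jedi-cleaned.csv\")\n\njedi_plot <- ggplot(jedi, aes(x = lightsaber_color)) +\n\tgeom_bar()\n\nggsave(\"output/jedi-plot.png\", plot = jedi_plot)\n```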
\n\nIt isn't always possible to avoid absolute file paths because of the many different ways the Urban Institute stores data. Avoid absolute paths when possible and be deliberate about where analyses live in relation to where data live.\n\nFinally, it's good practice to include a README in the same directory as the .Rproj. The README should outline the purpose of the project and the contents of its directories, and can include information about how to contribute, licenses, dependencies, and acknowledgements. This [GitHub page](https://gist.github.com/PurpleBooth/109311bb0361f32d87a2) is a good README template.\n\nCheck out [R for Data Science](http://r4ds.had.co.nz/workflow-projects.html) by Hadley Wickham and Garrett Grolemund for a more thorough explanation of this workflow. Jenny Bryan also has a good [blogpost](https://www.tidyverse.org/articles/2017/12/workflow-vs-script/) about avoiding `setwd()`.\n\n### Naming Conventions\n\nNaming functions, objects, variables, files, and scripts is one of the toughest and least-taught dimensions of computer programming. Better names can add clarity to code, save time and effort, and minimize errors caused by accidentally overwriting existing functions or other objects.\n\n> There are only two hard things in Computer Science: cache invalidation and naming things. \\~ [Phil Karlton](http://www.meerkat.com/2017/12/naming-things-hard/)\n\n#### Functions and Other Objects\n\nR is case-sensitive.\n\nObjects in R can be named anything - [even unicode characters](https://www.r-bloggers.com/rules-for-naming-objects-in-r/). But just because something *can* be named anything doesn't mean it should.\n\nMost functions and objects in R are lowerCamelCase, period.separated, or underscore_separated. As an individual or team, it's important to pick a style and stick with it, but as [this article](https://journal.r-project.org/archive/2012-2/RJournal_2012-2_Baaaath.pdf) from 2012 shows, there isn't much consistency across the R community. Hadley Wickham's tidyverse uses underscores, so expect to see some consolidation into this style.\n\nIn general, it's good practice to name functions with verbs and other objects with nouns.\n\nVariable and object names that start with numbers, have spaces, or use peculiar syntax require back-ticks.\n\n> select(urban, \\`R Users Group\\`)\n\n> urban\\$\\`R Users Group\\`\n\nFinally, it's possible to overwrite existing functions and other objects in R with the assignment operator. Don't give vectors or data frames the same names as existing functions and don't overwrite existing functions with custom functions.\n\n#### Files\n\nNaming scripts and files is probably the most overlooked dimension in programming and analysis. The first three rules in this section come from this [rich slide deck](http://www2.stat.duke.edu/~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf) by Jenny Bryan. This may seem pedantic, but picking a file naming convention now can save a bunch of time and headaches in the future.\n\n**1) Machine readable**\n\nCreate file names that are easily machine readable. Use all lower case letters and skip punctuation other than delimiters. Use underscores as characters for splitting the file name. For example, `stringr::str_split_fixed(\"2018-01-10_r-introduction_machine-readable-example_01.csv\", \"[_\\\\.]\", 5)` splits the file name on underscores and periods and returns date, project, file name, file number, and file type. 
This information can then be stored and sorted in a data frame.\n\n**2) Human readable**\n\nCreate file names that are human readable. The example from above is informative without any machine interpretation.\n\n**3) Plays well with default ordering**\n\nIt is often useful to include date or sequence numbers in script and file names. For example, include 2018-01-10 for data collected on January 10th, 2018 or include 3 for the third script in a sequence of five `.R` programs. Starting file names with the date or sequence numbers means files will show up in a logical order by default. Be sure to use the ISO 8601 standard for dates (YYYY-MM-DD).\n\n**4) Don't Use File Names for Version Control**\n\nVersion control with file names is unwieldy and usually results in names that are barely human readable and definitely not machine readable.\n\n> \"2018-01-10_r-introduction_machine-readable-example_01_v2_for-aaron_after-review_before-submission.R\"\n\nIterations usually don't iterate sensibly. For example, why were \"v1\" and \"v2\" abandoned for \"for-aaron\", \"after-review\", and \"before-submission\"? Furthermore, version control with file names is poor for concurrent work and merging.\n\nThe next section will outline the optimal tool for version control.\n\n### Version Control\n\nThe workflow outlined above integrates perfectly with version control like Git and distributed version control repository hosting services like GitHub.\n\nVersion control is a system for recording changes to files over time. Version control is built around repositories. In this case, the folder containing the `.Rproj` is the perfect directory to use as a repository. A handful of simple commands are used to track and commit changes to text files (.R, .Rmd, etc.) and data. This record is valuable for testing alternatives, communicating with others and your future self, and documenting progress on projects.\n\nGitHub is a distributed repository system built on top of Git. GitHub has a number of valuable tools for collaboration and project management. In particular, it makes concurrent collaboration on code simpler with branches and has a slick system for issues. Here are the [branches](https://github.com/UrbanInstitute/urban_R_theme/branches) and [issues](https://github.com/UrbanInstitute/urban_R_theme/issues) for the Urban Institute R Graphics Guide. It also has free web hosting for websites like the website you are reading right now. [GitHub has a quick guide that is a good place to start learning Git](https://try.github.io/levels/1/challenges/1).\n\nThe Urban Institute has a number of legacy models and code bases that span years and have been touched by scores of brilliant researchers. The future value of a record of all code changes and development is borderline incalculable.\n\n### Coding Style\n\n> \"Good coding style is like using correct punctuation. You can manage without it, but it sure makes things easier to read.\" \\~Hadley Wickham (2014)\n\ngood coding style is like using correct punctuation you can manage without it but it sure makes things easier to read\n\nThe details of a coding style are less important than consistently sticking to that style. 
Be flexible when working with collaborators so the style doesn't change inside an analysis.\n\nHere are three good sources for inspiration:\n\n- [Tidyverse Style Guide](http://style.tidyverse.org/)\n- [Google's R Style Guide](https://google.github.io/styleguide/Rguide.xml)\n- [Hadley Wickham's R Style Guide](http://adv-r.had.co.nz/Style.html)\n\n## Putting it All Together\n\n------------------------------------------------------------------------\n\nR can augment or replace traditional proprietary statistical packages like SAS or Stata with a few extra bells and whistles, but hopefully this guide and other resources show a fuller vision for developing reproducible, accurate, and collaborative analyses.[^1]\n\n[^1]: The language \"reproducible, accurate, and collaborative analyses\" comes from [Hilary S. Parker's talk](https://www.rstudio.com/resources/videos/opinionated-analysis-development/) at rstudio::conf 2017 about opinionated analysis development.\n\nThis research pipeline, to use the phrase by Roger Peng, Jeff Leek, and Brian Caffo, combines the best of traditional economic and social policy research, computer science/software development, and statistics.[^2] Here are the rules:\n\n[^2]: The basis for [this section](https://www.coursera.org/learn/reproducible-research/lecture/abevs/reproducible-research-concepts-and-ideas-part-2) comes from this Coursera talk by Roger Peng.\n\n#### 1) No steps in an analysis are done by hand and all steps are recorded with executable scripts.\n\nIt is common to use executable scripts to estimate a regression equation or to tabulate weighted summary statistics. But for some reason, other steps like file management, data munging, and visualization are often done \"by hand\". Good science demands that every step of an analysis is recorded - and if possible - with executable scripts.\n\nFortunately, it is possible to script most steps in R, from downloading data from the Internet and accessing APIs to visualizations and drafting manuscripts. This may be challenging at first, but it will save time and result in better research in the long run.
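\n\nFor example, even the \"download the data\" step can live in a script instead of a browser (a minimal sketch; the URL and file names are hypothetical):\n\n```{r scripted download, eval = FALSE}\nlibrary(tidyverse)\n\n# download the raw data with code instead of clicking a link\ndownload.file(\n\turl = \"https://example.com/raw-data.csv\", # hypothetical URL\n\tdestfile = \"data/raw-data.csv\"\n)\n\nraw_data <- read_csv(\"data/raw-data.csv\")\n```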
\n\n#### 2) All code is entirely reproducible and portable.\n\nExecutable scripts are for communicating with other researchers and our future selves. Scripts lose value if they aren't portable and can't be reproduced in the future or by others. Recording every step with executable scripts is a start, but scripts aren't valuable if they require expensive proprietary software, or if researchers have to significantly alter scripts to run an analysis.\n\nOpen source software, like R, promotes accessibility, portability, and reproducibility. Also, be sure to avoid `setwd()` and use relative filepaths.\n\n#### 3) Local and collaborative version control is used and all repositories include all code and a README.\n\nUse local version control like Git and a distributed version control repository hosting service like GitHub to track changes and share analyses. The version control should include all scripts and meta information about the analysis in a README.\n\n#### 4) Raw data and tidy analytic data are stored in a collaborative location with a code book.\n\nMany raw data are already stored in collaborative locations like BLS.gov and don't need to be duplicated. Tidy analytic data, like the data used to estimate a regression equation, should be stored in a collaborative location. This is good practice, but is less essential if executable scripts are flawless and reproducible. Researcher-entered data and data from less-stable sources should be stored in raw and analytic forms.\n\nSmall data sets can be stored on GitHub without issue. Larger data sets should be stored in collaborative locations accessible by scripting languages. This is only possible for public data, and best practices for private data are less established.\n\nSave codebooks for data sets as text files or PDFs in repositories. Creating codebooks for user-entered data or variables created in executable scripts is often worth the time.\n\n#### 5) Code review and issue tracking are used to improve accuracy and computational efficiency.\n\nGetting stronger programmers and/or methodologists to review code is valuable for limiting programming and analytic mistakes, improving computational efficiency, and learning.\n\n[GitHub issues](https://guides.github.com/features/issues/) is a powerful tool for managing, discussing, and collaborating on code.\n\n#### 6) Projects rely heavily on literate statistical programming and standard means of distribution for execution, validation, and publishing.\n\nLiterate statistical programming is the combination of natural language explanations for humans and executable code in one document. The idea was created by Donald Knuth and is embodied by R Markdown.\n\nR Markdown combines text chunks, code chunks, and output chunks in one script that can be \"knitted\" using `library(knitr)` to create PDFs, books, .html files, and websites like the website where this guide lives.\n\nThis workflow combines the analytic and narrative process in a tool that is flexible, scalable, reproducible, and less error-prone. R Markdown documents can be used for executing programs, validating models and analyses, and publishing. These documents can be submitted to many academic journals and shared easily with [GitHub pages](https://pages.github.com/).\n\n#### 7) Software versions and dependencies are recorded and all software is cited in publications.\n\n`sessionInfo()` reports the R version, locale, packages used, and other important information about an R session. `citation()` creates a text and BibTeX entry of the citation for R. `citation(\"package-name\")` creates a text and BibTeX entry for an R package. `library(packrat)` (outlined [here](https://rstudio.github.io/packrat/)) is a tool for saving R dependencies.\n\n## Bibliography and References\n\n------------------------------------------------------------------------\n\nHadley Wickham (2017). tidyverse: Easily Install and Load the 'Tidyverse'. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse\n\nHadley Wickham and Garrett Grolemund (2017). R For Data Science http://r4ds.had.co.nz/\n\nHadley Wickham (2014). Advanced R http://adv-r.had.co.nz/Style.html\n\nHilary S. Parker (2017). Opinionated Analysis Development https://www.rstudio.com/resources/videos/opinionated-analysis-development/\n\nJenny Bryan (2017). Project-oriented workflow https://www.tidyverse.org/articles/2017/12/workflow-vs-script/\n\nJenny Bryan (2015). naming things. http://www2.stat.duke.edu/\\~rcs46/lectures_2015/01-markdown-git/slides/naming-slides/naming-slides.pdf\n\nJJ Allaire, Yihui Xie, Jonathan McPherson, Javier Luraschi, Kevin Ushey, Aron Atkins, Hadley Wickham, Joe Cheng and Winston Chang (2017). rmarkdown: Dynamic Documents for R. R package version 1.8. https://CRAN.R-project.org/package=rmarkdown\n\nJustin M. Shea (2017). wooldridge: 105 Data Sets from \"Introductory Econometrics: A Modern Approach\" by Jeffrey M. Wooldridge. R package version 1.2.0. 
https://CRAN.R-project.org/package=wooldridge\n\nRoger Peng Reproducible Research Part 2 https://www.coursera.org/learn/reproducible-research/lecture/abevs/reproducible-research-concepts-and-ideas-part-2\n\nYihui Xie (2017). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.18.\n\n```{r session info}\nsessionInfo()\n```\n"},"formats":{"html":{"identifier":{"display-name":"HTML","target-format":"html","base-format":"html"},"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"inline-includes":false,"preserve-yaml":false,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[],"notebook-links":true,"format-links":true},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"intro-to-r.html"},"language":{"toc-title-document":"Table of contents","toc-title-website":"On this page","related-formats-title":"Other Formats","related-notebooks-title":"Notebooks","source-notebooks-prefix":"Source","section-title-abstract":"Abstract","section-title-appendices":"Appendices","section-title-footnotes":"Footnotes","section-title-references":"References","section-title-reuse":"Reuse","section-title-copyright":"Copyright","section-title-citation":"Citation","appendix-attribution-cite-as":"For attribution, please cite this work as:","appendix-attribution-bibtex":"BibTeX citation:","title-block-author-single":"Author","title-block-author-plural":"Authors","title-block-affiliation-single":"Affiliation","title-block-affiliation-plural":"Affiliations","title-block-published":"Published","title-block-modified":"Modified","callout-tip-title":"Tip","callout-note-title":"Note","callout-warning-title":"Warning","callout-important-title":"Important","callout-caution-title":"Caution","code-summary":"Code","code-tools-menu-caption":"Code","code-tools-show-all-code":"Show All Code","code-tools-hide-all-code":"Hide All Code","code-tools-view-source":"View Source","code-tools-source-code":"Source Code","code-line":"Line","code-lines":"Lines","copy-button-tooltip":"Copy to Clipboard","copy-button-tooltip-success":"Copied!","repo-action-links-edit":"Edit this page","repo-action-links-source":"View source","repo-action-links-issue":"Report an issue","back-to-top":"Back to top","search-no-results-text":"No results","search-matching-documents-text":"matching documents","search-copy-link-title":"Copy link to 
search","search-hide-matches-text":"Hide additional matches","search-more-match-text":"more match in this document","search-more-matches-text":"more matches in this document","search-clear-button-title":"Clear","search-detached-cancel-button-title":"Cancel","search-submit-button-title":"Submit","search-label":"Search","toggle-section":"Toggle section","toggle-sidebar":"Toggle sidebar navigation","toggle-dark-mode":"Toggle dark mode","toggle-reader-mode":"Toggle reader mode","toggle-navigation":"Toggle navigation","crossref-fig-title":"Figure","crossref-tbl-title":"Table","crossref-lst-title":"Listing","crossref-thm-title":"Theorem","crossref-lem-title":"Lemma","crossref-cor-title":"Corollary","crossref-prp-title":"Proposition","crossref-cnj-title":"Conjecture","crossref-def-title":"Definition","crossref-exm-title":"Example","crossref-exr-title":"Exercise","crossref-ch-prefix":"Chapter","crossref-apx-prefix":"Appendix","crossref-sec-prefix":"Section","crossref-eq-prefix":"Equation","crossref-lof-title":"List of Figures","crossref-lot-title":"List of Tables","crossref-lol-title":"List of Listings","environment-proof-title":"Proof","environment-remark-title":"Remark","environment-solution-title":"Solution","listing-page-order-by":"Order By","listing-page-order-by-default":"Default","listing-page-order-by-date-asc":"Oldest","listing-page-order-by-date-desc":"Newest","listing-page-order-by-number-desc":"High to Low","listing-page-order-by-number-asc":"Low to High","listing-page-field-date":"Date","listing-page-field-title":"Title","listing-page-field-description":"Description","listing-page-field-author":"Author","listing-page-field-filename":"File Name","listing-page-field-filemodified":"Modified","listing-page-field-subtitle":"Subtitle","listing-page-field-readingtime":"Reading Time","listing-page-field-categories":"Categories","listing-page-minutes-compact":"{0} min","listing-page-category-all":"All","listing-page-no-matches":"No matching items"},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.3.433"},"extensions":{"book":{"multiFile":true}}}},"projectFormats":["html"]} \ No newline at end of file diff --git a/.quarto/idx/mapping.qmd.json b/.quarto/idx/mapping.qmd.json index d506e2e..9f5ae65 100644 --- a/.quarto/idx/mapping.qmd.json +++ b/.quarto/idx/mapping.qmd.json @@ -1 +1 @@ -{"title":"Introduction","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"editor_options":{"markdown":{"wrap":72}}},"headingText":"Introduction","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n```{r markdown-setup, include=FALSE}\nknitr::opts_chunk$set(fig.path = \"mapping/www/images/\")\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\nknitr::opts_chunk$set(warning = FALSE)\n\noptions(scipen = 999)\n```\n\n```{r setup, include=FALSE}\nlibrary(tidyverse)\nlibrary(knitr)\nlibrary(kableExtra)\nlibrary(here)\nlibrary(sf)\n```\n\n\nThis guide will teach you the concepts and code you will need for\nmapping and geospatial analysis in R. 
**This is a long guide, so if you\nneed something specific, we encourage you to scroll to the appropriate\nsection using the Table of Contents on the left.** If you just want\ncopy-and-pasteable code to create different kinds of maps, head to the\n[`Map Gallery`](#map_gallery).\n\nNow let's start mapping!\n\n![](mapping/www/images/yay_maps.gif)\n\n## Geospatial Workflow\n\nThe picture below outlines what we think are the main steps in a\ngeospatial workflow. This guide will be split into sections describing\neach of the steps.\n\n![](mapping/www/images/geospatial_workflow.png)\n\n## Should this be a map?\n\nThe [Urban Institute Data Visualization Style\nGuide](http://urbaninstitute.github.io/graphics-styleguide/) offers some\nblunt but useful suggestions for maps:\n\n> Just because you've got geographic data, doesn't mean that you have to\n> make a map. Many times, there are more efficient storyforms that will\n> get your point across more clearly. If your data shows a very clear\n> geographic trend or if the absolute location of a place or event\n> matters, maps might be the best approach, but sometimes the reflexive\n> impulse to map the data can make you forget that showing the data in\n> another form might answer other---and sometimes more\n> important---questions.\n\nSo we would encourage you to think critically before making a map.\n\n## Why map with R?\n\nR can have a steeper learning curve than point-and-click tools - like\nQGIS or ArcGIS - for geospatial analysis and mapping. But creating maps\nin R has many advantages including:\n\n1) **Reproducibility**: By creating maps with R code, you can easily\n share the outputs and the code that generated the output with\n collaborators, allowing them to replicate your work and catch errors\n easily.\n\n2) **Iteration**: With point and click software like ArcGIS, making 50\n maps would be 50 times the work/time. But using R, we can easily\n make many iterations of the same map with a few changes to the\n code.\n\n3) **Easy Updates**: Writing code provides a roadmap for others (and\n future you!) to quickly update parts of the map as needed. Say for\n example a collaborator wanted to change the legend colors of 50\n state maps. With R, this is possible in just a few seconds!\n\n4) **An Expansive Ecosystem**: There are several R packages that make\n it very easy to get spatial data, create static and interactive\n maps, and perform spatial analyses. This feature-rich package\n ecosystem, in which the packages all play nicely together, is\n frankly unmatched by other programming languages and even point and\n click tools like QGIS and ArcGIS. Some of these R packages include:\n\n - `sf`: For managing and analyzing spatial dataframes\n - `tigris`: For downloading Census geographies\n - `ggplot2`: For making publication ready static maps\n - `urbnmapr`: For automatically adding Urban styling to static\n maps\n - `mapview`: For making exploratory interactive maps\n\n5) **Cost**: Most point-and-click tools for geospatial analysis are\n proprietary and expensive. R is free open-source software. 
The\n software and most of its packages can be used for free by anyone for\n almost any use case.\n\n## Helpful Learning Resources\n\nIn addition to this guide, you may want to look at these other helpful\nresources:\n\n- The Urban Institute [mapping training\n series](https://ui-research.github.io/urbn101-mapping/) (with video\n lectures and notes)\n- Chapters\n [5](https://walker-data.com/census-r/census-geographic-data-and-applications-in-r.html),\n [6](https://walker-data.com/census-r/mapping-census-data-with-r.html),\n and\n [7](https://walker-data.com/census-r/spatial-analysis-with-us-census-data.html)\n from Kyle Walker's Analyzing US Census Data\n [book](https://walker-data.com/census-r/index.html).\n- Andrew Heiss' fantastic mapping\n [guide](https://datavizm20.classes.andrewheiss.com/example/12-example/)\n- All of the vignettes for the [`sf`\n package](https://cran.r-project.org/web/packages/sf/sf.pdf)\n- [Geocomputation with\n R](https://geocompr.robinlovelace.net/index.html): A book by Robin\n Lovelace and others\n- UChicago's R Spatial Workshops:\n \n\n# Get Spatial Data {#get_spatial_data}\n\n------------------------------------------------------------------------\n\n## library(sf) {.tabset .tabset-pills}\n\n### The short version\n\n`library(sf)` stores geospatial data, which are\n**points** (a single longitude/latitude),\n**lines** (a pair of connected points), or\n**polygons** (a collection of points which\nmake a polygon), in a `geometry` column within R dataframes.\n\n![](mapping/www/images/amtrak_points_lines_polygons.jpg)\n\nThis is what an `sf` dataframe looks like in the console:\n\n```{r print-sf-dataframe}\ndc_parks <- st_read(\"mapping/data/dc_parks.geojson\", \n\t\t\t\t\t\t\t\t\t\tquiet = TRUE)\n\n# Print just the NAME and geometry column\ndc_parks %>%\n select(NAME) %>%\n head(2)\n```\n\n### The long version\n\nThe `sf` library is a key tool for reading in, managing, and working\nwith spatial data in R. `sf` stands for simple features (not San\nFrancisco, you Bay Area folks) and denotes a way to describe the spatial\nattributes of real-life objects. The R object you will be working with\nmost frequently for mapping is an `sf` dataframe. An `sf` dataframe is\nessentially a regular R dataframe, with a couple of extra features for\nuse in mapping. These extra features exclusive to `sf` dataframes\ninclude:\n\n- sticky `geometry` columns\n- attached coordinate reference systems\n- some other spatial metadata\n\nThe most important of the above list is the sticky `geometry` column,\nwhich is a magical column that contains all of the geographic\ninformation for each row of data. Say for example you had an `sf`\ndataframe of all DC census tracts. Then the `geometry` column would\ncontain all of the geographic points used to define DC census tract\npolygons. The stickiness of this column means that no matter what data\nmunging/filtering you do, you will not be able to drop or delete the\n`geometry` column. 
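\n\nHere is a quick sketch of that stickiness using the `dc_parks`\ndataframe from above (`st_drop_geometry()` is the explicit escape hatch\nif you truly want a regular dataframe):\n\n```{r sticky-geometry, eval = FALSE}\n# select() keeps the geometry column even though we only asked for NAME\ndc_parks %>%\n select(NAME)\n\n# to get a regular dataframe, drop the geometry explicitly\ndc_parks %>%\n st_drop_geometry()\n```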
\n\nBelow is a graphic to help you understand this:\n\n![](mapping/www/images/sf_sticky_geometry.png)\n\ncredits: @allisonhorst\n\nThis is what an `sf` dataframe looks like in the console:\n\n```{r print_sf}\n# Read in spatial data about DC parks from DC Open Data Portal\ndc_parks <- st_read(\"https://opendata.arcgis.com/api/v3/datasets/287eaa2ecbff4d699762bbc6795ffdca_9/downloads/data?format=geojson&spatialRefId=4326\",\n\t\t\t\t\t\t\t\t\t\tquiet = TRUE)\n\n# dc_parks <- st_read(\"mapping/data/dc_parks.geojson\")\n\n# Select just a few columns for readability\ndc_parks <- dc_parks %>%\n select(NAME, geometry)\n\n# Print to the console\ndc_parks\n```\n\nNote that there is some spatial metadata such as the `Geometry Type`,\n`Bounding Box`, and `CRS` which shows up as a header before the actual\ncontents of the dataframe.\n\nSince `sf` dataframes operate similarly to regular dataframes, we can\nuse all our familiar `tidyverse` functions for data wrangling, including\n`select`, `filter`, `rename`, `mutate`, `group_by` and `summarize`. The\n`sf` package also has many functions that provide easy ways to replicate\ncommon tasks done in other GIS software like spatial joins, clipping,\nand buffering. Almost all of the mapping and geospatial analysis methods\ndescribed in this guide rely on you having an `sf` dataframe. So let's\ntalk about how to get one!\n\n## Importing spatial data {.tabset .tabset-pills}\n\nGetting an `sf` dataframe is always the first step in the geospatial\nworkflow. Here's how to import spatial data for...\n\n### States and counties\n\nWe highly recommend using the `library(urbnmapr)` package, which was\ncreated by folks here at Urban to easily create state and county level\nmaps. The `get_urbn_map()` function in the package allows you to read in\nspatial data on states and counties, with options to include\nterritories. Importantly, it will also display AL and HI as insets on\nthe map in accordance with the Urban Institute Data Visualization Style\nGuide. For information on how to install `urbnmapr`, see the [GitHub\nrepository](https://github.com/UrbanInstitute/urbnmapr).\n\nBelow is an example of how you would use `urbnmapr` to get an `sf`\ndataframe of all the states or counties in the US.\n\n```{r urbnmapr-1, eval=FALSE}\nlibrary(urbnmapr)\n\n# Get state data\nstates <- get_urbn_map(\"states\", sf = TRUE)\n\n# Can also get county data\ncounties <- get_urbn_map(\"counties\", sf = TRUE)\n```\n\n### Other Census geographies\n\nUse the `library(tigris)` package, which allows you to easily download\nTIGER and other cartographic boundaries from the US Census Bureau. In\norder to automatically load the boundaries as `sf` objects, run\n`options(tigris_class = \"sf\")` once per R session.\n\n`library(tigris)` has all the standard census geographies, including\ncensus tracts, counties, CBSAs, ZCTAs, congressional districts, tribal\nareas, and more. It also includes other elements such as water, roads,\nand military bases.\n\nBy default, `library(tigris)` will download very large and detailed\nTIGER line boundary files. For thematic mapping, the smaller\ncartographic boundary files are a better choice, as they are clipped to\nthe shoreline, generalized, and therefore usually smaller in size\nwithout losing too much accuracy. To load cartographic boundaries, use\nthe `cb = TRUE` argument. 
If you are doing detailed geospatial analysis\nand need the most detailed shapefiles, then you should use the detailed\nTIGER line boundary files and set `cb = FALSE`.\n\nBelow is an example of how you would use `library(tigris)` to get an `sf`\ndataframe of all Census tracts in DC for 2019.\n\n```{r tigris-1, eval=FALSE}\nlibrary(tigris)\n\n# Only need to set once per script\noptions(tigris_class = \"sf\")\n\ndc_tracts <- tracts(\n state = \"DC\",\n cb = TRUE,\n year = 2019\n)\n```\n\nUnlike `library(urbnmapr)`, different functions are used to get\ngeographic data for different geographic levels. For instance, the\n`blocks()` function will load census block data, and the\n`tracts()` function will load tract data. Other functions include\n`block_groups()`, `zctas()`, and `core_based_statistical_areas()`. For\nthe full list of supported geographies and functions, see the [package\nvignette](https://cran.r-project.org/web/packages/tigris/tigris.pdf).\n\nFor folks interested in pulling in Census demographic information along\nwith Census geographies, we recommend checking out the sister package to\n`library(tigris)`: `library(tidycensus)`. That package allows you to\ndownload Census variables and Census geographic data simultaneously.\n\n### Countries\n\nWe recommend using the `library(rnaturalearth)` package, which is\nsimilar to `library(tigris)` but allows you to download and use\nboundaries beyond the US. Instead of setting class to `sf` one time per\nsession as we did with `library(tigris)`, you must set the\n`returnclass = \"sf\"` argument each time you use a function from the\npackage. Below is an example of downloading an `sf` dataframe of all\nthe countries in the world.\n\n```{r natural-earth, eval = FALSE}\n\nlibrary(rnaturalearth)\n\nworld <- ne_countries(returnclass = \"sf\")\n\nggplot() +\n geom_sf(data = world, mapping = aes())\n```\n\n### Your own files\n\n#### Shapefiles/GeoJSONs\n\nShapefiles and GeoJSONs are two common spatial file formats you will\nfind out in the wild. `library(sf)` has a function called `st_read`\nwhich allows you to easily read in these files as `sf` dataframes. The\nonly required argument is `dsn` or data source name. This is the\nfilepath of the `.shp` file or the `.geojson` file on your local\ncomputer. For geojsons, `dsn` can also be a URL.\n\nBelow is an example of reading in a shapefile of fire stations in DC\nwhich is stored in `mapping/data/shapefiles/`. Note that shapefiles are\nactually stored as 6+ different files inside a folder. You need to\nprovide the filepath to the file ending in `.shp`.\n\n```{r list-files}\nlibrary(sf)\n\n# Print out all files in the directory\nlist.files(\"mapping/data/shapefiles\")\n\n# Read in .shp file\ndc_firestations <- st_read(\n dsn = \"mapping/data/shapefiles/Fire_Stations.shp\",\n quiet = TRUE\n)\n```\n\nAnd now `dc_firestations` is an `sf` dataframe you can use for all your\nmapping needs! `st_read` supports reading in a wide variety of other\nspatial file formats, including geodatabases, KML files, and over 200\nothers. For an incomplete list, please see this `sf`\n[vignette](https://r-spatial.github.io/sf/articles/sf2.html).\n\n#### CSVs or dataframes with lat/lons\n\nIf you have a CSV with geographic information stored in columns, you\nwill need to read in the CSV as a regular R dataframe and then convert\nto an `sf` dataframe. `library(sf)` contains the `st_as_sf()` function\nfor converting regular R dataframes into an `sf` dataframe. 
The two\narguments you must specify for this function are:\n\n- `coords`: A length 2 vector with the names of the columns\n corresponding to longitude and latitude (in that order!). For\n example, `c(\"lon\", \"lat\")`.\n- `crs`: The CRS (coordinate reference system) for your\n longitude/latitude coordinates. Remember you need to specify both\n the authority and the SRID code, for example (\"EPSG:4326\"). For more\n information on finding and setting CRS codes, please see the\n [`CRS`](#crs) section.\n\nBelow is an example of reading in data from a CSV and converting it to\nan `sf` dataframe.\n\n```{r make-sf}\nlibrary(sf)\n\n# Read in dataset of state capitals which is stored as a csv\nstate_capitals <- read_csv(\"mapping/data/state-capitals.csv\")\n\nstate_capitals <- state_capitals %>%\n # Specify names of the lon/lat columns in the CSV to use to make geometry col\n st_as_sf(\n coords = c(\"longitude\", \"latitude\"),\n crs = 4326\n )\n```\n\nOne common pitfall: before converting to an `sf` dataframe, you must\ndrop any rows that have `NA` values for latitude or longitude. If\nyour data contains `NA` values, then the `st_as_sf()` function will\nthrow an error.
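\n\nA minimal sketch of that guard, assuming a hypothetical raw dataframe\nwith `longitude` and `latitude` columns:\n\n```{r drop-missing-coords, eval = FALSE}\n# stations_raw is a hypothetical dataframe read in from a CSV\nstations <- stations_raw %>%\n # drop rows with missing coordinates before converting to sf\n filter(!is.na(longitude), !is.na(latitude)) %>%\n st_as_sf(\n coords = c(\"longitude\", \"latitude\"),\n crs = 4326\n )\n```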
Here\nis how we do that with `get_urbn_map()` and `left_join()`.\n\n```{r append-spatial-info, cache = FALSE}\nlibrary(urbnmapr)\n\n# read in state geographic data from urbnmapr\nstates <- get_urbn_map(map = \"states\", sf = TRUE)\n\n# left join state geographies to chip data\nchip_with_geographies <- states %>%\n left_join(\n chip_by_state,\n # Specify the join columns, which are named slightly differently in states\n # and chip respectively\n by = c(\"state_abbv\" = \"state_abbreviation\")\n )\n\nchip_with_geographies %>%\n select(state_fips, state_abbv, chip_enrollment)\n```\n\n```{r append-state-pops, include = FALSE, eval = TRUE, echo = FALSE}\n# TODO: DELETE THIS\n\n# Read in data on state populations from 2010\nstate_pops <-\n read_csv(\"https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv\",\n # Set this to disable printing column info to console\n col_types = cols()\n ) %>%\n filter(ages == \"total\", year == \"2010\") %>%\n select(state_abbv = `state/region`, population)\n\nchip_with_geographies <- chip_with_geographies %>%\n # Specify left_join from tidylog to print summary messages\n tidylog::left_join(state_pops, by = \"state_abbv\") %>%\n # Calculate the chip enrollment percentage and append as a column\n mutate(chip_pct = chip_enrollment / population)\n```\n\n# Project\n\n## Coordinate Reference Systems {#crs .tabset .tabset-pills}\n\n### The short version\n\nJust watch [this video](https://www.youtube.com/watch?v=vVX-PrBRtTY)\nand know the following:\n\n- All spatial data has a CRS, which specifies how to identify a\n location on earth.\n\n- It's important that all spatial datasets you are working with be in\n the same CRS. You can find the CRS with `st_crs()` and change the\n CRS with `st_transform()`.\n\n- The Urban Institute Style Guide requires the use of the Atlas Equal\n Earth Projection (`\"ESRI:102003\"`) for national maps. For state and\n local maps, use [this](https://github.com/veltman/d3-stateplane)\n handy guide to find an appropriate State Plane projection.\n\n### The long version\n\nCoordinate reference systems (CRS) specify the 3d shape of the earth and\noptionally how we project that 3d shape onto a 2d surface. They are an\nimportant part of working with spatial data as you need to ensure that\nall the data you are working with are in the same CRS in order for\nspatial operations and maps to be accurate.\n\nCRS can be specified either by name (ie Maryland State Plane) or\n**S**patial **R**eference System **ID**entifier (SRID). The SRID is a\nnumeric identifier that uniquely identifies a coordinate reference\nsystem. Generally when referring to an SRID, you need to refer to an\nauthority (ie the data source) and a unique ID. An example is\n`EPSG:26985`, which refers to the Maryland State Plane projection from\nthe EPSG, or `ESRI:102003`, which refers to the Atlas Equal Earth\nprojection from ESRI. Most CRS codes will be from the EPSG, and some\nfrom ESRI and others. A good resource for finding/validating CRS codes\nis [epsg.io](https://epsg.io).\n\nSide note: EPSG stands for the now-defunct European Petroleum Survey\nGroup. And while oil companies have generally been terrible for the\nearth, the one nice thing they did for the earth was to set up common\nstandards for coordinate reference systems.\n\nYou might be thinking: isn't the earth just a sphere? Why do we need\nall this complicated stuff? 
And the answer is: the earth is [kind\nof](https://oceanservice.noaa.gov/facts/earth-round.html) a sphere, but\nit's really more of a misshapen ellipsoid that is pudgier at the\nequator than at the poles. To visualize how coordinate reference systems\nwork, imagine that the earth is a (lumpy) orange. Now peel the skin off\nan orange and try to flatten it. There are many ways to do it, but all\nwill create\n[distortions](https://twitter.com/neilrkaye/status/1050740679008296967)\nof some kind. The CRS will give us the formula we've used to specify the\nshape of the orange (usually a sphere or ellipsoid of some kind) and\noptionally, specify how we flattened the orange into 2d.\n\nBroadly, there are two kinds of Coordinate Reference Systems:\n\n1) [**Geographic coordinate\n systems**](https://www.ibm.com/support/knowledgecenter/en/SSGU8G_12.1.0/com.ibm.spatial.doc/ids_spat_407.html)\n\n - (sometimes called unprojected coordinate systems)\n - Specifies a 3d shape for the earth\n - Uses a spheroid/ellipsoid to approximate shape of the earth\n - Usually use decimal degree units (ie latitude/longitude) to\n identify locations on earth\n\n![](mapping/www/images/gcs_image.png)\n\n2) [**Projected coordinate\n systems**](https://mgimond.github.io/Spatial/chp09-0.html#projected-coordinate-systems)\n\n - Specifies a 3d shape for the earth + a 2d mapping\n\n - Is a geographic coordinate system + a *projection*\n\n ![](mapping/www/images/projecting_xkcd.png)\n\n credit: [xkcd](https://imgs.xkcd.com/comics/projecting.png)\n\n - **projection**: mathematical formula used to convert a 3d\n coordinate system to a 2d flat coordinate system\n\n - Many different kinds of projections, including Equal Area,\n Equidistant, Conformal, etc\n\n - All projections distort the true shape of the earth in some\n way, either in terms of shape, area, or angle. Required\n [xkcd comic](https://xkcd.com/977/)\n\n - Usually use linear units (ie feet, meters) and therefore\n useful for distance-based spatial operations (ie creating\n buffers)\n\n## Finding the CRS\n\nIf you are lucky, your data will have embedded CRS data that will be\nautomatically detected when the file is read in. This is usually the\ncase for GeoJSONs (`.geojson`) and shapefiles (`.shp`). When you use\n`st_read()` on these files, you should see the CRS displayed in the\nmetadata:\n\n![](mapping/www/images/sf_crs_pic.png)\n\nYou can also use the `st_crs()` function to find the CRS. The CRS code is\nlocated at the end in `ID[authority, SRID]`.\n\n```{r st_crs}\nst_crs(dc_firestations)\n```\n\nSometimes, the CRS will be blank or `NA` as the dataset did not specify\nthe CRS. In that case you **MUST find and set the CRS for your data\nbefore proceeding** with analysis. Below are some good rules of thumb\nfor finding out what the CRS for your data is:\n\n- For geojsons, the CRS should always be `EPSG:4326` (or WGS 84). The\n official geojson specification states that this is the only valid\n CRS for geojsons, but in the wild, this may not be true 100% of the\n time.\n- For shapefiles, there should be a file that ends in `.prj` in the\n same directory as the `.shp` file. 
This file contains the projection\n information for that file and should be used automatically when\n reading in shapefiles.\n- For CSVs with latitude/longitude columns, the CRS is usually\n `EPSG:4326` (or WGS 84).\n- Look at the metadata and any accompanying documentation to see if\n the coordinate reference system for the data is specified.\n\nIf none of the above rules of thumb apply to you, check out the\n`crsuggest` R [package](https://github.com/walkerke/crsuggest).\n\nOnce you've identified the appropriate CRS, you can set the CRS for your\ndata with `st_crs()`:\n\n```{r set_crs, eval = FALSE}\n\n# If you are certain that your data contains coordinates in the ESRI Atlas Equal Earth projection\nst_crs(some_sf_dataframe) <- st_crs(\"ESRI:102003\")\n```\n\n## Transforming the CRS\n\nOften you will need to change the CRS for your `sf` dataframe so that\nall datasets you are using have the same CRS, or to use a projected CRS\nfor performing more accurate spatial operations. You can do this with\n`st_transform()`:\n\n```{r transform-crs}\n# Transform the CRS from WGS 84 to the Urban-required Atlas Equal Earth projection\nstate_capitals <- state_capitals %>% st_transform(\"ESRI:102003\")\n```\n\n`st_transform()` also allows you to just use the CRS of another `sf`\ndataframe when transforming.\n\n```{r transform-crs-with-another-sf-object}\n# Transform the CRS of chip_with_geographies to match the CRS of state_capitals\nchip_with_geographies <- chip_with_geographies %>%\n st_transform(crs = st_crs(state_capitals))\n```\n\nIf you are working with local data, you should use an appropriate state\nplane projection instead of the Atlas Equal Earth projection, which is\nmeant for national maps. `library(crsuggest)` can simplify the process\nof picking an appropriate state plane CRS.\n\n```{r crsuggest-ex, cache = TRUE}\nlibrary(crsuggest)\n\nsuggest_crs(dc_firestations) %>%\n # Use the value in the \"crs_code\" column to transform CRSs\n head(4)\n```\n\n# Map\n\nIn order to start mapping, you need an `sf` dataframe. If you don't have\none, see the [`Get Spatial Data`](#get_spatial_data) section above.\n\n## The basics\n\n### library(ggplot2)\n\nMost mapping in R fits the same theoretical framework as plotting in R\nusing `library(ggplot2)`. To learn more about ggplot2, visit the Data\nViz\n[page](https://urbaninstitute.github.io/r-at-urban/graphics-guide.html#Grammar_of_Graphics_and_Conventions)\nor read the official ggplot2 [book](https://ggplot2-book.org/).\n\nThe key function for mapping is **the special `geom_sf()` function**\nwhich works with `sf` dataframes. This function magically detects\nwhether you have point or polygon spatial data and displays the results\non a map.\n\n### A simple map\n\nTo make a simple map, add `geom_sf()` to a `ggplot()` and set\n`data = an_sf_dataframe`. Below is code for making a map of all 50\nstates using `library(urbnmapr)`:\n\n```{r first-map, cache = TRUE}\nlibrary(urbnmapr)\n\nstates <- get_urbn_map(\"states\", sf = TRUE)\n\nggplot() +\n geom_sf(\n data = states,\n mapping = aes()\n )\n```\n\n## Styling\n\n### `library(urbnthemes)`\n\n`library(urbnthemes)` automatically styles maps in accordance with the\n[Urban Institute Data Visualization Style\nGuide](http://urbaninstitute.github.io/graphics-styleguide/). 
By using\n`library(urbnthemes)`, you can create publication-ready maps you can\nimmediately drop into Urban research briefs or blog posts.\n\nTo install `urbnthemes`, visit the package's [GitHub\nrepository](https://github.com/UrbanInstitute/urbnthemes) and follow the\ninstructions. There are two ways to use the `urbnthemes` functions:\n\n```{r urbnthemes}\n\nlibrary(urbnthemes)\n\n# You can either run this once per script to automatically style all maps with\n# the Urban theme\nset_urbn_defaults(style = \"map\")\n\n# Or you can add `+ theme_urbn_map()` to the end of every map you make\nggplot() +\n geom_sf(states, mapping = aes()) +\n theme_urbn_map()\n```\n\n### Layering\n\nYou can layer multiple points/lines/polygons on top of each other using\nthe `+` operator from `library(ggplot2)`. The shapes will appear from\nbottom to top (ie the last mapped object will show up on top). It is\nimportant that all layers are in the same CRS (coordinate reference\nsystem).\n\n```{r layers, cache = TRUE}\n\nstate_capitals <- state_capitals %>%\n # This will change CRS to ESRI:102003 and shift the AK and HI state capitals\n # point locations to the appropriate locations on the inset maps.\n tigris::shift_geometry() %>%\n # For now, filter out AK and HI as their state capitals will be slightly off.\n filter(!state %in% c(\"Alaska\", \"Hawaii\"))\n\nggplot() +\n geom_sf(\n data = states,\n mapping = aes()\n ) +\n # Note we change the data argument\n geom_sf(\n data = state_capitals,\n mapping = aes(),\n # urbnthemes library has urbn color palettes built in.\n color = palette_urbn_main[\"yellow\"],\n size = 2.0\n ) +\n theme_urbn_map()\n```\n\n### Fill and Outline Colors\n\nThe same commands used to change colors, opacity, lines, size, etc. in\ncharts can be used for maps too. To change the colors of the map, just\nuse the `fill =` and `color =` parameters in `geom_sf()`. `fill` will\nchange the fill color of polygons; `color` will change the color of\npolygon outlines, lines, and points.\n\nGenerally, maps that show the magnitude of a variable use the blue\nsequential ramp and maps that display positives and negatives use the\ndiverging color ramp. `library(urbnthemes)` contains built-in helper\nvariables (like `palette_urbn_main`) for accessing color palettes from\nthe Urban Data Viz Style Guide. If, for example, you want states to be\nUrban's magenta color:\n\n```{r urbnthemes-pink}\n\nggplot() +\n geom_sf(states,\n mapping = aes(),\n # Adjust polygon fill color\n fill = palette_urbn_main[\"magenta\"],\n # Adjust polygon outline color\n color = \"white\"\n ) +\n theme_urbn_map()\n```\n\n### Adding text\n\nYou can also add text, like state abbreviations, directly to your map\nusing `geom_sf_text()` and the helper function `get_urbn_labels()`.\n\n```{r geom_sf_text}\nlibrary(urbnmapr)\n\nggplot() +\n geom_sf(states,\n mapping = aes(),\n color = \"white\"\n ) +\n theme_urbn_map() +\n # Generates dataframe of state abbv and appropriate location to plot them\n geom_sf_text(\n data = get_urbn_labels(\n map = \"states\",\n sf = TRUE\n ),\n aes(label = state_abbv),\n size = 3\n )\n```\n\nThere's also `geom_sf_label()` if you want labels with a border.\n\n# Map Gallery {#map_gallery}\n\nBelow are copy-and-pasteable examples of maps you can make, after you\nhave an `sf` dataframe.\n\n## Choropleth Maps\n\nChoropleth maps display geographic areas with shades, colors, or\npatterns in proportion to a variable or variables. 
Choropleth maps can\nrepresent massive geographies like the entire world and small\ngeographies like Census Tracts. To make a choropleth map, you need to\nset `geom_sf(aes(fill = some_variable_name))`. Below are some examples.\n\n### Continuous color scale\n\n```{r choropoleth_continious}\n# Map of CHIP enrollment percentage by state\nchip_with_geographies_map <- chip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct variable\n fill = chip_pct\n ))\n\n\n# Below add-ons to the map are optional, but make the map look prettier.\nchip_with_geographies_map +\n # scale_fill_gradientn adds colors with more interpolation and reverses color scale\n scale_fill_gradientn(\n # Convert legend from decimal to percentages\n labels = scales::percent_format(),\n # Make legend title more readable\n name = \"CHIP Enrollment %\",\n # Manually add 0 to lower limit to include it in legend. NA=use maximum value in data\n limits = c(0, NA),\n # Set number of breaks on legend = 3\n n.breaks = 3\n )\n```\n\n### Discrete color scale\n\nThe quick and dirty way is with `scale_fill_steps()`, which creates\ndiscretized bins for continuous variables:\n\n```{r chorpleth_disccrete}\nchip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct variable\n fill = chip_pct\n )) +\n scale_fill_steps(\n # Convert legend from decimal to percentages\n labels = scales::percent_format(),\n # Make legend title more readable\n name = \"CHIP Enrollment %\",\n # Show top and bottom limits on legend\n show.limits = TRUE,\n # Roughly set number of bins. Won't be exact as R uses algorithms under the\n # hood for pretty looking breaks.\n n.breaks = 4\n )\n```\n\nOften you will want to generate the bins manually to give you\nmore fine-grained control over the exact legend text (ie `1% - 1.8%`,\n`1.8% - 2.5%`, etc.). Below is an example of discretizing the continuous\n`chip_pct` variable yourself using `cut_interval()` and a helper\nfunction to get nice looking interval labels:\n\n```{r format_intervals}\n\n# Helper function to clean up R generated intervals into nice looking interval labels\nformat_interval <- function(interval_text) {\n text <- interval_text %>%\n # Remove open and close brackets which is R generated math notation\n str_remove_all(\"\\\\(\") %>%\n str_remove_all(\"\\\\)\") %>%\n str_remove_all(\"\\\\[\") %>%\n str_remove_all(\"\\\\]\") %>%\n str_replace_all(\",\", \" — \")\n\n # Convert decimal ranges to percent ranges\n text <- text %>%\n str_split(\" — \") %>%\n map(~ as.numeric(.x) %>%\n scales::percent() %>%\n paste0(collapse = \" — \")) %>%\n unlist() %>%\n # By default character vectors are plotted in alphabetical order. We want\n # factors in reverse alphabetical order to get correct colors in ggplot\n fct_rev()\n\n return(text)\n}\n\nchip_with_geographies <- chip_with_geographies %>%\n # cut_interval into n groups with equal range. Set boundary so 0 is included in the bins\n mutate(chip_pct_interval = cut_interval(chip_pct, n = 5)) %>%\n # Generate nice looking interval labels\n mutate(chip_pct_interval = format_interval(chip_pct_interval))\n```\n\nAnd now we can map the discretized `chip_pct_interval` variable using\n`geom_sf()`:\n\n```{r make_discrete_map}\nchip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct variable\n fill = chip_pct_interval\n )) +\n # Default is to use main urban palette, which assumes unrelated groups. 
We\n # adjust colors manually to be on Urban cyan palette\n scale_fill_manual(\n values = palette_urbn_cyan[c(8, 7, 5, 3, 1)],\n name = \"CHIP Enrollment %\"\n )\n```\n\nIn addition to `cut_interval()`, there are [similar\nfunctions](https://ggplot2.tidyverse.org/reference/cut_interval.html)\nfor creating intervals/bins with slightly different rules. When creating\nbins, be careful as changing the number of bins can drastically change\nhow the map looks.\n\n## Bubble Maps\n\nThis is just a layered map with one polygon layer and one point layer,\nwhere the points are sized in accordance with a variable in your data.\n\n```{r bubble_maps, cache = TRUE}\nset_urbn_defaults(style = \"map\")\n\n# Get sf dataframe of DC tracts\nlibrary(tigris)\ndc_tracts <- tracts(\n state = \"DC\",\n year = 2019,\n progress_bar = FALSE\n)\n\n# Add bubbles for firestations\nggplot() +\n geom_sf(data = dc_tracts, fill = palette_urbn_main[\"gray\"]) +\n geom_sf(\n data = dc_firestations,\n # Size bubbles by number of trucks at each station\n aes(size = TRUCK),\n color = palette_urbn_main[\"yellow\"],\n # Adjust transparency for readability\n alpha = 0.8\n )\n```\n\n## Dot-density Maps\n\nThese maps scatter dots within a geographic area. Typically each dot\nrepresents a unit (like 100 people or 1,000 houses). To create this kind\nof map, you need to start with an `sf` dataframe that is of `geometry`\ntype `POLYGON` or `MULTIPOLYGON` and then sample points within the\npolygon.\n\nThe below code generates a dot-density map representing people of\ndifferent races within Washington, DC tracts. The code may look a little\ncomplicated, but the key workhorse function is `st_sample()`, which\nsamples points within each polygon to use in the dot-density map:\n\n```{r dot_density_maps, cache = TRUE}\nlibrary(tidycensus)\n\n# Get counts by race of DC tracts\ndc_pop <- get_acs(\n geography = \"tract\",\n state = \"DC\",\n year = 2019,\n variables = c(\n Hispanic = \"DP05_0071\",\n White = \"DP05_0077\",\n Black = \"DP05_0078\",\n Asian = \"DP05_0080\"\n ),\n geometry = TRUE,\n progress_bar = FALSE\n)\n\n# Get unique groups (ie races)\ngroups <- unique(dc_pop$variable)\n\n# For each unique group (ie race), generate sampled points\ndc_race_dots <- map_dfr(groups, ~ {\n dc_pop %>%\n # .x = the group used in the loop\n filter(variable == .x) %>%\n # Use the projected MD state plane for accuracy\n st_transform(crs = \"EPSG:6487\") %>%\n # Have every dot represent 100 people\n mutate(est100 = as.integer(estimate / 100)) %>%\n st_sample(size = .$est100, exact = TRUE) %>%\n st_sf() %>%\n # Add group (ie race) as a column so we can use it when plotting\n mutate(group = .x)\n})\n\n\nggplot() +\n # Plot tracts, then dots on top of tracts\n geom_sf(\n data = dc_pop,\n # Make interior of tracts transparent and boundaries black\n fill = \"transparent\",\n color = \"black\"\n ) +\n geom_sf(\n data = dc_race_dots,\n # Color in dots by racial group\n aes(color = group),\n # Adjust transparency and size to be more readable\n alpha = 0.5,\n size = 1.1,\n stroke = FALSE\n )\n```\n\n## Geofacets\n\nGeofaceting arranges sub-geography-specific plots into a grid that\nresembles a larger geography (usually the US). This can be a useful\nalternative to choropleth maps, which tend to overemphasize\nlow-population-density areas that cover large land areas. To make geofaceted\ncharts, use the `facet_geo()` function from the `geofacet` library,\nwhich can be thought of as equivalent to ggplot2's `facet_wrap()`. 
For\nthis example, we'll use the built-in `state_ranks` data.\n\n```{r geofacet-data}\nlibrary(geofacet)\n\nhead(state_ranks %>% as_tibble())\n```\n\n```{r geofacet-ex, cache = TRUE}\nset_urbn_defaults(style = \"print\")\n\nstate_ranks %>%\n filter(variable %in% c(\"education\", \"employment\")) %>%\n ggplot(aes(x = rank, y = variable)) +\n geom_col() +\n facet_geo(\n facets = \"state\",\n # Use custom urban geofacet grid which is built into urbnthemes\n # For now we need to rename a few columns as urbnthemes has to be\n # updated\n grid = urbnthemes::urbn_geofacet %>%\n rename(\n code = state_code,\n name = state_name\n )\n )\n```\n\nInteractive geofacets of the United States have been used in Urban\nFeatures like [A Matter of\nTime](https://apps.urban.org/features/long-prison-terms/trends.html),\nwhich included geofaceted line charts showing trends in incarceration by\nstate. Static geofacets of the United States were included in [Barriers\nto Accessing Homeownership Down Payment, Credit, and\nAffordability](https://www.urban.org/sites/default/files/publication/94801/barriers-to-homeownership-down-payments-credit-access-and-affordability_3.pdf)\nby the Housing Finance Policy Center.\n\n### Tile grid map\n\nYou can select predefined grids or create your own at\n<https://hafen.github.io/grid-designer/>.\n\n```{r tile-grid-map}\n# create a grid with all of the US states and territories\nmygrid <- data.frame(\n code = c(\"ME\", \"AK\", \"WI\", \"VT\", \"NH\", \"IL\", \"ID\", \"WA\", \"MN\", \"MT\", \"ND\", \"MI\", \"NY\", \"MA\", \"IA\", \"IN\", \"CT\", \"RI\", \"NJ\", \"PA\", \"OH\", \"SD\", \"WY\", \"NV\", \"OR\", \"CA\", \"NE\", \"DE\", \"MD\", \"VA\", \"WV\", \"KY\", \"MO\", \"CO\", \"UT\", \"AZ\", \"KS\", \"AR\", \"DC\", \"SC\", \"NC\", \"TN\", \"NM\", \"LA\", \"AL\", \"GA\", \"MS\", \"OK\", \"HI\", \"FL\", \"TX\"),\n row = c(1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8),\n col = c(12, 2, 7, 11, 12, 7, 3, 2, 6, 4, 5, 8, 10, 11, 6, 7, 11, 12, 10, 9, 8, 5, 4, 3, 2, 2, 5, 11, 10, 9, 8, 7, 6, 4, 3, 3, 5, 6, 10, 9, 8, 7, 4, 6, 8, 9, 7, 5, 2, 10, 5),\n stringsAsFactors = FALSE\n)\n\n# Combine the grid and CHIP data for tiling. Drop the geometry column first,\n# since we only need the attribute columns for a tile grid\ngeo_grid_data <- mygrid %>%\n left_join(st_drop_geometry(chip_with_geographies), by = c(\"code\" = \"state_abbv\"))\n\n# plot tile grid\ngeo_grid_data %>%\n ggplot(aes(x = col, y = row, fill = chip_pct_interval)) +\n scale_fill_manual(\n values = palette_urbn_cyan[c(8, 7, 5, 3, 1)],\n name = \"CHIP Enrollment %\"\n ) +\n geom_tile(color = \"white\", linewidth = 1) +\n geom_text(aes(label = code), color = \"white\", size = 4) +\n scale_y_reverse() +\n coord_equal() +\n labs(fill = NULL)\n```\n\n\n## Cartograms\n\nCartograms are a modified form of a choropleth map with intentionally\ndistorted sizes that map to a variable in your data. Below we create a\ncartogram with `library(cartogram)` where the state sizes are\nproportional to the population.\n\n```{r cartogram-example, cache = TRUE}\nlibrary(cartogram)\n\nset_urbn_defaults(style = \"map\")\n\nchip_with_geographies_weighted <- chip_with_geographies %>%\n # Note column name needs to be in quotes for this package\n cartogram_cont(weight = \"population\")\n\nggplot() +\n geom_sf(\n data = chip_with_geographies_weighted,\n # Color in states by chip percentages\n aes(fill = chip_pct)\n )\n```\n\n## Interactive Maps\n\nInteractive maps can be a great tool for exploring and\nunderstanding your data. 
Luckily, there are a lot of new R packages that\nmake it really easy to create them. Interactive maps are powerful but\n**we do not recommend them for official use in Urban publications** as\ngetting them in Urban styles and appropriate basemaps can be tricky\n(reach out to\n[anarayanan\\@urban.org](mailto:anarayanan@urban.org){.email} if you\nreally want to include them).\n\n### `library(mapview)`\n\n`library(mapview)` is probably the most user-friendly of the interactive\nmapping R libraries. All you have to do to create an interactive map is:\n\n```{r show-mapview}\nlibrary(mapview)\n\n\nchip_with_geographies_for_interactive_mapping <- chip_with_geographies %>%\n # Filter out AK and HI bc they would appear in Mexico. If you want AK and HI\n # in the correct place in interactive maps, make sure to use tigris::states()\n filter(!state_abbv %in% c(\"AK\", \"HI\"))\n\nmapview(chip_with_geographies_for_interactive_mapping)\n```\n\nWhen you click on an object, you get a popup table of all its\nattributes. And when you hover over an object, you get a popup with an\nobject id.\n\nEach of the above behaviors can be changed if desired. As you'll see in\nthe below section, the syntax for `library(mapview)` is significantly\ndifferent from `library(ggplot2)`, so be careful!\n\n#### Coloring in points/polygons\n\nIn order to create a choropleth map where we color in the\npoints/polygons by a variable, we need to feed in a column name *in\nquotes* to the `zcol` argument inside the `mapview()` function:\n\n```{r mapview_zcol}\n# Create interactive state map colored in by chip enrollment\nmapview(chip_with_geographies_for_interactive_mapping, zcol = \"chip_enrollment\")\n```\n\nIf you want more granular control over the color palette for the legend,\nyou can also feed in a vector of color hex codes to `col.regions` along with\na column name to `zcol`. This will create a continuous color range along\nthe provided colors. Be careful though, as the color interpolation is not\nperfect.\n\n```{r mapview-colors-granular}\n# library(RColorBrewer)\nmapview(chip_with_geographies_for_interactive_mapping,\n col.regions = c(\n palette_urbn_green[6],\n \"white\",\n palette_urbn_cyan[6]\n ),\n zcol = \"chip_enrollment\"\n)\n```\n\nIf you want to color in all points/polygons as the same color, just feed\nin a single color hex code to the `col.regions` argument:\n\n```{r mapview-colors}\nmapview(chip_with_geographies_for_interactive_mapping,\n col.regions = palette_urbn_green[5]\n)\n```\n\n#### Adding layers\n\nYou can add multiple `sf` objects on the same map by using the `+`\noperator. This is very useful when comparing two or more spatial datasets.\n\n```{r mapview-layers}\nmapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) +\n mapview(state_capitals, col.regions = palette_urbn_cyan[5])\n```\n\nYou can even create slider maps by using the `|` operator!\n\n```{r mapview-sliders}\nmapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) |\n mapview(state_capitals, col.regions = palette_urbn_cyan[5])\n```\n\n### More details\n\nTo learn more about more advanced options with `mapview` maps, check out\nthe\n[documentation](https://r-spatial.github.io/mapview/articles/articles/mapview_02-advanced.html)\npage and the [reference\nmanual](https://cran.r-project.org/web/packages/mapview/mapview.pdf).\n\nThere are also other interactive map making packages in R like `leaflet`\n(which `mapview` is a more user-friendly wrapper around), `tmap`, and\n`mapdeck`. 
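\n\nFor instance, below is a minimal `leaflet` sketch (this assumes only that\nthe `leaflet` package is installed; the marker location is just an example):\n\n```{r leaflet-sketch, eval = FALSE}\nlibrary(leaflet)\n\nleaflet() %>%\n # Add the default OpenStreetMap basemap tiles\n addTiles() %>%\n # Drop an example marker on downtown Washington, DC\n addMarkers(lng = -77.0369, lat = 38.9072, popup = \"Washington, DC\")\n```\n\n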
To learn about these other packages, [this book\nchapter](https://geocompr.robinlovelace.net/adv-map.html#interactive-maps)\nis a good starting point.\n\n# Spatial Operations\n\n## Cropping\n\nCropping (or clipping) is geographically filtering an `sf` dataframe to\njust the area we are interested in. Say we wanted to look at the roads\naround Fire Station 24 in DC.\n\n```{r roads_cropping_before, cache = TRUE}\nlibrary(tigris)\nlibrary(units)\n\ndc_firestations <- dc_firestations %>%\n st_transform(\"EPSG:6487\")\n\n\n# Draw a 500 meter circle around one fire station\nfire_station_24_buffered <- dc_firestations %>%\n filter(NAME == \"Engine 24 Station\") %>%\n st_buffer(set_units(500, \"meter\"))\n\n# Get listing of all roads in DC\ndc_roads <- roads(\n state = \"DC\",\n county = \"District of Columbia\",\n class = \"sf\",\n progress_bar = FALSE\n) %>%\n st_transform(\"EPSG:6487\")\n\n# View roads on top of the buffered fire station\nggplot() +\n # Order matters! We plot the buffered fire station first, and then roads on\n # top, so the roads overlapping the buffer stay visible\n geom_sf(\n data = fire_station_24_buffered,\n fill = palette_urbn_cyan[1],\n color = palette_urbn_cyan[7]\n ) +\n geom_sf(\n data = dc_roads,\n color = palette_urbn_gray[7]\n ) +\n theme_urbn_map()\n```\n\nWe can clip the larger roads dataframe to just roads that overlap with\nthe circle around the fire station with `st_intersection()`.\n\n```{r roads_cropping_after}\n\n# Use st_intersection() to crop the roads data to just roads within the\n# fire_station radius\ndc_roads_around_fire_station_24_buffered <- fire_station_24_buffered %>%\n st_intersection(dc_roads)\n\nggplot() +\n geom_sf(\n data = fire_station_24_buffered,\n fill = palette_urbn_cyan[1],\n color = palette_urbn_cyan[7]\n ) +\n geom_sf(\n data = dc_roads_around_fire_station_24_buffered,\n color = palette_urbn_gray[7]\n ) +\n theme_urbn_map()\n```\n\n**More Coming Soon!**\n\n## Calculating Distance\n\n## Spatial Joins\n\n### Point to Polygon\n\n### Polygon to Polygon\n\n## Aggregating\n\n## Drive/Transit times\n\n## Geocoding\n\nGeocoding is the process of turning text (usually addresses) into\ngeographic coordinates (usually latitudes/longitudes) for use in\nmapping. For Urban researchers, we highly recommend using the [Urban\ngeocoder](https://tech-tools.urban.org/geocoding/) as it is fast,\naccurate, designed to work with sensitive/confidential data, and, most\nimportantly, free to use for Urban researchers! To learn how we set\nup and chose the geocoder for the Urban Institute, you can read our\n[Data\\@Urban\nblog](https://medium.com/@urban_institute/choosing-a-geocoder-for-the-urban-institute-86192f656c5f).\n\n### Cleaning Addresses\n\nThe single most important factor in getting accurate geocoded data is\nhaving cleaned, well-structured address data. This can prove difficult\nas address data out in the wild is often messy and unstandardized. 
While\nthe rules for cleaning addresses are very data specific, below are some\nexamples of clean addresses you should aim for in your data cleaning\nprocess:\n\n```{r cleaned-addr, cache=TRUE,eval=TRUE,results=TRUE, echo=FALSE}\nlibrary(gt)\ncleaned_address_table <- tribble(\n ~\"f_address\", ~\"Type of address\",\n \"123 Troy Drive, Pillowtown, CO, 92432\", \"residential address\",\n \"789 Abed Avenue, Apt 666, Blankesburg, CO, 92489\", \"residential apartment address\",\n \"Shirley Boulevard and Britta Drive, Blanketsburg, CO, 92489\", \"street intersection\",\n \"Pillowtown, CO\", \"city\",\n \"92489, CO\", \"Zip Code\"\n)\n\ngt(cleaned_address_table) %>%\n # tab_header(title = md(\"Clean Address Examples\")) %>%\n opt_row_striping(row_striping = TRUE) %>%\n tab_style(\n style = list(\n cell_text(weight = \"bold\")\n ),\n locations = cells_column_labels(\n columns = vars(f_address, `Type of address`)\n )\n ) %>%\n opt_align_table_header(align = c(\"left\")) %>%\n tab_options(\n container.width = \"100%\",\n container.height = \"400px\",\n # column_labels.background.color = palette_urbn_cyan[1],\n table.border.top.width = 0,\n table.border.bottom.width = 0,\n column_labels.border.bottom.width = 0\n )\n```\n\nAll that being said, our geocoder is pretty tolerant of different\naddress formats, typos/spelling errors, and missing states, zip codes,\netc. So don't spend too much time cleaning every address in the data.\nAlso note that while our geocoder is able to geocode cities and zip\ncodes, it will return the lat/lon of the center of the city/zip code,\nwhich may not be what you want.\n\n### Instructions\n\nTo use the [Urban geocoder](https://tech-tools.urban.org/geocoding/),\nyou will need to:\n\n1) Generate a CSV with a column named `f_address` which contains the\n addresses in single line format (ie\n `123 Abed Avenue, Blanketsburg, CO, 94328`). This means that if you\n have the addresses split across multiple columns (ie `Address`,\n `City`, `State`, `Zip` columns), you will need to concatenate them\n into one column (see the sketch after these steps). Also see the\n Cleaning Addresses section above.\n\n2) Go to the Urban geocoder and answer the initial questions. This will\n tell you whether your data is non-confidential or confidential, and\n allow you to upload your CSV for geocoding.\n\n3) Wait for an email telling you your results are ready. If your data\n is non-confidential, this email will contain a link to your geocoded\n results. This link expires in 24 hours, so make sure to download\n your data before then. If your data is confidential, the email will\n contain a link to the location on the Y Drive where your\n confidential geocoded data is stored. You can specify this output\n folder when submitting the CSV in step 1.\n\n
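Here is a minimal sketch of step 1. The input column names (`address`,\n`city`, `state`, `zip`) and the file names are hypothetical - adjust them to\nmatch your data:\n\n```{r geocoder-f-address, eval = FALSE}\nlibrary(tidyverse)\n\naddresses <- read_csv(\"my-addresses.csv\") %>%\n # Concatenate the separate address columns into the single-line f_address\n # column that the Urban geocoder expects\n unite(col = \"f_address\", address, city, state, zip, sep = \", \", remove = FALSE)\n\n# Write out the CSV to upload to the Urban geocoder\nwrite_csv(addresses, \"addresses-to-geocode.csv\")\n```\n\n### Geocoder outputs\n\n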

The geocoded file will be your original data, plus a few more columns\n(including latitude and longitude). The table below describes each of the new\ncolumns that have been appended to your original data. [It's very important\nthat you take a look at the Addr_type\ncolumn]{style=\"background-color: #FFFF00; font-weight: bold\"} in the\nCSV before doing further analysis to check the accuracy of the geocoding\nprocess.
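\n\nFor example, once you read your geocoded results back into R, a quick\ntabulation of `Addr_type` shows how precise the matches are (a sketch; the\nfile name is hypothetical):\n\n```{r check-addr-type, eval = FALSE}\nlibrary(tidyverse)\n\n# Read in the geocoded results returned by the Urban geocoder\ngeocoded <- read_csv(\"geocoded-results.csv\")\n\n# Count rows by match precision. Many Postal or Locality matches means you\n# should manually inspect the results before further analysis\ngeocoded %>%\n count(Addr_type, sort = TRUE)\n```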

\n\n+---------------+---------------------------------------------------+\n| Column | Description |\n+:==============+:==================================================+\n| Match_addr | The actual address that the inputted address was |\n| | matched to. This is the address that the geocoder |\n| | used to get Latitudes / Longitudes. If there are |\n| | potentially many typos or non standard address |\n| | formats in your data file, you will want to take |\n| | a close look at this column to confirm that the |\n| | matched address correctly handled typos and badly |\n| | formatted addresses. |\n+---------------+---------------------------------------------------+\n| Longitude | The WGS 84 datum Longitude (EPSG code 4326) |\n+---------------+---------------------------------------------------+\n| Latitude | The WGS 84 datum Latitude (EPSG code 4326) |\n+---------------+---------------------------------------------------+\n| Addr_type | The match level for a geocode request. This |\n| | should be used as an indicator of the precision |\n| | of geocode results. Generally, Subaddress, |\n| | PointAddress, StreetAddress, and StreetInt |\n| | represent accurate matches. The list below |\n| | contains all possible values for this field. |\n| | **Green values represent High accuracy matches, |\n| | yellow represents Medium accuracy matches and red |\n| | represents Low accuracy/inaccurate matches**. If |\n| | you have many yellow and red values in your data, |\n| | you should manually check the results before |\n| | proceeding with analysis. All possible values:\\ |\n| | \\ |\n| | **Subaddress:** A subset of a PointAddress that |\n| | represents a house or building subaddress |\n| | location, such as an apartment unit, floor, or |\n| | individual building within a complex. The |\n| | UnitName, UnitType, LevelName, LevelType, |\n| | BldgName, and BldgType field values help to |\n| | distinguish subaddresses which may be associated |\n| | with the same PointAddress. Reference data |\n| | consists of point features with associated house |\n| | number, street name, and subaddress elements, |\n| | along with administrative divisions and optional |\n| | postal code; for example, 3836 Emerald Ave, Suite |\n| | C, La Verne, CA, 91750.\\ |\n| | \\ |\n| | **PointAddress:** A street address based on |\n| | points that represent house and building |\n| | locations. Typically, this is the most spatially |\n| | accurate match level. Reference data contains |\n| | address points with associated house numbers and |\n| | street names, along with administrative divisions |\n| | and optional postal code. The X / Y |\n| | (`Longitude`/`Latitude`) and `geometry` output |\n| | values for a PointAddress match represent the |\n| | street entry location for the address; this is |\n| | the location used for routing operations. The |\n| | `DisplayX` and `DisplayY` values represent the |\n| | rooftop, or actual, location of the address. |\n| | Example: 380 New York St, Redlands, CA, 92373.\\ |\n| | \\ |\n| | **StreetAddress** --- A street address that |\n| | differs from PointAddress because the house |\n| | number is interpolated from a range of numbers. 
|\n| | Reference data contains street center lines with |\n| | house number ranges, along with administrative |\n| | divisions and optional postal code information, |\n| | for example, 647 Haight St, San Francisco, CA, |\n| | 94117.\\ |\n| | \\ |\n| | **StreetInt:** A street address consisting of a |\n| | street intersection along with city and optional |\n| | state and postal code information. This is |\n| | derived from StreetAddress reference data, for |\n| | example, Redlands Blvd & New York St, Redlands, |\n| | CA, 92373.\\ |\n| | \\ |\n| | **StreetName:** Similar to a street address but |\n| | without the house number. Reference data contains |\n| | street centerlines with associated street names |\n| | (no numbered address ranges), along with |\n| | administrative divisions and optional postal |\n| | code, for example, W Olive Ave, Redlands, CA, |\n| | 92373.\\ |\n| | \\ |\n| | **StreetAddressExt:** An interpolated street |\n| | address match that is returned when parameter |\n| | matchOutOfRange=true and the input house number |\n| | exceeds the house number range for the matched |\n| | street segment.\\ |\n| | \\ |\n| | **DistanceMarker:** A street address that |\n| | represents the linear distance along a street, |\n| | typically in kilometers or miles, from a |\n| | designated origin location. Example: Carr 682 KM |\n| | 4, Barceloneta, 00617.\\ |\n| | \\ |\n| | **PostalExt:** A postal code with an additional |\n| | extension, such as the United States Postal |\n| | Service ZIP+4. Reference data is postal code |\n| | points with extensions, for example, 90210-3841.\\ |\n| | \\ |\n| | **POI:** ---Points of interest. Reference data |\n| | consists of administrative division place-names, |\n| | businesses, landmarks, and geographic features, |\n| | for example, Golden Gate Bridge.\\ |\n| | \\ |\n| | **Locality:** A place-name representing a |\n| | populated place. The Type output field provides |\n| | more detailed information about the type of |\n| | populated place. Possible Type values for |\n| | Locality matches include Block, Sector, |\n| | Neighborhood, District, City, MetroArea, County, |\n| | State or Province, Territory, Country, and Zone. |\n| | Example: Bogotá, COL,\\ |\n| | \\ |\n| | **PostalLoc:** A combination of postal code and |\n| | city name. Reference data is typically a union of |\n| | postal boundaries and administrative (locality) |\n| | boundaries, for example, 7132 Frauenkirchen.\\ |\n| | \\ |\n| | **Postal:** Postal code. Reference data is postal |\n| | code points, for example, 90210 USA. |\n+---------------+---------------------------------------------------+\n| Score | A number from 1--100 indicating the degree to |\n| | which the input tokens in a geocoding request |\n| | match the address components in a candidate |\n| | record. A score of 100 represents a perfect |\n| | match, while lower scores represent decreasing |\n| | match accuracy. |\n+---------------+---------------------------------------------------+\n| Status | Indicates whether a batch geocode request results |\n| | in a match, tie, or unmatched. Possible values |\n| | include\\ |\n| | \\ |\n| | M - Match. The returned address matches the input |\n| | address and is the highest scoring candidate.\\ |\n| | \\ |\n| | T - Tied. The returned address matches the input |\n| | address but has the same score as one or more |\n| | additional candidates.\\ |\n| | \\ |\n| | U - Unmatched. No addresses match the inputted |\n| | address. 
|\n+---------------+---------------------------------------------------+\n| geometry | The WKT (Well-known text) representation of the |\n| | latitudes and longitudes. This column may be |\n| | useful if you're reading the CSV into R, Python, |\n| | or ArcGIS |\n+---------------+---------------------------------------------------+\n| Region | The state that `Match_addr` is located in |\n+---------------+---------------------------------------------------+\n| RegionAbbr | Abbreviated State Name. For example, CA for |\n| | California |\n+---------------+---------------------------------------------------+\n| Subregion | The county that the input address is located in |\n+---------------+---------------------------------------------------+\n| MetroArea | The name of the Metropolitan area that |\n| | `Match_addr` is located in. This field may be |\n| | blank if the input address is not located within |\n| | a metro area. |\n+---------------+---------------------------------------------------+\n| City | The city that `Match_addr` is located in |\n+---------------+---------------------------------------------------+\n| Nbrhd | The Neighborhood that `Match_addr` is located in. |\n| | Note these are ESRI defined neighborhoods which |\n| | may or may not align with other sources |\n| | neighborhood definitions |\n+---------------+---------------------------------------------------+\n\n\\\n\n# Geospatial Modeling\n\nComing soon!\n\n# Bibliography and references\n\n------------------------------------------------------------------------\n\n```{r session-info}\n\nsessionInfo()\n```\n"},"formats":{"html":{"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[]},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"mapping.html"},"language":{},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.2.269","editor_options":{"markdown":{"wrap":72}}},"extensions":{"book":{"multiFile":true}}}}} \ No newline at end of file +{"title":"Introduction","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"editor_options":{"markdown":{"wrap":72}}},"headingText":"Introduction","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n```{r markdown-setup, 
include=FALSE}\nknitr::opts_chunk$set(fig.path = \"mapping/www/images/\")\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\nknitr::opts_chunk$set(warning = FALSE)\n\noptions(scipen = 999)\n```\n\n```{r setup, include=FALSE}\nlibrary(tidyverse)\nlibrary(knitr)\nlibrary(kableExtra)\nlibrary(here)\nlibrary(sf)\n```\n\n\nThis guide will teach you the concepts and code you will need for\nmapping and geospatial analysis in R. **This is a long guide, so if you\nneed something specific, we encourage you to scroll to the appropriate\nsection using the Table of Contents on the left.** If you just want copy\nand pasteable code to create different kinds of maps, head to the\n[`Map Gallery`](#map_gallery).\n\nNow let's start mapping!\n\n![](mapping/www/images/yay_maps.gif)\n\n## Geospatial Workflow\n\nThe picture below outlines what we think are the main steps in a\ngeospatial workflow. This guide will be split into sections describing\neach of the steps.\n\n![](mapping/www/images/geospatial_workflow.png)\n\n## Should this be a map?\n\nThe [Urban Institute Data Visualization Style\nGuide](http://urbaninstitute.github.io/graphics-styleguide/) offers some\nblunt but useful suggestions for maps:\n\n> Just because you've got geographic data, doesn't mean that you have to\n> make a map. Many times, there are more efficient storyforms that will\n> get your point across more clearly. If your data shows a very clear\n> geographic trend or if the absolute location of a place or event\n> matters, maps might be the best approach, but sometimes the reflexive\n> impulse to map the data can make you forget that showing the data in\n> another form might answer other---and sometimes more\n> important---questions.\n\nSo we would encourage you to think critically before making a map.\n\n## Why map with R?\n\nR can have a steeper learning curve than point-and-click tools - like\nQGIS or ArcGIS - for geospatial analysis and mapping. But creating maps\nin R has many advantages, including:\n\n1) **Reproducibility**: By creating maps with R code, you can easily\n share the outputs and the code that generated the output with\n collaborators, allowing them to replicate your work and catch errors\n easily.\n\n2) **Iteration**: With point-and-click software like ArcGIS, making 50\n maps would be 50 times the work/time. But using R, we can easily\n make many iterations of the same map with a few changes to the\n code.\n\n3) **Easy Updates**: Writing code provides a roadmap for others (and\n future you!) to quickly update parts of the map as needed. Say for\n example a collaborator wanted to change the legend colors of 50\n state maps. With R, this is possible in just a few seconds!\n\n4) **An expansive ecosystem**: There are several R packages that make\n it very easy to get spatial data, create static and interactive\n maps, and perform spatial analyses. This feature-rich package\n ecosystem, in which packages all play nicely together, is frankly\n unmatched by other programming languages and even point-and-click\n tools like QGIS and ArcGIS. Some of these R packages include:\n\n - `sf`: For managing and analyzing spatial dataframes\n - `tigris`: For downloading Census geographies\n - `ggplot2`: For making publication-ready static maps\n - `urbnmapr`: For automatically adding Urban styling to static\n maps\n - `mapview`: For making exploratory interactive maps\n\n5) **Cost**: Most point-and-click tools for geospatial analysis are\n proprietary and expensive. R is free open-source software. 
The\n software and most of its packages can be used for free by anyone for\n almost any use case.\n\n## Helpful Learning Resources\n\nIn addition to this guide, you may want to look at these other helpful\nresources:\n\n- The Urban Institute [mapping training\n series](https://ui-research.github.io/urbn101-mapping/) (with video\n lectures and notes)\n- Chapters\n [5](https://walker-data.com/census-r/census-geographic-data-and-applications-in-r.html),\n [6](https://walker-data.com/census-r/mapping-census-data-with-r.html),\n and\n [7](https://walker-data.com/census-r/spatial-analysis-with-us-census-data.html)\n from Kyle Walker's Analyzing US Census Data\n [book](https://walker-data.com/census-r/index.html).\n- Andrew Heiss' fantastic mapping\n [guide](https://datavizm20.classes.andrewheiss.com/example/12-example/)\n- All of the vignettes for the [`sf`\n package](https://cran.r-project.org/web/packages/sf/sf.pdf)\n- [Geocomputation with\n R](https://geocompr.robinlovelace.net/index.html): A book by Robin\n Lovelace and others\n- UChicago's R Spatial Workshops:\n \n\n# Get Spatial Data {#get_spatial_data}\n\n------------------------------------------------------------------------\n\n## library(sf) {.tabset .tabset-pills}\n\n### The short version\n\n`library(sf)` stores geospatial data, which are\n**points** (a single longitude/latitude),\n**lines** (a series of connected points), or\n**polygons** (a collection of points which\nenclose an area) in a `geometry` column within R dataframes.\n\n![](mapping/www/images/amtrak_points_lines_polygons.jpg)\n\nThis is what an `sf` dataframe looks like in the console:\n\n```{r print-sf-dataframe}\ndc_parks <- st_read(\"mapping/data/dc_parks.geojson\", \n\t\t\t\t\t\t\t\t\t\tquiet = TRUE)\n\n# Print just the NAME and geometry column\ndc_parks %>%\n select(NAME) %>%\n head(2)\n```\n\n### The long version\n\nThe `sf` library is a key tool for reading in, managing, and working\nwith spatial data in R. `sf` stands for simple features (not San\nFrancisco, you Bay Area folks) and denotes a way to describe the spatial\nattributes of real-life objects. The R object you will be working with\nmost frequently for mapping is an `sf` dataframe. An `sf` dataframe is\nessentially a regular R dataframe, with a couple of extra features for\nuse in mapping. These extra features exclusive to `sf` dataframes\ninclude:\n\n- sticky `geometry` columns\n- attached coordinate reference systems\n- some other spatial metadata\n\nThe most important of the above list is the sticky `geometry` column,\nwhich is a magical column that contains all of the geographic\ninformation for each row of data. Say, for example, you had an `sf`\ndataframe of all DC census tracts. Then the `geometry` column would\ncontain all of the geographic points used to define DC census tract\npolygons. The stickiness of this column means that no matter what data\nmunging/filtering you do, you will not be able to drop or delete the\n`geometry` column. 
Below is a graphic to help you understand this:\n\n![](mapping/www/images/sf_sticky_geometry.png)\n\ncredit: @allisonhorst\n\nThis is what an `sf` dataframe looks like in the console:\n\n```{r print_sf}\n# Read in spatial data about DC parks from DC Open Data Portal\ndc_parks <- st_read(\"https://opendata.arcgis.com/api/v3/datasets/287eaa2ecbff4d699762bbc6795ffdca_9/downloads/data?format=geojson&spatialRefId=4326\",\n\t\t\t\t\t\t\t\t\t\tquiet = TRUE)\n\n# dc_parks <- st_read(\"mapping/data/dc_parks.geojson\")\n\n# Select just a few columns for readability\ndc_parks <- dc_parks %>%\n select(NAME, geometry)\n\n# Print to the console\ndc_parks\n```\n\nNote that there is some spatial metadata such as the `Geometry Type`,\n`Bounding Box`, and `CRS`, which shows up as a header before the actual\ncontents of the dataframe.\n\nSince `sf` dataframes operate similarly to regular dataframes, we can\nuse all our familiar `tidyverse` functions for data wrangling, including\n`select`, `filter`, `rename`, `mutate`, `group_by`, and `summarize`. The\n`sf` package also has many functions that provide easy ways to replicate\ncommon tasks done in other GIS software, like spatial joins, clipping,\nand buffering. Almost all of the mapping and geospatial analysis methods\ndescribed in this guide rely on you having an `sf` dataframe. So let's\ntalk about how to get one!\n\n## Importing spatial data {.tabset .tabset-pills}\n\nGetting an `sf` dataframe is always the first step in the geospatial\nworkflow. Here's how to import spatial data for...\n\n### States and counties\n\nWe highly recommend using the `library(urbnmapr)` package, which was\ncreated by folks here at Urban to easily create state- and county-level\nmaps. The `get_urbn_map()` function in the package allows you to read in\nspatial data on states and counties, with options to include\nterritories. Importantly, it will also display AK and HI as insets on\nthe map in accordance with the Urban Institute Data Visualization Style\nGuide. For information on how to install `urbnmapr`, see the [GitHub\nrepository](https://github.com/UrbanInstitute/urbnmapr).\n\nBelow is an example of how you would use `urbnmapr` to get an `sf`\ndataframe of all the states or counties in the US.\n\n```{r urbnmapr-1, eval=FALSE}\nlibrary(urbnmapr)\n\n# Get state data\nstates <- get_urbn_map(\"states\", sf = TRUE)\n\n# Can also get county data\ncounties <- get_urbn_map(\"counties\", sf = TRUE)\n```\n\n### Other Census geographies\n\nUse the `library(tigris)` package, which allows you to easily download\nTIGER and other cartographic boundaries from the US Census Bureau. In\norder to automatically load in the boundaries as `sf` objects, run\n`r options(tigris_class = \"sf\")` once per R session.\n\n`library(tigris)` has all the standard census geographies, including\ncensus tracts, counties, CBSAs, ZCTAs, congressional districts, tribal\nareas, and more. It also includes other elements such as water, roads,\nand military bases.\n\nBy default, `library(tigris)` will download very large and\ndetailed TIGER line boundary files. For thematic mapping, the smaller\ncartographic boundary files are a better choice, as they are clipped to\nthe shoreline, generalized, and therefore usually smaller in size\nwithout losing too much accuracy. To load cartographic boundaries, use\nthe `cb = TRUE` argument. 
If you are doing detailed geospatial analysis\nand need the most detailed shapefiles, then you should use the detailed\nTIGER line boundary files and set `cb = FALSE`.\n\nBelow is an example of how you would use `library(tigris)` to get an `sf`\ndataframe of all Census tracts in DC for 2019.\n\n```{r tigris-1, eval=FALSE}\nlibrary(tigris)\n\n# Only need to set once per script\noptions(tigris_class = \"sf\")\n\ndc_tracts <- tracts(\n state = \"DC\",\n cb = TRUE,\n year = 2019\n)\n```\n\nUnlike `library(urbnmapr)`, different functions are used to get\ngeographic data for different geographic levels. For instance, the\n`blocks()` function will load census block data, and the\n`tracts()` function will load tract data. Other functions include\n`block_groups()`, `zctas()`, and `core_based_statistical_areas()`. For\nthe full list of supported geographies and functions, see the [package\nvignette](https://cran.r-project.org/web/packages/tigris/tigris.pdf).\n\nFor folks interested in pulling in Census demographic information along\nwith Census geographies, we recommend checking out the sister package to\n`library(tigris)`: `library(tidycensus)`. That package allows you to\ndownload Census variables and Census geographic data simultaneously.\n\n### Countries\n\nWe recommend using the `library(rnaturalearth)` package, which is\nsimilar to `library(tigris)` but allows you to download and use\nboundaries beyond the US. Instead of setting class to `sf` one time per\nsession as we did with `library(tigris)`, you must set the\n`returnclass = \"sf\"` argument each time you use a function from the\npackage. Below is an example of downloading an `sf` dataframe of all\nthe countries in the world.\n\n```{r natural-earth, eval = FALSE}\n\nlibrary(rnaturalearth)\n\nworld <- ne_countries(returnclass = \"sf\")\n\nggplot() +\n geom_sf(data = world, mapping = aes())\n```\n\n### Your own files\n\n#### Shapefiles/GeoJSONs\n\nShapefiles and GeoJSONs are two common spatial file formats you will find\nin the wild. `library(sf)` has a function called `st_read` which\nallows you to easily read in these files as `sf` dataframes. The only\nrequired argument is `dsn` or data source name. This is the filepath of\nthe `.shp` file or the `.geojson` file on your local computer. For\ngeojsons, `dsn` can also be a URL.\n\nBelow is an example of reading in a shapefile of fire stations in DC\nwhich is stored in `mapping/data/shapefiles/`. Note that shapefiles are\nactually stored as 6+ different files inside a folder. You need to\nprovide the filepath to the file ending in `.shp`.\n\n```{r list-files}\nlibrary(sf)\n\n# Print out all files in the directory\nlist.files(\"mapping/data/shapefiles\")\n\n# Read in .shp file\ndc_firestations <- st_read(\n dsn = \"mapping/data/shapefiles/Fire_Stations.shp\",\n quiet = TRUE\n)\n```\n\nAnd now `dc_firestations` is an `sf` dataframe you can use for all your\nmapping needs! `st_read` supports reading in a wide variety of other\nspatial file formats, including geodatabases, KML files, and over 200\nothers. For an incomplete list, please see this `sf`\n[vignette](https://r-spatial.github.io/sf/articles/sf2.html).\n\n#### CSVs or dataframes with lat/lons\n\nIf you have a CSV with geographic information stored in columns, you\nwill need to read in the CSV as a regular R dataframe and then convert\nto an `sf` dataframe. `library(sf)` contains the `st_as_sf()` function\nfor converting regular R dataframes into an `sf` dataframe. 
The two arguments you must specify for this function are:

- `coords`: A length-2 vector with the names of the columns corresponding to longitude and latitude (in that order!). For example, `c("lon", "lat")`.
- `crs`: The CRS (coordinate reference system) for your longitude/latitude coordinates. Remember you need to specify both the authority and the SRID code, for example "EPSG:4326". For more information on finding and setting CRS codes, please see the [`CRS`](#crs) section.

Below is an example of reading in data from a CSV and converting it to an `sf` dataframe.

```{r make-sf}
library(sf)

# Read in dataset of state capitals which is stored as a csv
state_capitals <- read_csv("mapping/data/state-capitals.csv")

state_capitals <- state_capitals %>%
  # Drop any rows with missing coordinates, as st_as_sf() will error on NA
  # values (this file happens to be complete, but it's a good habit)
  filter(!is.na(longitude), !is.na(latitude)) %>%
  # Specify names of the lon/lat columns in the CSV to use to make geometry col
  st_as_sf(
    coords = c("longitude", "latitude"),
    crs = 4326
  )
```

One common mistake: before converting to an `sf` dataframe, you must drop any rows that have `NA` values for latitude or longitude. If your data contains `NA` values in the coordinate columns, the `st_as_sf()` function will throw an error, which is why the example above filters out missing coordinates first.

## Appending spatial info to your data

Oftentimes, the data you are working with will just have state or county identifiers - like FIPS codes or state abbreviations - but will not contain any geographic information. In this case, you must do the extra work of downloading the geographic data as an `sf` dataframe and then joining your non-spatial data to the spatial data. Generally this involves 3 steps:

1) Reading in your own data as a data frame
2) Reading in the geographic data as an `sf` dataframe
3) Using `left_join()` to merge the geographic data with your own non-spatial data and create a new expanded `sf` dataframe

Let's say we had a dataframe on CHIP enrollment by state with state abbreviations.

```{r readin-chip-data}

# read the state CHIP data
chip_by_state <- read_csv("mapping/data/chip-enrollment.csv") %>%
  # clean column names so there are no random spaces/uppercase letters
  janitor::clean_names()

# print to the console
chip_by_state %>% head()
```

In order to convert this to an `sf` dataframe, we need to read in the spatial boundaries for each state and append them to our dataframe.
Here is how we do that with `get_urbn_map()` and `left_join()`.

```{r append-spatial-info, cache = FALSE}
library(urbnmapr)

# read in state geographic data from urbnmapr
states <- get_urbn_map(map = "states", sf = TRUE)

# left join state geographies to chip data
chip_with_geographies <- states %>%
  left_join(
    chip_by_state,
    # Specify the join columns, which are named slightly differently in states
    # and chip respectively
    by = c("state_abbv" = "state_abbreviation")
  )

chip_with_geographies %>%
  select(state_fips, state_abbv, chip_enrollment)
```

```{r append-state-pops, include = FALSE, eval = TRUE, echo = FALSE}
# TODO: DELETE THIS

# Read in data on state populations from 2010
state_pops <-
  read_csv("https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv",
    # Set this to disable printing column info to console
    col_types = cols()
  ) %>%
  filter(ages == "total", year == "2010") %>%
  select(state_abbv = `state/region`, population)

chip_with_geographies <- chip_with_geographies %>%
  # Specify left_join from tidylog to print summary messages
  tidylog::left_join(state_pops, by = "state_abbv") %>%
  # Calculate the chip enrollment percentage and append as a column
  mutate(chip_pct = chip_enrollment / population)
```

# Project

## Coordinate Reference Systems {#crs .tabset .tabset-pills}

### The short version

Just watch [this video](https://www.youtube.com/watch?v=vVX-PrBRtTY) and know the following:

- All spatial data has a CRS, which specifies how to identify a location on earth.

- It's important that all spatial datasets you are working with be in the same CRS. You can find the CRS with `st_crs()` and change the CRS with `st_transform()`.

- The Urban Institute Style Guide requires the use of the Atlas Equal Earth Projection (`"ESRI:102003"`) for national maps. For state and local maps, use [this](https://github.com/veltman/d3-stateplane) handy guide to find an appropriate State Plane projection.
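As a quick reference, here is a minimal sketch of those two functions, assuming `states` is an `sf` dataframe like the one created by `get_urbn_map()` above:

```{r crs-quick-reference, eval = FALSE}
library(sf)

# Check the current CRS
st_crs(states)

# Transform to the Atlas Equal Earth projection for a national map
states <- st_transform(states, "ESRI:102003")
```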
### The long version

Coordinate reference systems (CRS) specify the 3d shape of the earth and, optionally, how we project that 3d shape onto a 2d surface. They are an important part of working with spatial data, as you need to ensure that all the data you are working with are in the same CRS in order for spatial operations and maps to be accurate.

A CRS can be specified either by name (ie Maryland State Plane) or by **S**patial **R**eference System **ID**entifier (SRID). The SRID is a numeric identifier that uniquely identifies a coordinate reference system. Generally when referring to an SRID, you need to refer to an authority (ie the data source) and a unique ID. An example is `EPSG:26985`, which refers to the Maryland State Plane projection from the EPSG, or `ESRI:102003`, which refers to the Atlas Equal Earth projection from ESRI. Most CRS codes will be from the EPSG, and some from ESRI and others. A good resource for finding/validating CRS codes is [epsg.io](https://epsg.io/).

Sidenote - EPSG stands for the now defunct European Petroleum Survey Group. And while oil companies have generally been terrible for the earth, the one nice thing they did for the earth was to set up common standards for coordinate reference systems.

You might be thinking, well isn't the earth just a sphere? Why do we need all this complicated stuff? And the answer is, well, the earth is [kind of](https://oceanservice.noaa.gov/facts/earth-round.html) a sphere, but it's really more of a misshapen ellipsoid which is pudgier at the equator than at the poles. To visualize how coordinate reference systems work, imagine that the earth is a (lumpy) orange. Now peel the skin off the orange and try to flatten it. There are many ways to do it, but all will create [distortions](https://twitter.com/neilrkaye/status/1050740679008296967) of some kind. The CRS gives us the formula we've used to specify the shape of the orange (usually a sphere or ellipsoid of some kind) and, optionally, how we flattened the orange into 2d.

Broadly, there are two kinds of coordinate reference systems:

1) [**Geographic coordinate systems**](https://www.ibm.com/support/knowledgecenter/en/SSGU8G_12.1.0/com.ibm.spatial.doc/ids_spat_407.html)

    - (sometimes called unprojected coordinate systems)
    - Specifies a 3d shape for the earth
    - Uses a spheroid/ellipsoid to approximate the shape of the earth
    - Usually uses decimal degree units (ie latitude/longitude) to identify locations on earth

![](mapping/www/images/gcs_image.png)

2) [**Projected coordinate systems**](https://mgimond.github.io/Spatial/chp09-0.html#projected-coordinate-systems)

    - Specifies a 3d shape for the earth + a 2d mapping

    - Is a geographic coordinate system + a *projection*

    ![](mapping/www/images/projecting_xkcd.png)

    credit: [xkcd](https://imgs.xkcd.com/comics/projecting.png)

    - **projection**: mathematical formula used to convert a 3d coordinate system to a 2d flat coordinate system

    - Many different kinds of projections, including Equal Area, Equidistant, Conformal, etc

    - All projections distort the true shape of the earth in some way, either in terms of shape, area, or angle. Required [xkcd comic](https://xkcd.com/977/)

    - Usually use linear units (ie feet, meters) and are therefore useful for distance-based spatial operations (ie creating buffers)

## Finding the CRS

If you are lucky, your data will have embedded CRS data that will be automatically detected when the file is read in. This is usually the case for GeoJSONs (`.geojson`) and shapefiles (`.shp`). When you use `st_read()` on these files, you should see the CRS displayed in the metadata:

![](mapping/www/images/sf_crs_pic.png)

You can also use the `st_crs()` function to find the CRS. The CRS code is located at the end, in `ID[authority, SRID]`.

```{r st_crs}
st_crs(dc_firestations)
```

Sometimes, the CRS will be blank or `NA`, as the dataset did not specify the CRS. In that case you **MUST find and set the CRS for your data before proceeding** with analysis. Below are some good rules of thumb for finding out what the CRS for your data is:

- For GeoJSONs, the CRS should always be `EPSG:4326` (or WGS 84). The official GeoJSON specification states that this is the only valid CRS for GeoJSONs, but in the wild, this may not be true 100% of the time.
- For shapefiles, there should be a file that ends in `.prj` in the same directory as the `.shp` file.
  This file contains the projection information for that file and should be used automatically when reading in shapefiles.
- For CSVs with latitude/longitude columns, the CRS is usually `EPSG:4326` (or WGS 84).
- Look at the metadata and any accompanying documentation to see if the coordinate reference system for the data is specified.

If none of the above rules of thumb apply to you, check out the `crsuggest` R [package](https://github.com/walkerke/crsuggest).

Once you've identified the appropriate CRS, you can set the CRS for your data with `st_crs()`:

```{r set_crs, eval = FALSE}

# If you are certain that your data contains coordinates in the ESRI Atlas Equal Earth projection
st_crs(some_sf_dataframe) <- st_crs("ESRI:102003")
```

## Transforming the CRS

Often you will need to change the CRS for your `sf` dataframe so that all datasets you are using have the same CRS, or to use a projected CRS for performing more accurate spatial operations. You can do this with `st_transform()`:

```{r transform-crs}
# Transform the CRS from WGS 84 to the Urban-required Atlas Equal Earth projection
state_capitals <- state_capitals %>% st_transform("ESRI:102003")
```

`st_transform()` also allows you to just use the CRS of another `sf` dataframe when transforming.

```{r transform-crs-with-another-sf-object}
# Transform the CRS of chip_with_geographies to match the CRS of state_capitals
chip_with_geographies <- chip_with_geographies %>%
  st_transform(crs = st_crs(state_capitals))
```

If you are working with local data, you should use an appropriate state plane projection instead of the Atlas Equal Earth projection, which is meant for national maps. `library(crsuggest)` can simplify the process of picking an appropriate state plane CRS.

```{r crsuggest-ex, cache = TRUE}
library(crsuggest)

suggest_crs(dc_firestations) %>%
  # Use the value in the "crs_code" column to transform CRS's
  head(4)
```

# Map

In order to start mapping, you need an `sf` dataframe. If you don't have one, see the [`Get Spatial Data`](#get_spatial_data) section above.

## The basics

### library(ggplot2)

Most mapping in R fits the same theoretical framework as plotting in R using `library(ggplot2)`. To learn more about ggplot2, visit the Data Viz [page](https://urbaninstitute.github.io/r-at-urban/graphics-guide.html#Grammar_of_Graphics_and_Conventions) or read the official ggplot2 [book](https://ggplot2-book.org/).

The key function for mapping is **the special `geom_sf()` function** which works with `sf` dataframes. This function magically detects whether you have point or polygon spatial data and displays the results on a map.

### A simple map

To make a simple map, add `geom_sf()` to a `ggplot()` and set `data = an_sf_dataframe`. Below is code for making a map of all 50 states using `library(urbnmapr)`:

```{r first-map, cache = TRUE}
library(urbnmapr)

states <- get_urbn_map("states", sf = TRUE)

ggplot() +
  geom_sf(
    data = states,
    mapping = aes()
  )
```

## Styling

### `library(urbnthemes)`

`library(urbnthemes)` automatically styles maps in accordance with the [Urban Institute Data Visualization Style Guide](http://urbaninstitute.github.io/graphics-styleguide/).
By using `library(urbnthemes)`, you can create publication-ready maps you can immediately drop in to Urban research briefs or blog posts.

To install `urbnthemes`, visit the package's [GitHub repository](https://github.com/UrbanInstitute/urbnthemes) and follow the instructions. There are 2 ways to use the `urbnthemes` functions:

```{r urbnthemes}

library(urbnthemes)

# You can either run this once per script to automatically style all maps with
# the Urban theme
set_urbn_defaults(style = "map")

# Or you can add `+ theme_urbn_map()` to the end of every map you make
ggplot() +
  geom_sf(states, mapping = aes()) +
  theme_urbn_map()
```

### Layering

You can layer multiple points/lines/polygons on top of each other using the `+` operator from `library(ggplot2)`. The shapes will appear from bottom to top (ie the last mapped object will show up on top). It is important that all layers are in the same CRS (coordinate reference system).

```{r layers, cache = TRUE}

state_capitals <- state_capitals %>%
  # This will change the CRS to ESRI:102003 and shift the AK and HI state capital
  # point locations to the appropriate locations on the inset maps.
  tigris::shift_geometry() %>%
  # For now filter out AK and HI as their state capitals will be slightly off.
  filter(!state %in% c("Alaska", "Hawaii"))

ggplot() +
  geom_sf(
    data = states,
    mapping = aes()
  ) +
  # Note we change the data argument
  geom_sf(
    data = state_capitals,
    mapping = aes(),
    # urbnthemes library has urbn color palettes built in.
    color = palette_urbn_main["yellow"],
    size = 2.0
  ) +
  theme_urbn_map()
```

### Fill and Outline Colors

The same commands used to change colors, opacity, lines, size, etc. in charts can be used for maps too. To change the colors of the map, just use the `fill =` and `color =` parameters in `geom_sf()`. `fill` will change the fill color of polygons; `color` will change the color of polygon outlines, lines, and points.

Generally, maps that show the magnitude of a variable use the blue sequential ramp and maps that display positives and negatives use the diverging color ramp. `library(urbnthemes)` contains built-in helper variables (like `palette_urbn_main`) for accessing color palettes from the Urban Data Viz Style Guide. If for example you want states to be Urban's magenta color:

```{r urbnthemes-magenta}

ggplot() +
  geom_sf(states,
    mapping = aes(),
    # Adjust polygon fill color
    fill = palette_urbn_main["magenta"],
    # Adjust polygon outline color
    color = "white"
  ) +
  theme_urbn_map()
```

### Adding text

You can also add text, like state abbreviations, directly to your map using `geom_sf_text()` and the helper function `get_urbn_labels()`.

```{r geom_sf_text}
library(urbnmapr)

ggplot() +
  geom_sf(states,
    mapping = aes(),
    color = "white"
  ) +
  theme_urbn_map() +
  # Generates dataframe of state abbv and appropriate locations to plot them
  geom_sf_text(
    data = get_urbn_labels(
      map = "states",
      sf = TRUE
    ),
    aes(label = state_abbv),
    size = 3
  )
```

There's also `geom_sf_label()` if you want labels with a border.

# Map Gallery {#map_gallery}

Below are copy-and-pasteable examples of maps you can make, after you have an `sf` dataframe.

## Choropleth Maps

Choropleth maps display geographic areas with shades, colors, or patterns in proportion to a variable or variables.
Choropleth maps can represent massive geographies like the entire world and small geographies like Census tracts. To make a choropleth map, you need to set `geom_sf(aes(fill = some_variable_name))`. Below are some examples.

### Continuous color scale

```{r choropleth-continuous}
# Map of CHIP enrollment percentage by state
chip_with_geographies_map <- chip_with_geographies %>%
  ggplot() +
  geom_sf(aes(
    # Color in states by the chip_pct variable
    fill = chip_pct
  ))


# Below add-ons to the map are optional, but make the map look prettier.
chip_with_geographies_map +
  # scale_fill_gradientn adds colors with more interpolation and reverses color scale
  scale_fill_gradientn(
    # Convert legend from decimal to percentages
    labels = scales::percent_format(),
    # Make legend title more readable
    name = "CHIP Enrollment %",
    # Manually add 0 to lower limit to include it in legend. NA = use maximum value in data
    limits = c(0, NA),
    # Set number of breaks on legend = 3
    n.breaks = 3
  )
```

### Discrete color scale

The quick and dirty way is with `scale_fill_steps()`, which creates discretized bins for continuous variables:

```{r choropleth-discrete}
chip_with_geographies %>%
  ggplot() +
  geom_sf(aes(
    # Color in states by the chip_pct variable
    fill = chip_pct
  )) +
  scale_fill_steps(
    # Convert legend from decimal to percentages
    labels = scales::percent_format(),
    # Make legend title more readable
    name = "CHIP Enrollment %",
    # Show top and bottom limits on legend
    show.limits = TRUE,
    # Roughly set number of bins. Won't be exact as R uses algorithms under the
    # hood for pretty looking breaks.
    n.breaks = 4
  )
```

Often you will want to manually generate the bins yourself to give you more fine-grained control over the exact legend text (ie `1% - 1.8%`, `1.8% - 2.5%`, etc.). Below is an example of discretizing the continuous `chip_pct` variable yourself using `cut_interval()` and a helper function to get nice looking interval labels:

```{r format_intervals}

# Helper function to clean up R generated intervals into nice looking interval labels
format_interval <- function(interval_text) {
  text <- interval_text %>%
    # Remove open and close brackets which is R generated math notation
    str_remove_all("\\(") %>%
    str_remove_all("\\)") %>%
    str_remove_all("\\[") %>%
    str_remove_all("\\]") %>%
    str_replace_all(",", " — ")

  # Convert decimal ranges to percent ranges
  text <- text %>%
    str_split(" — ") %>%
    map(~ as.numeric(.x) %>%
      scales::percent() %>%
      paste0(collapse = " — ")) %>%
    unlist() %>%
    # By default character vectors are plotted in alphabetical order. We want
    # factors in reverse alphabetical order to get correct colors in ggplot
    fct_rev()

  return(text)
}

chip_with_geographies <- chip_with_geographies %>%
  # cut_interval bins chip_pct into n groups with equal range
  mutate(chip_pct_interval = cut_interval(chip_pct, n = 5)) %>%
  # Generate nice looking interval labels
  mutate(chip_pct_interval = format_interval(chip_pct_interval))
```

And now we can map the discretized `chip_pct_interval` variable using `geom_sf()`:

```{r make_discrete_map}
chip_with_geographies %>%
  ggplot() +
  geom_sf(aes(
    # Color in states by the chip_pct variable
    fill = chip_pct_interval
  )) +
  # Default is to use the main urban palette, which assumes unrelated groups. We
  # adjust colors manually to be on the Urban cyan palette
  scale_fill_manual(
    values = palette_urbn_cyan[c(8, 7, 5, 3, 1)],
    name = "CHIP Enrollment %"
  )
```

In addition to `cut_interval()` there are [similar functions](https://ggplot2.tidyverse.org/reference/cut_interval.html) for creating intervals/bins with slightly different rules. When creating bins, be careful, as changing the number of bins can drastically change how the map looks.

## Bubble Maps

This is just a layered map with one polygon layer and one point layer, where the points are sized in accordance with a variable in your data.

```{r bubble_maps, cache = TRUE}
set_urbn_defaults(style = "map")

# Get sf dataframe of DC tracts
library(tigris)
dc_tracts <- tracts(
  state = "DC",
  year = 2019,
  progress_bar = FALSE
)

# Add bubbles for firestations
ggplot() +
  geom_sf(data = dc_tracts, fill = palette_urbn_main["gray"]) +
  geom_sf(
    data = dc_firestations,
    # Size bubbles by number of trucks at each station
    aes(size = TRUCK),
    color = palette_urbn_main["yellow"],
    # Adjust transparency for readability
    alpha = 0.8
  )
```

## Dot-density Maps

These maps scatter dots within a geographic area. Typically each dot represents a unit (like 100 people, or 1000 houses). To create this kind of map, you need to start with an `sf` dataframe that is of `geometry` type `POLYGON` or `MULTIPOLYGON` and then sample points within each polygon.

The below code generates a dot-density map representing people of different races within Washington, DC tracts. The code may look a little complicated, but the key workhorse function is `st_sample()`, which samples points within each polygon to use in the dot-density map:

```{r dot_density_maps, cache = TRUE}
library(tidycensus)

# Get counts by race of DC tracts
dc_pop <- get_acs(
  geography = "tract",
  state = "DC",
  year = 2019,
  variables = c(
    Hispanic = "DP05_0071",
    White = "DP05_0077",
    Black = "DP05_0078",
    Asian = "DP05_0080"
  ),
  geometry = TRUE,
  progress_bar = FALSE
)

# Get unique groups (ie races)
groups <- unique(dc_pop$variable)

# For each unique group (ie race), generate sampled points
dc_race_dots <- map_dfr(groups, ~ {
  dc_pop %>%
    # .x = the group used in the loop
    filter(variable == .x) %>%
    # Use the projected MD state plane for accuracy
    st_transform(crs = "EPSG:6487") %>%
    # Have every dot represent 100 people
    mutate(est100 = as.integer(estimate / 100)) %>%
    st_sample(size = .$est100, exact = TRUE) %>%
    st_sf() %>%
    # Add group (ie race) as a column so we can use it when plotting
    mutate(group = .x)
})


ggplot() +
  # Plot tracts, then dots on top of tracts
  geom_sf(
    data = dc_pop,
    # Make interior of tracts transparent and boundaries black
    fill = "transparent",
    color = "black"
  ) +
  geom_sf(
    data = dc_race_dots,
    # Color in dots by racial group
    aes(color = group),
    # Adjust transparency and size to be more readable
    alpha = 0.5,
    size = 1.1,
    stroke = FALSE
  )
```

## Geofacets

Geofaceting arranges sub-geography-specific plots into a grid that resembles a larger geography (usually the US). This can be a useful alternative to choropleth maps, which tend to overemphasize large, sparsely populated areas. To make geofacetted charts, use the `facet_geo()` function from the `geofacet` library, which can be thought of as the equivalent of ggplot2's `facet_wrap()`.
For this example, we'll use the built-in `state_ranks` data.

```{r geofacet-data}
library(geofacet)

head(state_ranks %>% as_tibble())
```

```{r geofacet-ex, cache = TRUE}
set_urbn_defaults(style = "print")

state_ranks %>%
  filter(variable %in% c("education", "employment")) %>%
  ggplot(aes(x = rank, y = variable)) +
  geom_col() +
  facet_geo(
    facets = "state",
    # Use the custom Urban geofacet grid which is built into urbnthemes.
    # For now we need to rename a few columns until urbnthemes is updated
    grid = urbnthemes::urbn_geofacet %>%
      rename(
        code = state_code,
        name = state_name
      )
  )
```

Interactive geofacets of the United States have been used in Urban Features like [A Matter of Time](https://apps.urban.org/features/long-prison-terms/trends.html), which included geofaceted line charts showing trends in incarceration by state. Static geofacets of the United States were included in [Barriers to Accessing Homeownership Down Payment, Credit, and Affordability](https://www.urban.org/sites/default/files/publication/94801/barriers-to-homeownership-down-payments-credit-access-and-affordability_3.pdf) by the Housing Finance Policy Center.

### Tile grid map

You can select predefined grids, or create your own at <https://hafen.github.io/grid-designer/>.

```{r tile-grid-map}
# Create a grid with all of the US states and DC
mygrid <- data.frame(
  code = c("ME", "AK", "WI", "VT", "NH", "IL", "ID", "WA", "MN", "MT", "ND", "MI", "NY", "MA", "IA", "IN", "CT", "RI", "NJ", "PA", "OH", "SD", "WY", "NV", "OR", "CA", "NE", "DE", "MD", "VA", "WV", "KY", "MO", "CO", "UT", "AZ", "KS", "AR", "DC", "SC", "NC", "TN", "NM", "LA", "AL", "GA", "MS", "OK", "HI", "FL", "TX"),
  row = c(1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8),
  col = c(12, 2, 7, 11, 12, 7, 3, 2, 6, 4, 5, 8, 10, 11, 6, 7, 11, 12, 10, 9, 8, 5, 4, 3, 2, 2, 5, 11, 10, 9, 8, 7, 6, 4, 3, 3, 5, 6, 10, 9, 8, 7, 4, 6, 8, 9, 7, 5, 2, 10, 5),
  stringsAsFactors = FALSE
)

# Combine data into geo_grid for tiling:
geo_grid_data <- mygrid %>%
  left_join(chip_with_geographies, by = c("code" = "state_abbv"))

# Plot tile grid
geo_grid_data %>%
  ggplot(aes(x = col, y = row, fill = chip_pct_interval)) +
  scale_fill_manual(
    values = palette_urbn_cyan[c(8, 7, 5, 3, 1)],
    name = "CHIP Enrollment %"
  ) +
  geom_tile(color = "white", linewidth = 1) +
  geom_text(aes(label = code), color = "white", size = 4) +
  scale_y_reverse() +
  coord_equal() +
  labs(fill = NULL)
```


## Cartograms

Cartograms are a modified form of a choropleth map with intentionally distorted sizes that map to a variable in your data. Below we create a cartogram with `library(cartogram)` where the state sizes are proportional to the population.

```{r cartogram-example, cache = TRUE}
library(cartogram)

set_urbn_defaults(style = "map")

chip_with_geographies_weighted <- chip_with_geographies %>%
  # Note column name needs to be in quotes for this package
  cartogram_cont(weight = "population")

ggplot() +
  geom_sf(
    data = chip_with_geographies_weighted,
    # Color in states by chip percentages
    aes(fill = chip_pct)
  )
```

## Interactive Maps

Interactive maps can be a great exploratory tool for understanding your data.
And luckily, there are many R packages that make it easy to create them. Interactive maps are powerful, but **we do not recommend them for official use in Urban publications**, as getting them into Urban styles and onto appropriate basemaps can be tricky (reach out to [anarayanan\@urban.org](mailto:anarayanan@urban.org){.email} if you really want to include them).

### `library(mapview)`

`library(mapview)` is probably the most user friendly of the interactive mapping R libraries. All you have to do to create an interactive map is:

```{r show-mapview}
library(mapview)


chip_with_geographies_for_interactive_mapping <- chip_with_geographies %>%
  # Filter out AK and HI bc they would appear in Mexico. If you want AK and HI
  # in the correct place in interactive maps, make sure to use tigris::states()
  filter(!state_abbv %in% c("AK", "HI"))

mapview(chip_with_geographies_for_interactive_mapping)
```

When you click on an object, you get a popup table of all its attributes. And when you hover over an object, you get a popup with an object id.

Each of the above behaviors can be changed if desired. As you'll see in the below section, the syntax for `library(mapview)` is significantly different from `library(ggplot2)`, so be careful!

#### Coloring in points/polygons

In order to create a choropleth map where we color in the points/polygons by a variable, we need to feed in a column name *in quotes* to the `zcol` argument inside the `mapview()` function:

```{r mapview_zcol}
# Create interactive state map colored in by chip enrollment
mapview(chip_with_geographies_for_interactive_mapping, zcol = "chip_enrollment")
```

If you want more granular control over the color palette for the legend, you can also feed in a vector of color hex codes to `col.regions` along with a column name to `zcol`. This will create a continuous color range along the provided colors. Be careful though, as the color interpolation is not perfect.

```{r mapview-colors-granular}
# library(RColorBrewer)
mapview(chip_with_geographies_for_interactive_mapping,
  col.regions = c(
    palette_urbn_green[6],
    "white",
    palette_urbn_cyan[6]
  ),
  zcol = "chip_enrollment"
)
```

If you want to color in all points/polygons as the same color, just feed in a single color hex code to the `col.regions` argument:

```{r mapview-colors}
mapview(chip_with_geographies_for_interactive_mapping,
  col.regions = palette_urbn_green[5]
)
```

#### Adding layers

You can add multiple `sf` objects on the same map by using the `+` operator. This is very useful when comparing 2 or more spatial datasets.

```{r mapview-layers}
mapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) +
  mapview(state_capitals, col.regions = palette_urbn_cyan[5])
```

You can even create slider maps by using the `|` operator!

```{r mapview-sliders}
mapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) |
  mapview(state_capitals, col.regions = palette_urbn_cyan[5])
```

### More details

To learn more about more advanced options with `mapview` maps, check out the [documentation](https://r-spatial.github.io/mapview/articles/articles/mapview_02-advanced.html) page and the [reference manual](https://cran.r-project.org/web/packages/mapview/mapview.pdf).

There are also other interactive map making packages in R like `leaflet` (which `mapview` is a more user friendly wrapper of), `tmap`, and `mapdeck`.
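If you're curious what the lower-level `leaflet` syntax looks like, below is a minimal sketch using the same CHIP data. Note that `leaflet` expects unprojected WGS 84 coordinates (unlike `mapview`, it won't reproject for you), so we transform the CRS first:

```{r leaflet-sketch, eval = FALSE}
library(leaflet)

# A minimal leaflet map of the CHIP data; leaflet accepts sf dataframes
# but needs them in WGS 84 (EPSG:4326)
chip_with_geographies_for_interactive_mapping %>%
  st_transform("EPSG:4326") %>%
  leaflet() %>%
  # Add a basemap
  addTiles() %>%
  # Add the state polygons
  addPolygons()
```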
To learn about these other packages, [this book chapter](https://geocompr.robinlovelace.net/adv-map.html#interactive-maps) is a good starting point.

# Spatial Operations

## Cropping

Cropping (or clipping) is geographically filtering an `sf` dataframe to just the area we are interested in. Say we wanted to look at the roads around Fire Station 24 in DC.

```{r roads_cropping_before, cache = TRUE}
library(tigris)
library(units)

dc_firestations <- dc_firestations %>%
  st_transform("EPSG:6487")


# Draw a 500 meter circle around one fire station
fire_station_24_buffered <- dc_firestations %>%
  filter(NAME == "Engine 24 Station") %>%
  st_buffer(set_units(500, "meter"))

# Get listing of all roads in DC
dc_roads <- roads(
  state = "DC",
  county = "District of Columbia",
  class = "sf",
  progress_bar = FALSE
) %>%
  st_transform("EPSG:6487")

# View roads on top of the buffered fire station
ggplot() +
  # Order matters! We plot the buffer first, and then the roads on top, so the
  # overlapping roads are visible
  geom_sf(
    data = fire_station_24_buffered,
    fill = palette_urbn_cyan[1],
    color = palette_urbn_cyan[7]
  ) +
  geom_sf(
    data = dc_roads,
    color = palette_urbn_gray[7]
  ) +
  theme_urbn_map()
```

We can clip the larger roads dataframe to just the roads that overlap with the circle around the fire station with `st_intersection()`.

```{r roads_cropping_after}

# Use st_intersection() to crop the roads data to just roads within the
# fire station radius
dc_roads_around_fire_station_24_buffered <- fire_station_24_buffered %>%
  st_intersection(dc_roads)

ggplot() +
  geom_sf(
    data = fire_station_24_buffered,
    fill = palette_urbn_cyan[1],
    color = palette_urbn_cyan[7]
  ) +
  geom_sf(
    data = dc_roads_around_fire_station_24_buffered,
    color = palette_urbn_gray[7]
  ) +
  theme_urbn_map()
```

**More Coming Soon!**

## Calculating Distance

## Spatial Joins

### Point to Polygon

### Polygon to Polygon

## Aggregating

## Drive/Transit times

## Geocoding

Geocoding is the process of turning text (usually addresses) into geographic coordinates (usually latitudes/longitudes) for use in mapping. For Urban researchers, we highly recommend using the [Urban geocoder](https://tech-tools.urban.org/geocoding/), as it is fast, accurate, designed to work with sensitive/confidential data and, most importantly, free to use for Urban researchers! To learn about how we set up and chose the geocoder for the Urban Institute, you can read our [Data\@Urban blog](https://medium.com/@urban_institute/choosing-a-geocoder-for-the-urban-institute-86192f656c5f).

### Cleaning Addresses

The single most important factor in getting accurate geocoded data is having clean, well-structured address data. This can prove difficult, as address data out in the wild is often messy and unstandardized.
While the rules for cleaning addresses are very data specific, below are some examples of clean addresses you should aim for in your data cleaning process:

```{r cleaned-addr, cache = TRUE, eval = TRUE, echo = FALSE}
library(gt)
cleaned_address_table <- tribble(
  ~"f_address", ~"Type of address",
  "123 Troy Drive, Pillowtown, CO, 92432", "residential address",
  "789 Abed Avenue, Apt 666, Blanketsburg, CO, 92489", "residential apartment address",
  "Shirley Boulevard and Britta Drive, Blanketsburg, CO, 92489", "street intersection",
  "Pillowtown, CO", "city",
  "92489, CO", "Zip Code"
)

gt(cleaned_address_table) %>%
  # tab_header(title = md("Clean Address Examples")) %>%
  opt_row_striping(row_striping = TRUE) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold")
    ),
    locations = cells_column_labels(
      columns = vars(f_address, `Type of address`)
    )
  ) %>%
  opt_align_table_header(align = c("left")) %>%
  tab_options(
    container.width = "100%",
    container.height = "400px",
    # column_labels.background.color = palette_urbn_cyan[1],
    table.border.top.width = 0,
    table.border.bottom.width = 0,
    column_labels.border.bottom.width = 0
  )
```

All that being said, our geocoder is pretty tolerant of different address formats, typos/spelling errors, and missing states, zip codes, etc. So don't spend too much time cleaning every address in the data. Also note that while our geocoder is able to geocode cities and zip codes, it will return the lat/lon of the center of the city/zip code, which may not be what you want.

### Instructions

To use the [Urban geocoder](https://tech-tools.urban.org/geocoding/), you will need to:

1) Generate a CSV with a column named `f_address` which contains the addresses in single line format (ie `123 Abed Avenue, Blanketsburg, CO, 94328`). This means that if you have the addresses split across multiple columns (ie `Address`, `City`, `State`, `Zip` columns), you will need to concatenate them into one column (see the sketch after this list). Also see our Cleaning Addresses section above.

2) Go to the Urban geocoder and answer the initial questions. This will tell you whether your data is non-confidential or confidential data, and allow you to upload your CSV for geocoding.

3) Wait for an email telling you your results are ready. If your data is non-confidential, this email will contain a link to your geocoded results. This link expires in 24 hours, so make sure to download your data before then. If your data is confidential, the email will contain a link to the location on the Y Drive where your confidential geocoded data is stored. You can specify this output folder when submitting the CSV in step 1.
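As a sketch of step 1, here is one way to build the `f_address` column with `tidyr::unite()`, assuming your address components live in hypothetical `address`, `city`, `state`, and `zip` columns:

```{r build-f-address, eval = FALSE}
library(tidyverse)

# Hypothetical dataframe with address components split across columns
addresses <- tribble(
  ~address,          ~city,          ~state, ~zip,
  "123 Troy Drive",  "Pillowtown",   "CO",   "92432",
  "789 Abed Avenue", "Blanketsburg", "CO",   "92489"
)

# Concatenate the pieces into the single line f_address column the geocoder expects
addresses_for_geocoding <- addresses %>%
  unite(f_address, address, city, state, zip, sep = ", ")

write_csv(addresses_for_geocoding, "addresses_for_geocoding.csv")
```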

### Geocoder outputs

The geocoded file will be your original data, plus a few more columns (including latitude and longitude). The table below describes each of the new columns that have been appended to your original data. [It's very important that you take a look at the `Addr_type` column]{style="background-color: #FFFF00; font-weight: bold"} in the CSV before doing further analysis to check the accuracy of the geocoding process.

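A quick way to do that check is to tabulate `Addr_type`. Below is a minimal sketch, assuming your geocoded results are saved as `geocoded_results.csv` (a hypothetical filename):

```{r check-addr-type, eval = FALSE}
library(tidyverse)

# Read in the geocoded results
geocoded <- read_csv("geocoded_results.csv")

# Count the match levels; many matches below the point/street level warrant
# a closer manual look before further analysis
geocoded %>%
  count(Addr_type, sort = TRUE)
```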
| Column | Description |
|:-------|:------------|
| `Match_addr` | The actual address that the inputted address was matched to. This is the address that the geocoder used to get latitudes/longitudes. If there are potentially many typos or non-standard address formats in your data file, you will want to take a close look at this column to confirm that the matched address correctly handled typos and badly formatted addresses. |
| `Longitude` | The WGS 84 datum longitude (EPSG code 4326) |
| `Latitude` | The WGS 84 datum latitude (EPSG code 4326) |
| `Addr_type` | The match level for a geocode request. This should be used as an indicator of the precision of geocode results. Generally, Subaddress, PointAddress, StreetAddress, and StreetInt represent accurate matches. The list below the table contains all possible values for this field. **Green values represent high accuracy matches, yellow represents medium accuracy matches, and red represents low accuracy/inaccurate matches.** If you have many yellow and red values in your data, you should manually check the results before proceeding with analysis. |
| `Score` | A number from 1-100 indicating the degree to which the input tokens in a geocoding request match the address components in a candidate record. A score of 100 represents a perfect match, while lower scores represent decreasing match accuracy. |
| `Status` | Indicates whether a batch geocode request results in a match, tie, or unmatched. Possible values: **M** = Match (the returned address matches the input address and is the highest scoring candidate); **T** = Tied (the returned address matches the input address but has the same score as one or more additional candidates); **U** = Unmatched (no addresses match the inputted address). |
| `geometry` | The WKT (well-known text) representation of the latitudes and longitudes. This column may be useful if you're reading the CSV into R, Python, or ArcGIS. |
| `Region` | The state that `Match_addr` is located in |
| `RegionAbbr` | Abbreviated state name. For example, CA for California |
| `Subregion` | The county that the input address is located in |
| `MetroArea` | The name of the metropolitan area that `Match_addr` is located in. This field may be blank if the input address is not located within a metro area. |
| `City` | The city that `Match_addr` is located in |
| `Nbrhd` | The neighborhood that `Match_addr` is located in. Note these are ESRI-defined neighborhoods, which may or may not align with other sources' neighborhood definitions. |

All possible values for `Addr_type`:

- **Subaddress:** A subset of a PointAddress that represents a house or building subaddress location, such as an apartment unit, floor, or individual building within a complex. The UnitName, UnitType, LevelName, LevelType, BldgName, and BldgType field values help to distinguish subaddresses which may be associated with the same PointAddress. Reference data consists of point features with associated house number, street name, and subaddress elements, along with administrative divisions and optional postal code; for example, 3836 Emerald Ave, Suite C, La Verne, CA, 91750.

- **PointAddress:** A street address based on points that represent house and building locations. Typically, this is the most spatially accurate match level. Reference data contains address points with associated house numbers and street names, along with administrative divisions and optional postal code. The X/Y (`Longitude`/`Latitude`) and `geometry` output values for a PointAddress match represent the street entry location for the address; this is the location used for routing operations. The `DisplayX` and `DisplayY` values represent the rooftop, or actual, location of the address. Example: 380 New York St, Redlands, CA, 92373.

- **StreetAddress:** A street address that differs from PointAddress because the house number is interpolated from a range of numbers. Reference data contains street center lines with house number ranges, along with administrative divisions and optional postal code information, for example, 647 Haight St, San Francisco, CA, 94117.

- **StreetInt:** A street address consisting of a street intersection along with city and optional state and postal code information. This is derived from StreetAddress reference data, for example, Redlands Blvd & New York St, Redlands, CA, 92373.

- **StreetName:** Similar to a street address but without the house number. Reference data contains street centerlines with associated street names (no numbered address ranges), along with administrative divisions and optional postal code, for example, W Olive Ave, Redlands, CA, 92373.

- **StreetAddressExt:** An interpolated street address match that is returned when parameter matchOutOfRange=true and the input house number exceeds the house number range for the matched street segment.

- **DistanceMarker:** A street address that represents the linear distance along a street, typically in kilometers or miles, from a designated origin location. Example: Carr 682 KM 4, Barceloneta, 00617.

- **PostalExt:** A postal code with an additional extension, such as the United States Postal Service ZIP+4. Reference data is postal code points with extensions, for example, 90210-3841.

- **POI:** Points of interest. Reference data consists of administrative division place-names, businesses, landmarks, and geographic features, for example, Golden Gate Bridge.

- **Locality:** A place-name representing a populated place. The Type output field provides more detailed information about the type of populated place. Possible Type values for Locality matches include Block, Sector, Neighborhood, District, City, MetroArea, County, State or Province, Territory, Country, and Zone. Example: Bogotá, COL.

- **PostalLoc:** A combination of postal code and city name. Reference data is typically a union of postal boundaries and administrative (locality) boundaries, for example, 7132 Frauenkirchen.

- **Postal:** Postal code. Reference data is postal code points, for example, 90210 USA.

# Geospatial Modeling

Coming soon!

# Bibliography and references

------------------------------------------------------------------------

```{r session-info}

sessionInfo()
```
If your data shows a very clear\n> geographic trend or if the absolute location of a place or event\n> matters, maps might be the best approach, but sometimes the reflexive\n> impulse to map the data can make you forget that showing the data in\n> another form might answer other---and sometimes more\n> important---questions.\n\nSo we would encourage you to think critically before making a map.\n\n## Why map with R?\n\nR can have a steeper learning curve than point-and-click tools - like\nQGIS or ArcGIS - for geospatial analysis and mapping. But creating maps\nin R has many advantages including:\n\n1) **Reproducibility**: By creating maps with R code, you can easily\n share the outputs and the code that generated the output with\n collaborators, allowing them to replicate your work and catch errors\n easily.\n\n2) **Iteration**: With point and click software like ArcGIS, making 50\n maps would be 50 times the work/time. But using R, we can easily\n make make many iterations of the same map with a few changes to the\n code.\n\n3) **Easy Updates**: Writing code provides a roadmap for others (and\n future you!) to quickly update parts of the map as needed. Say for\n example a collaborator wanted to change the legend colors of 50\n state maps. With R, this is possible in just a few seconds!\n\n4) **An Expansive ecosystem**: There are several R packages that make\n it very easy to get spatial data, create static and interactive\n maps, and perform spatial analyses. This feature rich package\n ecosystem which all play nice together is frankly unmatched by other\n programming languages and even point and click tools like QGIS and\n ArcGIS. Some of these R packages include:\n\n - `sf`: For managing and analyzing spatial dataframes\n - `tigris`: For downloading in Census geographies\n - `ggplot2`: For making publication ready static maps\n - `urbnmapr`: For automatically adding Urban styling to static\n maps\n - `mapview`: For making expxploratory interactive maps\n\n5) **Cost**: Most point-and-click tools for geospatial analysis are\n proprietary and expensive. R is free open-source software. 
The\n software and most of its packages can be used for free by anyone for\n almost any use case.\n\n## Helpful Learning Resources\n\nIn addition to this guide, you may want to look at these other helpful\nresources:\n\n- The Urban Institute [mapping training\n series](https://ui-research.github.io/urbn101-mapping/) (with video\n lectures and notes)\n- Chapters\n [5](https://walker-data.com/census-r/census-geographic-data-and-applications-in-r.html),\n [6](https://walker-data.com/census-r/mapping-census-data-with-r.html),\n and\n [7](https://walker-data.com/census-r/spatial-analysis-with-us-census-data.html)\n from Kyle Walker's Analyzing US Census Data\n [book](https://walker-data.com/census-r/index.html).\n- Andrew Heiss' fantastic mapping\n [guide](https://datavizm20.classes.andrewheiss.com/example/12-example/)\n- All of the vignettes for the [`sf`\n package](https://cran.r-project.org/web/packages/sf/sf.pdf)\n- [Geocomputation with\n R](https://geocompr.robinlovelace.net/index.html): A book by Robin\n Lovelace and others\n- UChicago's R Spatial Workshops:\n \n\n# Get Spatial Data {#get_spatial_data}\n\n------------------------------------------------------------------------\n\n## library(sf) {.tabset .tabset-pills}\n\n### The short version\n\n`library(sf)` stores geospatial data, which are\n**points** (a single longitude/latitude),\n**lines** (a pair of connected points), or\n**polygons** (a collection of points which\nmake a polygon) in a `geometry` column within R dataframes\n\n![](mapping/www/images/amtrak_points_lines_polygons.jpg)\n\nThis is what `sf` dataframe looks like in the console:\n\n```{r print-sf-dataframe}\ndc_parks <- st_read(\"mapping/data/dc_parks.geojson\", \n\t\t\t\t\t\t\t\t\t\tquiet = TRUE)\n\n# Print just the NAME and geometry column\ndc_parks %>%\n select(NAME) %>%\n head(2)\n```\n\n### The long version\n\nThe `sf` library is a key tool for reading in, managing, and working\nwith spatial data in R. `sf` stands for simple features (not San\nFrancisco you Bay Area folks) and denotes a way to describe the spatial\nattributes of real life objects. The R object you will be working with\nmost frequently for mapping is an `sf` dataframe. An `sf` dataframe is\nessentially a regular R dataframe, with a couple of extra features for\nuse in mapping. These extra features exclusive to `sf` dataframes\ninclude:\n\n- sticky `geometry` columns\n- attached coordinate reference systems\n- some other spatial metadata\n\nThe most important of the above list is the sticky `geometry` column,\nwhich is a magical column that contains all of the geographic\ninformation for each row of data. Say for example you had a `sf`\ndataframe of all DC census tracts. Then the `geometry` column would\ncontain all of the geographic points used to define DC census tract\npolygons. The stickiness of this column means that no matter what data\nmunging/filtering you do, you will not be able to drop or delete the\n`geometry` column. 
Below is a graphic to help you understand this:\n\n![](mapping/www/images/sf_sticky_geometry.png)\n\ncredits: @allisonhorst\n\nThis is what an `sf` dataframe looks like in the console:\n\n```{r print_sf}\n# Read in spatial data about DC parks from DC Open Data Portal\ndc_parks <- st_read(\"https://opendata.arcgis.com/api/v3/datasets/287eaa2ecbff4d699762bbc6795ffdca_9/downloads/data?format=geojson&spatialRefId=4326\",\n\t\t\t\t\t\t\t\t\t\tquiet = TRUE)\n\n# dc_parks <- st_read(\"mapping/data/dc_parks.geojson\")\n\n# Select just a few columns for readability\ndc_parks <- dc_parks %>%\n select(NAME, geometry)\n\n# Print to the console\ndc_parks\n```\n\nNote that there is some spatial metadata such as the `Geometry Type`,\n`Bounding Box`, and `CRS` which shows up as a header before the actual\ncontents of the dataframe.\n\nSince `sf` dataframes operate similarly to regular dataframes, we can\nuse all our familiar `tidyverse` functions for data wrangling, including\n`select`, `filter`, `rename`, `mutate`, `group_by` and `summarize`. The\n`sf` package also has many functions that provide easy ways to replicate\ncommon tasks done in other GIS software like spatial joins, clipping,\nand buffering. Almost all of the mapping and geospatial analysis methods\ndescribed in this guide rely on you having an `sf` dataframe. So let's\ntalk about how to get one!\n\n## Importing spatial data {.tabset .tabset-pills}\n\nGetting an `sf` dataframe is always the first step in the geospatial\nworkflow. Here's how to import spatial data for...\n\n### States and counties\n\nWe highly recommend using the `library(urbnmapr)` package, which was\ncreated by folks here at Urban to easily create state and county level\nmaps. The `get_urbn_map()` function in the package allows you to read in\nspatial data on states and counties, with options to include\nterritories. Importantly, it will also display AL and HI as insets on\nthe map in accordance with the Urban Institute Data Visualization Style\nGuide. For information on how to install `urbnmapr`, see the [GitHub\nrepository](https://github.com/UrbanInstitute/urbnmapr).\n\nBelow is an example of how you would use `urbnmapr` to get an `sf`\ndataframe of all the states or counties in the US.\n\n```{r urbnmapr-1, eval=FALSE}\nlibrary(urbnmapr)\n\n# Get state data\nstates <- get_urbn_map(\"states\", sf = TRUE)\n\n# Can also get county data\ncounties <- get_urbn_map(\"counties\", sf = TRUE)\n```\n\n### Other Census geographies\n\nUse the `library(tigris)` package, which allows you to easily download\nTIGER and other cartographic boundaries from the US Census Bureau. In\norder to automatically load in the boundaries as `sf` objects, run\n`r options(tigris_class = \"sf\")` once per R session.\n\n`library(tigris)` has all the standard census geographies, including\ncensus tracts, counties, CBSAs, ZCTAs, congressional districts, tribal\nareas, and more. It also includes other elements such as water, roads,\nand military bases.\n\nBy default, `libraray(tigris)` will download large very large and\ndetailed TIGER line boundary files. For thematic mapping, the smaller\ncartographic boundary files are a better choice, as they are clipped to\nthe shoreline, generalized, and therefore usually smaller in size\nwithout losing too much accuracy. To load cartographic boundaries, use\nthe `cb = TRUE` argument. 
If you are doing detailed geospatial analysis\nand need the most detailed shapefiles, then you should use the detailed\nTIGER line boundary files and set `cb = FALSE`.\n\nBelow is an example of how you would use `library(tigris)` to get a `sf`\ndataframe of all Census tracts in DC for 2019.\n\n```{r tigris-1, eval=FALSE}\nlibrary(tigris)\n\n# Only need to set once per script\noptions(tigris_class = \"sf\")\n\ndc_tracts <- tracts(\n state = \"DC\",\n cb = TRUE,\n year = 2019\n)\n```\n\nUnlike `library(urbnmapr)`, different functions are used to get\ngeographic data for different geographic levels. For instance, the\n`blocks()` function will load census block group data, and the\n`tracts()` function will load tract data. Other functions include\n`block_groups()`, `zctas()` , and `core_based_statistical_areas()`. For\nthe full list of supported geographies and functions, see the [package\nvignette](https://cran.r-project.org/web/packages/tigris/tigris.pdf).\n\nFor folks interested in pulling in Census demographic information along\nwith Census geographies, we recommend checking out the sister package to\n`library(tigris)`: `library(tidycensus)`. That package allows you to\ndownload in Census variables and Census geographic data simultaneously.\n\n### Countries\n\nWe recommend using the `library(rnaturalearth)` package, which is\nsimilar to `library(tigris)` but allows you to download and use\nboundaries beyond the US. Instead of setting class to `sf` one time per\nsession as we did with `library(tigris)`, you must set the\n`returnclass = \"sf\"` argument each time you use a function from the\npackage. Below is an example of downloading in an `sf` dataframe of all\nthe countries in the world.\n\n```{r natural-earth, eval = FALSE}\n\nlibrary(rnaturalearth)\n\nworld <- ne_countries(returnclass = \"sf\")\n\nggplot() +\n geom_sf(data = world, mapping = aes())\n```\n\n### Your own files\n\n#### Shapefiles/GeoJSONS\n\nShapefiles and GeoJSONs are 2 common spatial file formats you will found\nout in the wild. `library(sf)` has a function called `st_read` which\nallows you to easily read in these files as `sf` dataframes. The only\nrequired argument is `dsn` or data source name. This is the filepath of\nthe `.shp` file or the `.geojson` file on your local computer. For\ngeojsons, `dsn` can also be a URL.\n\nBelow is an example of reading in a shapefile of fire stations in DC\nwhich is stored in `mapping/data/shapefiles/`. Note that shapefiles are\nactually stored as 6+ different files inside a folder. You need to\nprovide the filepath to the file ending in `.shp`.\n\n```{r list f-ei}\nlibrary(sf)\n\n# Print out all files in the directory\nlist.files(\"mapping/data/shapefiles\")\n\n# Read in .shp file\ndc_firestations <- st_read(\n dsn = \"mapping/data/shapefiles/Fire_Stations.shp\",\n quiet = TRUE\n)\n```\n\nAnd now `dc_firestations` is an `sf` dataframe you can use for all your\nmapping needs! `st_read` supports reading in a wide variety of other\nspatial file formats, including geodatabases, KML files, and over 200\nothers. For an incomplete list, please see the this `sf`\n[vignette](https://r-spatial.github.io/sf/articles/sf2.html).\n\n#### CSVs or dataframes with lat/lons\n\nIf you have a CSV with geographic information stored in columns, you\nwill need to read in the CSV as a regular R dataframe and then convert\nto an `sf` dataframe. `library(sf)` contains the `st_as_sf()` function\nfor converting regular R dataframes into an `sf` dataframe. 
\n\n#### CSVs or dataframes with lat/lons\n\nIf you have a CSV with geographic information stored in columns, you\nwill need to read in the CSV as a regular R dataframe and then convert\nto an `sf` dataframe. `library(sf)` contains the `st_as_sf()` function\nfor converting regular R dataframes into an `sf` dataframe. The two\narguments you must specify for this function are:\n\n- `coords`: A length 2 vector with the names of the columns\n corresponding to longitude and latitude (in that order!). For\n example, `c("lon", "lat")`.\n- `crs`: The CRS (coordinate reference system) for your\n longitude/latitude coordinates. Remember you need to specify both\n the authority and the SRID code, for example `"EPSG:4326"`. For more\n information on finding and setting CRS codes, please see the\n [`CRS`](#crs) section.\n\nBelow is an example of reading in data from a CSV and converting it to\nan `sf` dataframe.\n\n```{r make-sf}\nlibrary(sf)\n\n# Read in dataset of state capitals which is stored as a csv\nstate_capitals <- read_csv("mapping/data/state-capitals.csv")\n\nstate_capitals <- state_capitals %>%\n # Specify names of the lon/lat columns in the CSV to use to make geometry col\n st_as_sf(\n coords = c("longitude", "latitude"),\n crs = 4326\n )\n```\n\nOne common mistake: before converting to an `sf` dataframe, you must\ndrop any rows that have `NA` values for latitude or longitude. If those\ncolumns contain any `NA` values, the `st_as_sf()` function will throw an\nerror.
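\n\nBelow is a minimal sketch of dropping those rows first, reusing the\nstate capitals CSV from above (an illustration we added, not part of the\noriginal workflow):\n\n```{r drop-na-coords, eval = FALSE}\n# Drop rows with missing coordinates before converting to an sf dataframe\nstate_capitals <- read_csv("mapping/data/state-capitals.csv") %>%\n filter(!is.na(longitude), !is.na(latitude)) %>%\n st_as_sf(\n coords = c("longitude", "latitude"),\n crs = 4326\n )\n```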
\n\n## Appending spatial info to your data\n\nOftentimes, the data you are working with will just have state or county\nidentifiers - like FIPS codes or state abbreviations - but will not\ncontain any geographic information. In this case, you must do the extra\nwork of downloading the geographic data as an `sf` dataframe and then\njoining your non-spatial data to the spatial data. Generally this\ninvolves 3 steps:\n\n1) Reading in your own data as a data frame\n2) Reading in the geographic data as an `sf` dataframe\n3) Using `left_join` to merge the geographic data with your own\n non-spatial data and create a new expanded `sf` dataframe\n\nLet's say we had a dataframe on CHIP enrollment by state with state\nabbreviations.\n\n```{r readin-chip-data}\n# read the state CHIP data\nchip_by_state <- read_csv("mapping/data/chip-enrollment.csv") %>%\n # clean column names so there are no random spaces/uppercase letters\n janitor::clean_names()\n\n# print to the console\nchip_by_state %>% head()\n```\n\nIn order to convert this to an `sf` dataframe, we need to read in the\nspatial boundaries for each state and append them to our dataframe. Here\nis how we do that with `get_urbn_map()` and `left_join()`.\n\n```{r append-spatial-info, cache = FALSE}\nlibrary(urbnmapr)\n\n# read in state geographic data from urbnmapr\nstates <- get_urbn_map(map = "states", sf = TRUE)\n\n# left join state geographies to chip data\nchip_with_geographies <- states %>%\n left_join(\n chip_by_state,\n # Specify the join column, which is named slightly differently in the\n # two dataframes\n by = c("state_abbv" = "state_abbreviation")\n )\n\nchip_with_geographies %>%\n select(state_fips, state_abbv, chip_enrollment)\n```\n\n```{r append-state-pops, include = FALSE, eval = TRUE, echo = FALSE}\n# Read in data on state populations from 2010\nstate_pops <-\n read_csv("https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv",\n # Set this to disable printing column info to console\n col_types = cols()\n ) %>%\n filter(ages == "total", year == "2010") %>%\n select(state_abbv = `state/region`, population)\n\nchip_with_geographies <- chip_with_geographies %>%\n # Specify left_join from tidylog to print summary messages\n tidylog::left_join(state_pops, by = "state_abbv") %>%\n # Calculate the chip enrollment percentage and append as a column\n mutate(chip_pct = chip_enrollment / population)\n```\n\n# Project\n\n## Coordinate Reference Systems {#crs .tabset .tabset-pills}\n\n### The short version\n\nJust watch [this video](https://www.youtube.com/watch?v=vVX-PrBRtTY)\nand know the following:\n\n- All spatial data has a CRS, which specifies how to identify a\n location on earth.\n\n- It's important that all spatial datasets you are working with be in\n the same CRS. You can find the CRS with `st_crs()` and change the\n CRS with `st_transform()`.\n\n- The Urban Institute Style Guide requires the use of the Atlas Equal\n Earth Projection (`"ESRI:102003"`) for national maps. For state and\n local maps, use [this](https://github.com/veltman/d3-stateplane)\n handy guide to find an appropriate State Plane projection.\n\n### The long version\n\nCoordinate reference systems (CRS) specify the 3d shape of the earth and\noptionally how we project that 3d shape onto a 2d surface. They are an\nimportant part of working with spatial data as you need to ensure that\nall the data you are working with are in the same CRS in order for\nspatial operations and maps to be accurate.\n\nA CRS can be specified either by name (ie Maryland State Plane) or by\n**S**patial **R**eference System **ID**entifier (SRID). The SRID is a\nnumeric identifier that uniquely identifies a coordinate reference\nsystem. Generally when referring to an SRID, you need to refer to an\nauthority (ie the data source) and a unique ID. An example is\n`EPSG:26985`, which refers to the Maryland State Plane projection from\nthe EPSG, or `ESRI:102003`, which refers to the Atlas Equal Earth\nprojection from ESRI. Most CRS codes will be from the EPSG, with some\nfrom ESRI and other authorities. A good resource for finding/validating\nCRS codes is [epsg.io](https://epsg.io).\n\nSidenote: EPSG stands for the now-defunct European Petroleum Survey\nGroup. And while oil companies have generally been terrible for the\nearth, one nice thing they did was set up common standards for\ncoordinate reference systems.\n\nYou might be thinking: isn't the earth just a sphere? Why do we need\nall this complicated stuff? 
The answer is that the earth is [kind\nof](https://oceanservice.noaa.gov/facts/earth-round.html) a sphere, but\nit's really more of a misshapen ellipsoid, pudgier at the equator than\nat the poles. To visualize how coordinate reference systems work,\nimagine that the earth is a (lumpy) orange. Now peel the skin off an\norange and try to flatten it. There are many ways to do it, but all\nwill create\n[distortions](https://twitter.com/neilrkaye/status/1050740679008296967)\nof some kind. The CRS gives us the formula we've used to specify the\nshape of the orange (usually a sphere or ellipsoid of some kind) and,\noptionally, how we flattened the orange into 2d.\n\nBroadly, there are two kinds of Coordinate Reference Systems:\n\n1) [**Geographic coordinate\n systems**](https://www.ibm.com/support/knowledgecenter/en/SSGU8G_12.1.0/com.ibm.spatial.doc/ids_spat_407.html)\n\n - (sometimes called unprojected coordinate systems)\n - Specifies a 3d shape for the earth\n - Uses a spheroid/ellipsoid to approximate shape of the earth\n - Usually use decimal degree units (ie latitude/longitude) to\n identify locations on earth\n\n![](mapping/www/images/gcs_image.png)\n\n2) [**Projected coordinate\n systems**](https://mgimond.github.io/Spatial/chp09-0.html#projected-coordinate-systems)\n\n - Specifies a 3d shape for the earth + a 2d mapping\n\n - Is a geographic coordinate system + a *projection*\n\n ![](mapping/www/images/projecting_xkcd.png)\n\n credit: [xkcd](https://imgs.xkcd.com/comics/projecting.png)\n\n - **projection**: mathematical formula used to convert a 3d\n coordinate system to a 2d flat coordinate system\n\n - Many different kinds of projections, including Equal Area,\n Equidistant, Conformal, etc\n\n - All projections distort the true shape of the earth in some\n way, either in terms of shape, area, or angle. Required\n [xkcd comic](https://xkcd.com/977/)\n\n - Usually use linear units (ie feet, meters) and are therefore\n useful for distance-based spatial operations (ie creating\n buffers) (see the sketch below)
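\n\nTo see the difference in units, here is a minimal sketch (our addition,\nassuming `library(sf)` is loaded; `$units_gdal` reports the units of a\nCRS):\n\n```{r crs-units-sketch, eval = FALSE}\n# Geographic CRS: coordinates are in degrees\nst_crs("EPSG:4326")$units_gdal\n\n# Projected CRS (Maryland State Plane): coordinates are in meters\nst_crs("EPSG:26985")$units_gdal\n```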
\n\n## Finding the CRS\n\nIf you are lucky, your data will have embedded CRS data that will be\nautomatically detected when the file is read in. This is usually the\ncase for GeoJSONs (`.geojson`) and shapefiles (`.shp`). When you use\n`st_read()` on these files, you should see the CRS displayed in the\nmetadata:\n\n![](mapping/www/images/sf_crs_pic.png)\n\nYou can also use the `st_crs()` function to find the CRS. The CRS code\nis located at the end in `ID[authority, SRID]`.\n\n```{r st_crs}\nst_crs(dc_firestations)\n```\n\nSometimes, the CRS will be blank or `NA` because the dataset did not\nspecify the CRS. In that case you **MUST find and set the CRS for your\ndata before proceeding** with analysis. Below are some good rules of\nthumb for finding out what the CRS for your data is:\n\n- For geojsons, the CRS should always be `EPSG:4326` (or WGS 84). The\n official geojson specification states that this is the only valid\n CRS for geojsons, but in the wild, this may not be true 100% of the\n time.\n- For shapefiles, there should be a file that ends in `.prj` in the\n same directory as the `.shp` file. This file contains the projection\n information for that file and should be used automatically when\n reading in shapefiles.\n- For CSVs with latitude/longitude columns, the CRS is usually\n `EPSG:4326` (or WGS 84).\n- Look at the metadata and any accompanying documentation to see if\n the coordinate reference system for the data is specified.\n\nIf none of the above rules of thumb apply to you, check out the\n`crsuggest` R [package](https://github.com/walkerke/crsuggest).\n\nOnce you've identified the appropriate CRS, you can set the CRS for your\ndata with `st_crs()`:\n\n```{r set_crs, eval = FALSE}\n# If you are certain that your data contains coordinates in the ESRI\n# Atlas Equal Earth projection\nst_crs(some_sf_dataframe) <- st_crs("ESRI:102003")\n```\n\n## Transforming the CRS\n\nOften you will need to change the CRS for your `sf` dataframe so that\nall datasets you are using have the same CRS, or to use a projected CRS\nfor performing more accurate spatial operations. You can do this with\n`st_transform`:\n\n```{r transform-crs}\n# Transform the CRS from WGS 84 to the Urban-required Atlas Equal Earth\n# projection\nstate_capitals <- state_capitals %>% st_transform("ESRI:102003")\n```\n\n`st_transform()` also allows you to just use the CRS of another `sf`\ndataframe when transforming.\n\n```{r transform-crs-with-another-sf-object}\n# Transform the CRS of chip_with_geographies to match the CRS of\n# state_capitals\nchip_with_geographies <- chip_with_geographies %>%\n st_transform(crs = st_crs(state_capitals))\n```\n\nIf you are working with local data, you should use an appropriate state\nplane projection instead of the Atlas Equal Earth projection, which is\nmeant for national maps. `library(crsuggest)` can simplify the process\nof picking an appropriate state plane CRS.\n\n```{r crsuggest-ex, cache = TRUE}\nlibrary(crsuggest)\n\nsuggest_crs(dc_firestations) %>%\n # Use the value in the "crs_code" column to transform CRS's\n head(4)\n```\n\n# Map\n\nIn order to start mapping, you need an `sf` dataframe. If you don't have\none, see the [`Get Spatial Data`](#get_spatial_data) section above.\n\n## The basics\n\n### library(ggplot2)\n\nMost mapping in R fits the same theoretical framework as plotting in R\nusing `library(ggplot2)`. To learn more about ggplot2, visit the Data\nViz\n[page](https://urbaninstitute.github.io/r-at-urban/graphics-guide.html#Grammar_of_Graphics_and_Conventions)\nor read the official ggplot2 [book](https://ggplot2-book.org/).\n\nThe key function for mapping is **the special `geom_sf()` function**\nwhich works with `sf` dataframes. This function magically detects\nwhether you have point, line, or polygon spatial data and displays the\nresults on a map.\n\n### A simple map\n\nTo make a simple map, add `geom_sf()` to a `ggplot()` and set\n`data = an_sf_dataframe`. Below is code for making a map of all 50\nstates using `library(urbnmapr)`:\n\n```{r first-map, cache = TRUE}\nlibrary(urbnmapr)\n\nstates <- get_urbn_map("states", sf = TRUE)\n\nggplot() +\n geom_sf(\n data = states,\n mapping = aes()\n )\n```
\n\n## Styling\n\n### `library(urbnthemes)`\n\n`library(urbnthemes)` automatically styles maps in accordance with the\n[Urban Institute Data Visualization Style\nGuide](http://urbaninstitute.github.io/graphics-styleguide/). By using\n`library(urbnthemes)`, you can create publication-ready maps you can\nimmediately drop into Urban research briefs or blog posts.\n\nTo install `urbnthemes`, visit the package's [GitHub\nrepository](https://github.com/UrbanInstitute/urbnthemes) and follow the\ninstructions. There are 2 ways to use the `urbnthemes` functions:\n\n```{r urbnthemes}\nlibrary(urbnthemes)\n\n# You can either run this once per script to automatically style all maps with\n# the Urban theme\nset_urbn_defaults(style = "map")\n\n# Or you can add `+ theme_urbn_map()` to the end of every map you make\nggplot() +\n geom_sf(states, mapping = aes()) +\n theme_urbn_map()\n```\n\n### Layering\n\nYou can layer multiple points/lines/polygons on top of each other using\nthe `+` operator from `library(ggplot2)`. The shapes will appear from\nbottom to top (ie the last mapped object will show up on top). It is\nimportant that all layers are in the same CRS (coordinate reference\nsystem).\n\n```{r layers, cache = TRUE}\nstate_capitals <- state_capitals %>%\n # This will change CRS to ESRI:102003 and shift the AK and HI state capitals\n # point locations to the appropriate locations on the inset maps.\n tigris::shift_geometry() %>%\n # For now, filter out AK and HI as their state capitals would be slightly off.\n filter(!state %in% c("Alaska", "Hawaii"))\n\nggplot() +\n geom_sf(\n data = states,\n mapping = aes()\n ) +\n # Note we change the data argument\n geom_sf(\n data = state_capitals,\n mapping = aes(),\n # urbnthemes library has urbn color palettes built in.\n color = palette_urbn_main["yellow"],\n size = 2.0\n ) +\n theme_urbn_map()\n```\n\n### Fill and Outline Colors\n\nThe same commands used to change colors, opacity, lines, size, etc. in\ncharts can be used for maps too. To change the colors of the map, just\nuse the `fill =` and `color =` parameters in `geom_sf()`. `fill` will\nchange the fill color of polygons; `color` will change the color of\npolygon outlines, lines, and points.\n\nGenerally, maps that show the magnitude of a variable use the blue\nsequential ramp and maps that display positives and negatives use the\ndiverging color ramp. `library(urbnthemes)` contains built-in helper\nvariables (like `palette_urbn_main`) for accessing color palettes from\nthe Urban Data Viz Style Guide. If, for example, you want states to be\nUrban's magenta color:\n\n```{r urbnthemes-pink}\nggplot() +\n geom_sf(states,\n mapping = aes(),\n # Adjust polygon fill color\n fill = palette_urbn_main["magenta"],\n # Adjust polygon outline color\n color = "white"\n ) +\n theme_urbn_map()\n```\n\n### Adding text\n\nYou can also add text, like state abbreviations, directly to your map\nusing `geom_sf_text` and the helper function `get_urbn_labels()`.\n\n```{r geom_sf_text}\nlibrary(urbnmapr)\n\nggplot() +\n geom_sf(states,\n mapping = aes(),\n color = "white"\n ) +\n theme_urbn_map() +\n # Generates dataframe of state abbv and appropriate location to plot them\n geom_sf_text(\n data = get_urbn_labels(\n map = "states",\n sf = TRUE\n ),\n aes(label = state_abbv),\n size = 3\n )\n```\n\nThere's also `geom_sf_label()` if you want labels with a border.
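\n\nFor instance, a minimal sketch that swaps in `geom_sf_label()`, with\neverything else the same as above:\n\n```{r geom-sf-label-sketch, eval = FALSE}\nggplot() +\n geom_sf(states,\n mapping = aes(),\n color = "white"\n ) +\n theme_urbn_map() +\n # Same as geom_sf_text(), but draws a small border around each label\n geom_sf_label(\n data = get_urbn_labels(map = "states", sf = TRUE),\n aes(label = state_abbv),\n size = 3\n )\n```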
\n\n# Map Gallery {#map_gallery}\n\nBelow are copy and pasteable examples of maps you can make, after you\nhave an `sf` dataframe.\n\n## Choropleth Maps\n\nChoropleth maps display geographic areas with shades, colors, or\npatterns in proportion to a variable or variables. Choropleth maps can\nrepresent massive geographies like the entire world and small\ngeographies like Census Tracts. To make a choropleth map, you need to\nmap a variable to the fill aesthetic:\n`geom_sf(aes(fill = some_variable_name))`. Below are examples.\n\n### Continuous color scale\n\n```{r choropleth-continuous}\n# Map of CHIP enrollment percentage by state\nchip_with_geographies_map <- chip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct variable\n fill = chip_pct\n ))\n\n# Below add-ons to the map are optional, but make the map look prettier.\nchip_with_geographies_map +\n # scale_fill_gradientn gives finer control over the continuous color scale\n scale_fill_gradientn(\n # Convert legend from decimal to percentages\n labels = scales::percent_format(),\n # Make legend title more readable\n name = "CHIP Enrollment %",\n # Manually set the lower limit to 0 to include it in the legend;\n # NA = use the maximum value in the data\n limits = c(0, NA),\n # Set number of breaks on legend = 3\n n.breaks = 3\n )\n```\n\n### Discrete color scale\n\nThe quick and dirty way is with `scale_fill_steps()`, which creates\ndiscretized bins for continuous variables:\n\n```{r choropleth-discrete}\nchip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct variable\n fill = chip_pct\n )) +\n scale_fill_steps(\n # Convert legend from decimal to percentages\n labels = scales::percent_format(),\n # Make legend title more readable\n name = "CHIP Enrollment %",\n # Show top and bottom limits on legend\n show.limits = TRUE,\n # Roughly set number of bins. Won't be exact as R uses algorithms under the\n # hood for pretty looking breaks.\n n.breaks = 4\n )\n```\n\nOften you will want to generate the bins manually to get more\nfine-grained control over the exact legend text (ie `1% - 1.8%`,\n`1.8% - 2.5%`, etc.). Below is an example of discretizing the continuous\n`chip_pct` variable yourself using `cut_interval()` and a helper\nfunction to get nice-looking interval labels:\n\n```{r format_intervals}\n# Helper function to clean up R-generated intervals into nice looking\n# interval labels\nformat_interval <- function(interval_text) {\n text <- interval_text %>%\n # Remove open and close brackets, which are R-generated math notation\n str_remove_all("\\\\(") %>%\n str_remove_all("\\\\)") %>%\n str_remove_all("\\\\[") %>%\n str_remove_all("\\\\]") %>%\n str_replace_all(",", " — ")\n\n # Convert decimal ranges to percent ranges\n text <- text %>%\n str_split(" — ") %>%\n map(~ as.numeric(.x) %>%\n scales::percent() %>%\n paste0(collapse = " — ")) %>%\n unlist() %>%\n # By default character vectors are plotted in alphabetical order. We want\n # factors in reverse alphabetical order to get correct colors in ggplot\n fct_rev()\n\n return(text)\n}\n\nchip_with_geographies <- chip_with_geographies %>%\n # cut_interval into n groups with equal range. Set boundary so 0 is included in the bins\n mutate(chip_pct_interval = cut_interval(chip_pct, n = 5)) %>%\n # Generate nice looking interval labels\n mutate(chip_pct_interval = format_interval(chip_pct_interval))\n```\n\nAnd now we can map the discretized `chip_pct_interval` variable using\n`geom_sf()`:\n\n```{r make_discrete_map}\nchip_with_geographies %>%\n ggplot() +\n geom_sf(aes(\n # Color in states by the chip_pct variable\n fill = chip_pct_interval\n )) +\n # Default is to use the main Urban palette, which assumes unrelated groups.
\n # We adjust colors manually to be on the Urban cyan palette\n scale_fill_manual(\n values = palette_urbn_cyan[c(8, 7, 5, 3, 1)],\n name = "CHIP Enrollment %"\n )\n```\n\nIn addition to `cut_interval` there are [similar\nfunctions](https://ggplot2.tidyverse.org/reference/cut_interval.html)\nfor creating intervals/bins with slightly different rules. When creating\nbins, be careful, as changing the number of bins can drastically change\nhow the map looks.\n\n## Bubble Maps\n\nThis is just a layered map with one polygon layer and one point layer,\nwhere the points are sized in accordance with a variable in your data.\n\n```{r bubble_maps, cache = TRUE}\nset_urbn_defaults(style = "map")\n\n# Get sf dataframe of DC tracts\nlibrary(tigris)\ndc_tracts <- tracts(\n state = "DC",\n year = 2019,\n progress_bar = FALSE\n)\n\n# Add bubbles for firestations\nggplot() +\n geom_sf(data = dc_tracts, fill = palette_urbn_main["gray"]) +\n geom_sf(\n data = dc_firestations,\n # Size bubbles by number of trucks at each station\n aes(size = TRUCK),\n color = palette_urbn_main["yellow"],\n # Adjust transparency for readability\n alpha = 0.8\n )\n```\n\n## Dot-density Maps\n\nThese maps scatter dots within a geographic area. Typically each dot\nrepresents a unit (like 100 people, or 1000 houses). To create this kind\nof map, you need to start with an `sf` dataframe that is of `geometry`\ntype `POLYGON` or `MULTIPOLYGON` and then sample points within each\npolygon.\n\nThe below code generates a dot-density map representing people of\ndifferent races within Washington, DC tracts. The code may look a little\ncomplicated, but the key workhorse function is `st_sample()`, which\nsamples points within each polygon to use in the dot-density map:\n\n```{r dot_density_maps, cache = TRUE}\nlibrary(tidycensus)\n\n# Get counts by race of DC tracts\ndc_pop <- get_acs(\n geography = "tract",\n state = "DC",\n year = 2019,\n variables = c(\n Hispanic = "DP05_0071",\n White = "DP05_0077",\n Black = "DP05_0078",\n Asian = "DP05_0080"\n ),\n geometry = TRUE,\n progress_bar = FALSE\n)\n\n# Get unique groups (ie races)\ngroups <- unique(dc_pop$variable)\n\n# For each unique group (ie race), generate sampled points\ndc_race_dots <- map_dfr(groups, ~ {\n dc_pop %>%\n # .x = the group used in the loop\n filter(variable == .x) %>%\n # Use the projected MD state plane for accuracy\n st_transform(crs = "EPSG:6487") %>%\n # Have every dot represent 100 people\n mutate(est100 = as.integer(estimate / 100)) %>%\n st_sample(size = .$est100, exact = TRUE) %>%\n st_sf() %>%\n # Add group (ie race) as a column so we can use it when plotting\n mutate(group = .x)\n})\n\nggplot() +\n # Plot tracts, then dots on top of tracts\n geom_sf(\n data = dc_pop,\n # Make interior of tracts transparent and boundaries black\n fill = "transparent",\n color = "black"\n ) +\n geom_sf(\n data = dc_race_dots,\n # Color in dots by racial group\n aes(color = group),\n # Adjust transparency and size to be more readable\n alpha = 0.5,\n size = 1.1,\n stroke = FALSE\n )\n```
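\n\nOne note: `st_sample()` draws points at random, so the dots will move\neach time the code is re-run. If you want reproducible dot placement,\nset a seed first; a one-line sketch (our addition):\n\n```{r dot-density-seed, eval = FALSE}\n# Fix the random seed so st_sample() draws the same points on every run\nset.seed(20201013)\n```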
\n\n## Geofacets\n\nGeofaceting arranges sub-geography-specific plots into a grid that\nresembles a larger geography (usually the US). This can be a useful\nalternative to choropleth maps, which tend to overemphasize sparsely\npopulated areas that cover large land areas. To make geofaceted charts,\nuse the `facet_geo()` function from the `geofacet` library, which can be\nthought of as equivalent to ggplot2's `facet_wrap()`. For this example,\nwe'll use the built-in `state_ranks` data.\n\n```{r geofacet-data}\nlibrary(geofacet)\n\nhead(state_ranks %>% as_tibble())\n```\n\n```{r geofacet-ex, cache = TRUE}\nset_urbn_defaults(style = "print")\n\nstate_ranks %>%\n filter(variable %in% c("education", "employment")) %>%\n ggplot(aes(x = rank, y = variable)) +\n geom_col() +\n facet_geo(\n facets = "state",\n # Use the custom Urban geofacet grid built into urbnthemes.\n # For now we need to rename a few columns until urbnthemes is\n # updated\n grid = urbnthemes::urbn_geofacet %>%\n rename(\n code = state_code,\n name = state_name\n )\n )\n```\n\nInteractive geofacets of the United States have been used in Urban\nFeatures like [A Matter of\nTime](https://apps.urban.org/features/long-prison-terms/trends.html),\nwhich included geofaceted line charts showing trends in incarceration by\nstate. Static geofacets of the United States were included in [Barriers\nto Accessing Homeownership: Down Payment, Credit, and\nAffordability](https://www.urban.org/sites/default/files/publication/94801/barriers-to-homeownership-down-payments-credit-access-and-affordability_3.pdf)\nby the Housing Finance Policy Center.\n\n### Tile grid map\n\nYou can select predefined grids, or create your own at\n<https://hafen.github.io/grid-designer/>.\n\n```{r tile-grid-map}\n# create a grid with all 50 states plus DC\nmygrid <- data.frame(\n code = c("ME", "AK", "WI", "VT", "NH", "IL", "ID", "WA", "MN", "MT", "ND", "MI", "NY", "MA", "IA", "IN", "CT", "RI", "NJ", "PA", "OH", "SD", "WY", "NV", "OR", "CA", "NE", "DE", "MD", "VA", "WV", "KY", "MO", "CO", "UT", "AZ", "KS", "AR", "DC", "SC", "NC", "TN", "NM", "LA", "AL", "GA", "MS", "OK", "HI", "FL", "TX"),\n row = c(1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8),\n col = c(12, 2, 7, 11, 12, 7, 3, 2, 6, 4, 5, 8, 10, 11, 6, 7, 11, 12, 10, 9, 8, 5, 4, 3, 2, 2, 5, 11, 10, 9, 8, 7, 6, 4, 3, 3, 5, 6, 10, 9, 8, 7, 4, 6, 8, 9, 7, 5, 2, 10, 5),\n stringsAsFactors = FALSE\n)\n\n# Combine data into geo_grid for tiling\ngeo_grid_data <- mygrid %>%\n left_join(chip_with_geographies, by = c("code" = "state_abbv"))\n\n# Plot tile grid\ngeo_grid_data %>%\n ggplot(aes(x = col, y = row, fill = chip_pct_interval)) +\n scale_fill_manual(\n values = palette_urbn_cyan[c(8, 7, 5, 3, 1)],\n name = "CHIP Enrollment %"\n ) +\n geom_tile(color = "white", linewidth = 1) +\n geom_text(aes(label = code), color = "white", size = 4) +\n scale_y_reverse() +\n coord_equal() +\n labs(fill = NULL)\n```\n\n## Cartograms\n\nCartograms are a modified form of a choropleth map with intentionally\ndistorted sizes that map to a variable in your data. Below we create a\ncartogram with `library(cartogram)` where the state sizes are\nproportional to the population.\n\n```{r cartogram-example, cache = TRUE}\nlibrary(cartogram)\n\nset_urbn_defaults(style = "map")\n\nchip_with_geographies_weighted <- chip_with_geographies %>%\n # Note column name needs to be in quotes for this package\n cartogram_cont(weight = "population")\n\nggplot() +\n geom_sf(\n data = chip_with_geographies_weighted,\n # Color in states by chip percentages\n aes(fill = chip_pct)\n )\n```\n\n## Interactive Maps\n\nInteractive maps can be a great tool for exploring and understanding\nyour data. 
And luckily there are a lot of new R packages that\nmake it really easy to create them. Interactive maps are powerful, but\n**we do not recommend them for official use in Urban publications** as\ngetting them in Urban styles and appropriate basemaps can be tricky\n(reach out to\n[anarayanan\\@urban.org](mailto:anarayanan@urban.org){.email} if you\nreally want to include them).\n\n### `library(mapview)`\n\n`library(mapview)` is probably the most user-friendly of the interactive\nmapping R libraries. All you have to do to create an interactive map is:\n\n```{r show-mapview}\nlibrary(mapview)\n\nchip_with_geographies_for_interactive_mapping <- chip_with_geographies %>%\n # Filter out AK and HI because urbnmapr shifts them to insets, which would\n # appear near Mexico on an interactive map. If you want AK and HI in the\n # correct place in interactive maps, make sure to use tigris::states()\n filter(!state_abbv %in% c("AK", "HI"))\n\nmapview(chip_with_geographies_for_interactive_mapping)\n```\n\nWhen you click on an object, you get a popup table of all of its\nattributes. And when you hover over an object, you get a popup with an\nobject id.\n\nEach of the above behaviors can be changed if desired. As you'll see in\nthe below section, the syntax for `library(mapview)` is significantly\ndifferent from `library(ggplot2)`, so be careful!\n\n#### Coloring in points/polygons\n\nIn order to create a choropleth map where we color in the\npoints/polygons by a variable, we need to feed in a column name *in\nquotes* to the `zcol` argument inside the `mapview()` function:\n\n```{r mapview_zcol}\n# Create interactive state map colored in by chip enrollment\nmapview(chip_with_geographies_for_interactive_mapping, zcol = "chip_enrollment")\n```\n\nIf you want more granular control over the color palette for the legend,\nyou can also feed in a vector of color hex codes to `col.regions` along\nwith a column name to `zcol`. This will create a continuous color range\nalong the provided colors. Be careful though, as the color interpolation\nis not perfect.\n\n```{r mapview-colors-granular}\n# library(RColorBrewer)\nmapview(chip_with_geographies_for_interactive_mapping,\n col.regions = c(\n palette_urbn_green[6],\n "white",\n palette_urbn_cyan[6]\n ),\n zcol = "chip_enrollment"\n)\n```\n\nIf you want to color all points/polygons the same color, just feed in a\nsingle color hex code to the `col.regions` argument:\n\n```{r mapview-colors}\nmapview(chip_with_geographies_for_interactive_mapping,\n col.regions = palette_urbn_green[5]\n)\n```\n\n#### Adding layers\n\nYou can add multiple `sf` objects on the same map by using the `+`\noperator. This is very useful when comparing 2 or more spatial datasets.\n\n```{r mapview-layers}\nmapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) +\n mapview(state_capitals, col.regions = palette_urbn_cyan[5])\n```\n\nYou can even create slider maps by using the `|` operator!\n\n```{r mapview-sliders}\nmapview(chip_with_geographies_for_interactive_mapping, col.regions = palette_urbn_green[5]) |\n mapview(state_capitals, col.regions = palette_urbn_cyan[5])\n```\n\n### More details\n\nTo learn more about more advanced options with `mapview` maps, check out\nthe\n[documentation](https://r-spatial.github.io/mapview/articles/articles/mapview_02-advanced.html)\npage and the [reference\nmanual](https://cran.r-project.org/web/packages/mapview/mapview.pdf).\n\nThere are also other interactive map-making packages in R like `leaflet`\n(which `mapview` is a more user-friendly wrapper around), `tmap`, and\n`mapdeck`. 
To learn about these other packages, [this book\nchapter](https://geocompr.robinlovelace.net/adv-map.html#interactive-maps)\nis a good starting point.\n\n# Spatial Operations\n\n## Cropping\n\nCropping (or clipping) is geographically filtering an `sf` dataframe to\njust the area we are interested in. Say we wanted to look at the roads\naround Fire Station 24 in DC.\n\n```{r roads_cropping_before, cache = TRUE}\nlibrary(tigris)\nlibrary(units)\n\ndc_firestations <- dc_firestations %>%\n st_transform("EPSG:6487")\n\n# Draw a 500 meter circle around one fire station\nfire_station_24_buffered <- dc_firestations %>%\n filter(NAME == "Engine 24 Station") %>%\n st_buffer(set_units(500, "meter"))\n\n# Get listing of all roads in DC\ndc_roads <- roads(\n state = "DC",\n county = "District of Columbia",\n class = "sf",\n progress_bar = FALSE\n) %>%\n st_transform("EPSG:6487")\n\n# View roads on top of the buffered fire station\nggplot() +\n # Order matters! We need to plot the buffered fire station first, and then\n # the roads on top so both layers are visible\n geom_sf(\n data = fire_station_24_buffered,\n fill = palette_urbn_cyan[1],\n color = palette_urbn_cyan[7]\n ) +\n geom_sf(\n data = dc_roads,\n color = palette_urbn_gray[7]\n ) +\n theme_urbn_map()\n```\n\nWe can clip the larger roads dataframe to just the roads that overlap\nthe circle around the fire station with `st_intersection()`.\n\n```{r roads_cropping_after}\n# Use st_intersection() to crop the roads data to just roads within the\n# fire station radius\ndc_roads_around_fire_station_24_buffered <- fire_station_24_buffered %>%\n st_intersection(dc_roads)\n\nggplot() +\n geom_sf(\n data = fire_station_24_buffered,\n fill = palette_urbn_cyan[1],\n color = palette_urbn_cyan[7]\n ) +\n geom_sf(\n data = dc_roads_around_fire_station_24_buffered,\n color = palette_urbn_gray[7]\n ) +\n theme_urbn_map()\n```\n\n**More Coming Soon!**\n\n## Calculating Distance\n\n## Spatial Joins\n\n### Point to Polygon\n\n### Polygon to Polygon\n\n## Aggregating\n\n## Drive/Transit times\n\n## Geocoding\n\nGeocoding is the process of turning text (usually addresses) into\ngeographic coordinates (usually latitudes/longitudes) for use in\nmapping. For Urban researchers, we highly recommend using the [Urban\ngeocoder](https://tech-tools.urban.org/geocoding/) as it is fast,\naccurate, designed to work with sensitive/confidential data, and, most\nimportantly, free to use! To learn about how we set up and chose the\ngeocoder for the Urban Institute, you can read our\n[Data\\@Urban\nblog](https://medium.com/@urban_institute/choosing-a-geocoder-for-the-urban-institute-86192f656c5f).\n\n### Cleaning Addresses\n\nThe single most important factor in getting accurate geocoded data is\nhaving clean, well-structured address data. This can prove difficult, as\naddress data out in the wild is often messy and unstandardized. 
While\nthe rules for cleaning addresses are very data-specific, below are some\nexamples of clean addresses you should aim for in your data cleaning\nprocess:\n\n```{r cleaned-addr, cache=TRUE,eval=TRUE,results=TRUE, echo=FALSE}\nlibrary(gt)\ncleaned_address_table <- tribble(\n ~"f_address", ~"Type of address",\n "123 Troy Drive, Pillowtown, CO, 92432", "residential address",\n "789 Abed Avenue, Apt 666, Blankesburg, CO, 92489", "residential apartment address",\n "Shirley Boulevard and Britta Drive, Blanketsburg, CO, 92489", "street intersection",\n "Pillowtown, CO", "city",\n "92489, CO", "Zip Code",\n)\n\ngt(cleaned_address_table) %>%\n # tab_header(title = md("Clean Address Examples")) %>%\n opt_row_striping(row_striping = TRUE) %>%\n tab_style(\n style = list(\n cell_text(weight = "bold")\n ),\n locations = cells_column_labels(\n columns = c(f_address, `Type of address`)\n )\n ) %>%\n opt_align_table_header(align = c("left")) %>%\n tab_options(\n container.width = "100%",\n container.height = "400px",\n # column_labels.background.color = palette_urbn_cyan[1],\n table.border.top.width = 0,\n table.border.bottom.width = 0,\n column_labels.border.bottom.width = 0,\n )\n```\n\nAll that being said, our geocoder is pretty tolerant of different\naddress formats, typos/spelling errors, and missing states, zip codes,\netc., so don't spend too much time cleaning every address in the data.\nAlso note that while our geocoder is able to geocode cities and zip\ncodes, it will return the lat/lon of the center of the city/zip code,\nwhich may not be what you want.\n\n### Instructions\n\nTo use the [Urban geocoder](https://tech-tools.urban.org/geocoding/),\nyou will need to:\n\n1) Generate a CSV with a column named `f_address` which contains the\n addresses in single line format (ie\n `123 Abed Avenue, Blanketsburg, CO, 94328`). This means that if you\n have the addresses split across multiple columns (ie `Address`,\n `City`, `State`, `Zip` columns), you will need to concatenate them\n into one column (see the sketch after this list). Also see our\n Address cleaning section above.\n\n2) Go to the Urban geocoder and answer the initial questions. This will\n tell you whether your data is treated as non-confidential or\n confidential, and allow you to upload your CSV for geocoding.\n\n3) Wait for an email telling you your results are ready. If your data\n is non-confidential, this email will contain a link to your geocoded\n results. This link expires in 24 hours, so make sure to download\n your data before then. If your data is confidential, the email will\n contain a link to the location on the Y Drive where your\n confidential geocoded data is stored. You can specify this output\n folder when submitting the CSV in step 1.
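\n\nFor step 1, here is a minimal sketch of concatenating separate columns\ninto `f_address`; the `Address`, `City`, `State`, and `Zip` column names\nare just illustrative:\n\n```{r build-f-address, eval = FALSE}\n# Hypothetical input columns; adjust the names to match your data\naddress_df <- address_df %>%\n tidyr::unite(\n col = "f_address",\n Address, City, State, Zip,\n sep = ", ",\n remove = FALSE\n )\n```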

### Geocoder outputs\n\nThe geocoded file will be your original data, plus a few more columns\n(including latitude and longitude). The table below describes each of\nthe new columns that have been appended to your original data. [It's\nvery important that you take a look at the Addr_type\ncolumn]{style="background-color: #FFFF00; font-weight: bold"} in the\nCSV before doing further analysis to check the accuracy of the geocoding\nprocess.

\n\n+---------------+---------------------------------------------------+\n| Column | Description |\n+:==============+:==================================================+\n| Match_addr | The actual address that the inputted address was |\n| | matched to. This is the address that the geocoder |\n| | used to get Latitudes / Longitudes. If there are |\n| | potentially many typos or non standard address |\n| | formats in your data file, you will want to take |\n| | a close look at this column to confirm that the |\n| | matched address correctly handled typos and badly |\n| | formatted addresses. |\n+---------------+---------------------------------------------------+\n| Longitude | The WGS 84 datum Longitude (EPSG code 4326) |\n+---------------+---------------------------------------------------+\n| Latitude | The WGS 84 datum Latitude (EPSG code 4326) |\n+---------------+---------------------------------------------------+\n| Addr_type | The match level for a geocode request. This |\n| | should be used as an indicator of the precision |\n| | of geocode results. Generally, Subaddress, |\n| | PointAddress, StreetAddress, and StreetInt |\n| | represent accurate matches. The list below |\n| | contains all possible values for this field. |\n| | **Green values represent High accuracy matches, |\n| | yellow represents Medium accuracy matches and red |\n| | represents Low accuracy/inaccurate matches**. If |\n| | you have many yellow and red values in your data, |\n| | you should manually check the results before |\n| | proceeding with analysis. All possible values:\\ |\n| | \\ |\n| | **Subaddress:** A subset of a PointAddress that |\n| | represents a house or building subaddress |\n| | location, such as an apartment unit, floor, or |\n| | individual building within a complex. The |\n| | UnitName, UnitType, LevelName, LevelType, |\n| | BldgName, and BldgType field values help to |\n| | distinguish subaddresses which may be associated |\n| | with the same PointAddress. Reference data |\n| | consists of point features with associated house |\n| | number, street name, and subaddress elements, |\n| | along with administrative divisions and optional |\n| | postal code; for example, 3836 Emerald Ave, Suite |\n| | C, La Verne, CA, 91750.\\ |\n| | \\ |\n| | **PointAddress:** A street address based on |\n| | points that represent house and building |\n| | locations. Typically, this is the most spatially |\n| | accurate match level. Reference data contains |\n| | address points with associated house numbers and |\n| | street names, along with administrative divisions |\n| | and optional postal code. The X / Y |\n| | (`Longitude`/`Latitude`) and `geometry` output |\n| | values for a PointAddress match represent the |\n| | street entry location for the address; this is |\n| | the location used for routing operations. The |\n| | `DisplayX` and `DisplayY` values represent the |\n| | rooftop, or actual, location of the address. |\n| | Example: 380 New York St, Redlands, CA, 92373.\\ |\n| | \\ |\n| | **StreetAddress** --- A street address that |\n| | differs from PointAddress because the house |\n| | number is interpolated from a range of numbers. 
|\n| | Reference data contains street center lines with |\n| | house number ranges, along with administrative |\n| | divisions and optional postal code information, |\n| | for example, 647 Haight St, San Francisco, CA, |\n| | 94117.\\ |\n| | \\ |\n| | **StreetInt:** A street address consisting of a |\n| | street intersection along with city and optional |\n| | state and postal code information. This is |\n| | derived from StreetAddress reference data, for |\n| | example, Redlands Blvd & New York St, Redlands, |\n| | CA, 92373.\\ |\n| | \\ |\n| | **StreetName:** Similar to a street address but |\n| | without the house number. Reference data contains |\n| | street centerlines with associated street names |\n| | (no numbered address ranges), along with |\n| | administrative divisions and optional postal |\n| | code, for example, W Olive Ave, Redlands, CA, |\n| | 92373.\\ |\n| | \\ |\n| | **StreetAddressExt:** An interpolated street |\n| | address match that is returned when parameter |\n| | matchOutOfRange=true and the input house number |\n| | exceeds the house number range for the matched |\n| | street segment.\\ |\n| | \\ |\n| | **DistanceMarker:** A street address that |\n| | represents the linear distance along a street, |\n| | typically in kilometers or miles, from a |\n| | designated origin location. Example: Carr 682 KM |\n| | 4, Barceloneta, 00617.\\ |\n| | \\ |\n| | **PostalExt:** A postal code with an additional |\n| | extension, such as the United States Postal |\n| | Service ZIP+4. Reference data is postal code |\n| | points with extensions, for example, 90210-3841.\\ |\n| | \\ |\n| | **POI:** ---Points of interest. Reference data |\n| | consists of administrative division place-names, |\n| | businesses, landmarks, and geographic features, |\n| | for example, Golden Gate Bridge.\\ |\n| | \\ |\n| | **Locality:** A place-name representing a |\n| | populated place. The Type output field provides |\n| | more detailed information about the type of |\n| | populated place. Possible Type values for |\n| | Locality matches include Block, Sector, |\n| | Neighborhood, District, City, MetroArea, County, |\n| | State or Province, Territory, Country, and Zone. |\n| | Example: Bogotá, COL,\\ |\n| | \\ |\n| | **PostalLoc:** A combination of postal code and |\n| | city name. Reference data is typically a union of |\n| | postal boundaries and administrative (locality) |\n| | boundaries, for example, 7132 Frauenkirchen.\\ |\n| | \\ |\n| | **Postal:** Postal code. Reference data is postal |\n| | code points, for example, 90210 USA. |\n+---------------+---------------------------------------------------+\n| Score | A number from 1--100 indicating the degree to |\n| | which the input tokens in a geocoding request |\n| | match the address components in a candidate |\n| | record. A score of 100 represents a perfect |\n| | match, while lower scores represent decreasing |\n| | match accuracy. |\n+---------------+---------------------------------------------------+\n| Status | Indicates whether a batch geocode request results |\n| | in a match, tie, or unmatched. Possible values |\n| | include\\ |\n| | \\ |\n| | M - Match. The returned address matches the input |\n| | address and is the highest scoring candidate.\\ |\n| | \\ |\n| | T - Tied. The returned address matches the input |\n| | address but has the same score as one or more |\n| | additional candidates.\\ |\n| | \\ |\n| | U - Unmatched. No addresses match the inputted |\n| | address. 
|\n+---------------+---------------------------------------------------+\n| geometry | The WKT (Well-known text) representation of the |\n| | latitudes and longitudes. This column may be |\n| | useful if you're reading the CSV into R, Python, |\n| | or ArcGIS |\n+---------------+---------------------------------------------------+\n| Region | The state that `Match_addr` is located in |\n+---------------+---------------------------------------------------+\n| RegionAbbr | Abbreviated State Name. For example, CA for |\n| | California |\n+---------------+---------------------------------------------------+\n| Subregion | The county that the input address is located in |\n+---------------+---------------------------------------------------+\n| MetroArea | The name of the Metropolitan area that |\n| | `Match_addr` is located in. This field may be |\n| | blank if the input address is not located within |\n| | a metro area. |\n+---------------+---------------------------------------------------+\n| City | The city that `Match_addr` is located in |\n+---------------+---------------------------------------------------+\n| Nbrhd | The Neighborhood that `Match_addr` is located in. |\n| | Note these are ESRI defined neighborhoods which |\n| | may or may not align with other sources |\n| | neighborhood definitions |\n+---------------+---------------------------------------------------+\n\n\\\n\n# Geospatial Modeling\n\nComing soon!\n\n# Bibliography and references\n\n------------------------------------------------------------------------\n\n```{r session-info}\n\nsessionInfo()\n```\n"},"formats":{"html":{"identifier":{"display-name":"HTML","target-format":"html","base-format":"html"},"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"inline-includes":false,"preserve-yaml":false,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[],"notebook-links":true,"format-links":true},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"mapping.html"},"language":{"toc-title-document":"Table of contents","toc-title-website":"On this page","related-formats-title":"Other 
Formats","related-notebooks-title":"Notebooks","source-notebooks-prefix":"Source","section-title-abstract":"Abstract","section-title-appendices":"Appendices","section-title-footnotes":"Footnotes","section-title-references":"References","section-title-reuse":"Reuse","section-title-copyright":"Copyright","section-title-citation":"Citation","appendix-attribution-cite-as":"For attribution, please cite this work as:","appendix-attribution-bibtex":"BibTeX citation:","title-block-author-single":"Author","title-block-author-plural":"Authors","title-block-affiliation-single":"Affiliation","title-block-affiliation-plural":"Affiliations","title-block-published":"Published","title-block-modified":"Modified","callout-tip-title":"Tip","callout-note-title":"Note","callout-warning-title":"Warning","callout-important-title":"Important","callout-caution-title":"Caution","code-summary":"Code","code-tools-menu-caption":"Code","code-tools-show-all-code":"Show All Code","code-tools-hide-all-code":"Hide All Code","code-tools-view-source":"View Source","code-tools-source-code":"Source Code","code-line":"Line","code-lines":"Lines","copy-button-tooltip":"Copy to Clipboard","copy-button-tooltip-success":"Copied!","repo-action-links-edit":"Edit this page","repo-action-links-source":"View source","repo-action-links-issue":"Report an issue","back-to-top":"Back to top","search-no-results-text":"No results","search-matching-documents-text":"matching documents","search-copy-link-title":"Copy link to search","search-hide-matches-text":"Hide additional matches","search-more-match-text":"more match in this document","search-more-matches-text":"more matches in this document","search-clear-button-title":"Clear","search-detached-cancel-button-title":"Cancel","search-submit-button-title":"Submit","search-label":"Search","toggle-section":"Toggle section","toggle-sidebar":"Toggle sidebar navigation","toggle-dark-mode":"Toggle dark mode","toggle-reader-mode":"Toggle reader mode","toggle-navigation":"Toggle navigation","crossref-fig-title":"Figure","crossref-tbl-title":"Table","crossref-lst-title":"Listing","crossref-thm-title":"Theorem","crossref-lem-title":"Lemma","crossref-cor-title":"Corollary","crossref-prp-title":"Proposition","crossref-cnj-title":"Conjecture","crossref-def-title":"Definition","crossref-exm-title":"Example","crossref-exr-title":"Exercise","crossref-ch-prefix":"Chapter","crossref-apx-prefix":"Appendix","crossref-sec-prefix":"Section","crossref-eq-prefix":"Equation","crossref-lof-title":"List of Figures","crossref-lot-title":"List of Tables","crossref-lol-title":"List of Listings","environment-proof-title":"Proof","environment-remark-title":"Remark","environment-solution-title":"Solution","listing-page-order-by":"Order By","listing-page-order-by-default":"Default","listing-page-order-by-date-asc":"Oldest","listing-page-order-by-date-desc":"Newest","listing-page-order-by-number-desc":"High to Low","listing-page-order-by-number-asc":"Low to High","listing-page-field-date":"Date","listing-page-field-title":"Title","listing-page-field-description":"Description","listing-page-field-author":"Author","listing-page-field-filename":"File Name","listing-page-field-filemodified":"Modified","listing-page-field-subtitle":"Subtitle","listing-page-field-readingtime":"Reading Time","listing-page-field-categories":"Categories","listing-page-minutes-compact":"{0} min","listing-page-category-all":"All","listing-page-no-matches":"No matching 
items"},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.3.433","editor_options":{"markdown":{"wrap":72}}},"extensions":{"book":{"multiFile":true}}}},"projectFormats":["html"]} \ No newline at end of file diff --git a/.quarto/idx/optimization.qmd.json b/.quarto/idx/optimization.qmd.json index baf3ce4..9712261 100644 --- a/.quarto/idx/optimization.qmd.json +++ b/.quarto/idx/optimization.qmd.json @@ -1 +1 @@ -{"title":"Introduction","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}}},"headingText":"Introduction","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n```{r markdown-setup, include=FALSE}\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\nknitr::opts_chunk$set(warning = FALSE)\n\noptions(scipen = 999)\n```\n\n\nThis guide outlines tools and tips for improving the speed and execution of R code.\n\nSometimes, simply tweaking a few lines of code can lead to large performance gains in the execution of a program. Other issues may take more time to work through but can be a huge benefit to a project in the long term.\n\nAn important lesson to learn when it comes to optimising an R (or any) program is knowing both if to start and when to stop. You most likely want to optimize your code because it is \"too slow\", but what that means will vary from project to project. Be sure to consider what \"fast enough\" is for your project and how much needs to be optimized. If your program takes an hour to complete, spending 5 hours trying to make it faster can be time well spent if the script will be run regularly, and a complete waste of time if it's an ad-hoc analysis.\n\nFor more information, see the CRAN Task View [High-Performance and Parallel Computing with R](https://CRAN.R-project.org/view=HighPerformanceComputing).\n\nThe \"Performant Code\" section of Hadley Wickham's [Advanced R](http://adv-r.had.co.nz/) is another great resource and provides a deeper dive into what is covered in this guide.\n\n------------------------------------------------------------------------\n\n# Update Your Installation\n\nOne of the easiest ways to improve the performance of R is to update R. In general, R will have a big annual release (i.e., 3.5.0) in the spring and around 3-4 smaller patch releases (i.e., 3.5.1) throughout the rest of the year. If the middle digit of your installation is behind the current release, you should consider updating.\n\nFor instance, R 3.5.0 implemented an improved read from text files. A 5GB file took over 5 minutes to read in 3.4.4:\n\n![](optimization/images/data-load-3-4.png){width=\"75%\"}\n\nWhile 3.5.0 took less than half the time:\n\n![](optimization/images/data-load-3-5.png){width=\"75%\"}\n\nTo see what the R-core development team is up to, check out the [NEWS](https://cran.r-project.org/doc/manuals/r-devel/NEWS.html) file from the R project.\n\n------------------------------------------------------------------------\n\n# Profiling & Benchmarking\n\nIn order to efficiently optimize your code, you'll first need to know where it's running slowest. 
The `profvis` package provides a nice way of visualizing the execution time and memory usage of your program.\n\n```{r profile-01}\nlibrary(profvis)\nlibrary(dplyr)\n\nprofvis({\n\tdiamonds <- read.csv("optimization/data/diamonds.csv")\n\n\tdiamonds_by_cut <- diamonds %>%\n\t\tgroup_by(cut) %>%\n\t\tsummarise_if(is.numeric, mean)\n\n\twrite.csv(diamonds_by_cut, file = "optimization/data/diamonds_by_cut.csv")\n\n})\n```\n\nIn this toy example it looks like the `read.csv` function is the bottleneck, so work on optimizing that first.\n\nOnce you find the bottleneck that needs to be optimized, it can be useful to benchmark different potential solutions. The `microbenchmark` package can help you choose between different options. Continuing with the simple example with the `diamonds` dataset, compare the base `read.csv` function with `read_csv` from the `readr` package.\n\n```{r benchmark-01}\nlibrary(microbenchmark)\n\nmicrobenchmark(\n  read.csv("optimization/data/diamonds.csv"),\n  readr::read_csv("optimization/data/diamonds.csv")\n)\n```\n\nIn this case, `read_csv` is about twice as fast as the base R implementation.\n\n# Parallel Computing\n\nOften, time-intensive R code can be sped up by breaking the execution of the job across additional cores of your computer. This is called parallel computing.\n\n## Learn `lapply`/`purrr::map`\n\nLearning the `lapply` (and variants) function from Base R or the `map` (and variants) function from the `purrr` package is the first step in learning to run R code in parallel. Once you understand how `lapply` and `map` work, running your code in parallel will be simple.\n\nSay you have a vector of numbers and want to find the square root of each one (ignore for now that `sqrt` is vectorized, which will be covered later). You could write a for loop and iterate over each element of the vector:\n\n```{r apply-01}\nx <- c(1, 4, 9, 16)\n\nout <- vector("list", length(x))\n\nfor (i in seq_along(x)) {\n  out[[i]] <- sqrt(x[[i]])\n}\n\nunlist(out)\n```\n\nThe `lapply` function essentially handles the overhead of constructing a for loop for you. The syntax is:\n\n```{r apply-02, eval = FALSE}\nlapply(X, FUN, ...)\n```\n\n`lapply` will then take each element of `X` and apply the `FUN`ction to it. Our simple example then becomes:\n\n```{r apply-03}\nx <- c(1, 4, 9, 16)\n\nout <- lapply(x, sqrt)\n\nunlist(out)\n```\n\nThose working within the `tidyverse` may use `map` from the `purrr` package equivalently:\n\n```{r apply-04}\nlibrary(purrr)\n\nx <- c(1, 4, 9, 16)\n\nout <- map(x, sqrt)\n\nunlist(out)\n```
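\n\nAs an aside, `purrr` also has type-stable variants like `map_dbl()` that return an atomic vector directly, which lets you skip the `unlist()` step:\n\n```{r apply-05}\n# map_dbl returns a double vector instead of a list\nmap_dbl(x, sqrt)\n```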
\n\n## Motivating Example\n\nOnce you are comfortable with `lapply` and/or `map`, running the same code in parallel takes just an additional line of code.\n\nFor `lapply` users, the `future.apply` package contains an equivalent `future_lapply` function. Just be sure to call `plan(multisession)` beforehand, which will handle the back-end orchestration needed to run in parallel.\n\n```{r parallel-01}\n# install.packages("future.apply")\nlibrary(future.apply)\n\nplan(multisession)\n\nout <- future_lapply(x, sqrt)\n\nunlist(out)\n```\n\nFor `purrr` users, the `furrr` (i.e., future purrr) package includes an equivalent `future_map` function:\n\n```{r parallel-02}\n# install.packages("furrr")\nlibrary(furrr)\n\nplan(multisession)\n\ny <- future_map(x, sqrt)\n\nunlist(y)\n```\n\nHow much faster did this simple example run in parallel?\n\n```{r parallel-03}\nlibrary(future.apply)\n\nplan(multisession)\n\nx <- c(1, 4, 9, 16)\n\nmicrobenchmark::microbenchmark(\n  sequential = lapply(x, sqrt),\n  parallel = future_lapply(x, sqrt),\n  unit = "s"\n)\n```\n\nParallelization was actually slower. In this case, the overhead of setting the code to run in parallel far outweighed any performance gain. In general, parallelization works well on long-running & compute-intensive jobs.\n\n## A (somewhat) More Complex Example\n\nIn this example we'll use the `diamonds` dataset from `ggplot2` and perform k-means clustering. We'll use `lapply` to iterate the number of clusters from 2 to 5:\n\n```{r kmeans-01}\ndf <- ggplot2::diamonds\n\ndf <- dplyr::select(df, -c(cut, color, clarity))\n\ncenters <- 2:5\n\nsystem.time(\n  lapply(\n    centers,\n    function(x) kmeans(df, centers = x, nstart = 500)\n  )\n)\n```\n\nAnd now running the same code in parallel:\n\n```{r kmeans-02}\nlibrary(future.apply)\n\nplan(multisession)\n\nsystem.time(\n  future_lapply(\n    centers,\n    function(x) kmeans(df, centers = x, nstart = 500)\n  )\n)\n```\n\nWhile we didn't achieve perfect scaling, we still get a nice bump in execution time.\n\n## Additional Packages\n\nFor the sake of ease and brevity, this guide focused on the `future` framework for parallelization. However, you should be aware that there are a number of other ways to parallelize your code.\n\n### The `parallel` Package\n\nThe `parallel` package is included in your base R installation. It includes analogues of the various `apply` functions:\n\n- `parLapply`\n- `mclapply` - not available on Windows\n\nThese functions generally require more setup, especially on Windows machines.\n\n### The `doParallel` Package\n\nThe `doParallel` package builds off of `parallel` and is useful for code that uses for loops instead of `lapply`. Like the `parallel` package, it generally requires more setup, especially on Windows machines.
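\n\nAs a quick illustration of the for-loop flavor, here is a minimal sketch of the standard `doParallel` + `foreach` pattern (this example is ours, not from the package documentation):\n\n```{r doparallel-01, eval = FALSE}\nlibrary(doParallel) # also attaches foreach\n\ncl <- makeCluster(2) # number of cores to use\nregisterDoParallel(cl)\n\n# %dopar% runs each iteration on one of the cluster workers\nout <- foreach(i = c(1, 4, 9, 16), .combine = c) %dopar% {\n  sqrt(i)\n}\n\nstopCluster(cl)\n```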
## A (somewhat) More Complex Example\n\nIn this example we'll use the `diamonds` dataset from `ggplot2` and perform\n\nk-means clustering. We'll use `lapply` to vary the number of clusters from 2 to\n\n5:\n\n```{r kmeans-01}\n\ndf <- ggplot2::diamonds\n\ndf <- dplyr::select(df, -c(cut, color, clarity))\n\ncenters <- 2:5\n\nsystem.time(\n\n lapply(centers,\n\n function(x) kmeans(df, centers = x, nstart = 500)\n\n )\n\n )\n\n```\n\nAnd now running the same code in parallel:\n\n```{r kmeans-02}\n\nlibrary(future.apply)\n\nplan(multisession)\n\nsystem.time(\n\n future_lapply(centers,\n\n function(x) kmeans(df, centers = x, nstart = 500)\n\n )\n\n )\n\n```\n\nWhile we didn't achieve perfect scaling, we still get a nice bump in execution\n\ntime.\n\n## Additional Packages\n\nFor the sake of ease and brevity, this guide focused on the `future` framework\n\nfor parallelization. However, you should be aware that there are a number of\n\nother ways to parallelize your code.\n\n### The `parallel` Package\n\nThe `parallel` package is included in your base R installation. It includes\n\nanalogues of the various `apply` functions:\n\n- `parLapply`\n\n- `mclapply` - not available on Windows\n\nThese functions generally require more setup, especially on Windows machines.\n\n### The `doParallel` Package\n\nThe `doParallel` package builds off of `parallel` and is\n\nuseful for code that uses for loops instead of `lapply`. Like the `parallel`\n\npackage, it generally requires more setup, especially on Windows machines.\n\n### Machine Learning - `caret`\n\nFor those running machine learning models, the `caret` package can easily\n\nleverage `doParallel` to speed up the execution of multiple models. Lifting\n\nthe example from the package documentation:\n\n```{r caret-01, eval = FALSE}\n\nlibrary(doParallel)\n\ncl <- makePSOCKcluster(5) # number of cores to use\n\nregisterDoParallel(cl)\n\n## All subsequent models are then run in parallel\n\nmodel <- train(y ~ ., data = training, method = \"rf\")\n\n## When you are done:\n\nstopCluster(cl)\n\n```\n\nBe sure to check out the full\n\n[documentation](http://topepo.github.io/caret/parallel-processing.html)\n\nfor more detail.\n\n------------------------------------------------------------------------\n\n# Big Data\n\nAs data collection and storage become easier and cheaper, it is relatively\n\nsimple to obtain large data files. An important point to keep in\n\nmind is that the size of your data will generally expand when it is read\n\nfrom a storage device into R. A general rule of thumb is that a file will take\n\nsomewhere around 3-4 times more space in memory than it does on disk.\n\nFor instance, compare the size of the `iris` data set when it is saved as a\n\n.csv file locally vs. the size of the object when it is read into an R session:\n\n```{r size-01, message = FALSE}\n\nfile.size(\"optimization/data/iris.csv\") / 1000\n\ndf <- readr::read_csv(\"optimization/data/iris.csv\")\n\npryr::object_size(df)\n\n```\n\nThis means that on a standard Urban Institute desktop, you may have issues\n\nreading in files that are larger than 4 GB.\n\n## Object Size\n\nThe type of your data can have a big impact on the size of your data frame\n\nwhen you are dealing with larger files. There are four main types of atomic\n\nvectors in R:\n\n1. `logical`\n\n2. `integer`\n\n3. `double` (also called `numeric`)\n\n4. `character`\n\nEach of these data types occupies a different amount of space in memory.\n\n`logical` and `integer` vectors use 4 bytes per element, while a `double` will\n\noccupy 8 bytes. R uses a global string pool, so `character` vectors are hard\n\nto estimate, but will generally take up more space per element.\n\nConsider the following example:\n\n```{r size-02}\n\nx <- 1:100\n\npryr::object_size(x)\n\npryr::object_size(as.double(x))\n\npryr::object_size(as.character(x))\n\n```\n\nAn incorrect data type can easily cost you a lot of space in memory, especially\n\nat scale. This often happens when reading data from a text or csv file - data\n\nmay have a format such as `c(1.0, 2.0, 3.0)` and will be read in as a `numeric`\n\ncolumn, when `integer` is more appropriate and compact.\n\n
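If you know the appropriate type in advance, you can request it when the file is read rather than converting afterwards. A minimal sketch with `readr` (the chunk label `size-05` and the choice of the `price` column are ours; `price` in the diamonds data is a whole-dollar amount, so an integer is safe):\n\n```{r size-05, eval = FALSE}\n\nlibrary(readr)\n\n# ask for an integer column instead of the default double guess\n\ndiamonds <- read_csv(\n\n \"optimization/data/diamonds.csv\",\n\n col_types = cols(price = col_integer())\n\n)\n\n```\n\n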
You may also be familiar with `factor` variables within R. Essentially a\n\n`factor` will represent your data as integers, and map them back to their\n\ncharacter representation. This can save memory when a long vector contains\n\nonly a small number of unique levels:\n\n```{r size-03}\n\nx <- sample(letters, 10000, replace = TRUE)\n\npryr::object_size(as.character(x))\n\npryr::object_size(as.factor(x))\n\n```\n\nHowever, if each element is unique, or if there is not much overlap among\n\nelements, then the overhead will make a factor larger than its character\n\nrepresentation:\n\n```{r size-04}\n\npryr::object_size(as.factor(letters))\n\npryr::object_size(as.character(letters))\n\n```\n\n## Cloud Computing\n\nSometimes, you will have data that are simply too large to ever fit on your\n\nlocal desktop machine. If that is the case, then the Elastic Cloud Computing\n\nEnvironment from the Office of Technology and Data Science can provide you with\n\neasy access to powerful analytic tools for computationally intensive projects.\n\nThe Elastic Cloud Computing Environment allows researchers to quickly spin up\n\nan Amazon Web Services (AWS) Elastic Cloud Compute (EC2) instance. These\n\ninstances offer increased memory to read in large datasets, along with\n\nadditional CPUs to provide the ability to process data in parallel at an\n\nimpressive scale.\n\n| Instance | CPU | Memory (GB) |\n|--------------|-----|-------------|\n| Desktop | 8 | 16 |\n| c5.4xlarge | 16 | 32 |\n| c5.9xlarge | 36 | 72 |\n| c5.18xlarge | 72 | 144 |\n| x1e.8xlarge | 32 | 976 |\n| x1e.16xlarge | 64 | 1952 |\n\nFeel free to contact Erika Tyagi (etyagi\\@urban.org) if this would be useful\n\nfor your project.\n\n------------------------------------------------------------------------\n\n# Common Pitfalls\n\n## For Loops and Vector Allocation\n\nA refrain you will often hear is that for loops in R are slow and need to be\n\navoided at all costs. This is not true! Rather, an improperly constructed loop\n\nin R can bring the execution of your program to a near standstill.\n\nA common for loop structure may look something like:\n\n```{r loop-01, eval = FALSE}\n\nx <- 1:100\n\nout <- c()\n\nfor (i in x) {\n\n out <- c(out, sqrt(i))\n\n}\n\n```\n\nThe bottleneck in this loop is the allocation of the vector `out`. Every\n\ntime we iterate over an item in `x` and append it to `out`, R makes a copy\n\nof all the items already in `out`. As the size of the loop grows, your code\n\nwill take longer and longer to run.\n\nA better practice is to pre-allocate `out` to be the correct length, and then\n\ninsert the results as the loop runs.\n\n```{r loop-03, eval = FALSE}\n\nx <- 1:100\n\nout <- rep(NA, length(x))\n\nfor (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n}\n\n```\n\nA quick benchmark shows how much more efficient a loop with a pre-allocated\n\nresults vector is:\n\n```{r loop-04}\n\nbad_loop <- function(x) {\n\n out <- c()\n\n for (i in x) {\n\n out <- c(out, sqrt(i))\n\n }\n\n}\n\ngood_loop <- function(x) {\n\n out <- rep(NA, length(x))\n\n for (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n }\n\n}\n\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(x),\n\n good_loop(x)\n\n)\n\n```\n\nAnd note how performance of the \"bad\" loop degrades as the loop size grows.\n\n```{r loop-05}\n\ny <- 1:250\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(y),\n\n good_loop(y)\n\n)\n\n```\n\n## Vectorized Functions\n\nMany functions in R are vectorized, meaning they can accept an entire vector\n\n(and not just a single value) as input. The `sqrt` function from the\n\nprior examples is one:\n\n```{r vectorised-01}\n\nx <- c(1, 4, 9, 16)\n\nsqrt(x)\n\n```\n\nThis removes the need to use `lapply` or a for loop. 
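If a task isn't vectorized but you still want the allocation handled for you, `vapply` is a type-stable cousin of `lapply`: you declare the shape of each result and R pre-allocates the output. A small sketch (the chunk label `apply-05` is ours):\n\n```{r apply-05}\n\nx <- c(1, 4, 9, 16)\n\n# FUN.VALUE = numeric(1) promises each result is a single double\n\nvapply(x, sqrt, FUN.VALUE = numeric(1))\n\n```\n\n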
Vectorized functions in\n\nR are generally written in a compiled language like C, C++, or FORTRAN, which\n\nmakes their implementation faster.\n\n```{r vectorised-02}\n\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n lapply(x, sqrt),\n\n sqrt(x)\n\n)\n\n```\n"},"formats":{"html":{"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[]},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"optimization.html"},"language":{},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.2.269"},"extensions":{"book":{"multiFile":true}}}}} \ No newline at end of file +{"title":"Introduction","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}}},"headingText":"Introduction","containsRefs":false,"markdown":"\n\n\n\n::: {#header}\n\n:::\n\n```{r markdown-setup, include=FALSE}\nknitr::opts_chunk$set(message = FALSE)\nknitr::opts_chunk$set(echo = TRUE)\nknitr::opts_chunk$set(warning = FALSE)\n\noptions(scipen = 999)\n```\n\n\nThis guide outlines tools and tips for improving the speed and execution of R code.\n\nSometimes, simply tweaking a few lines of code can lead to large performance gains in the execution of a program. Other issues may take more time to work through but can be a huge benefit to a project in the long term.\n\nAn important lesson to learn when it comes to optimizing an R (or any) program is knowing both whether to start and when to stop. You most likely want to optimize your code because it is \"too slow\", but what that means will vary from project to project. Be sure to consider what \"fast enough\" is for your project and how much needs to be optimized. 
If your program takes an hour to complete, spending 5 hours trying to make it faster can be time well spent if the script will be run regularly, and a complete waste of time if it's an ad-hoc analysis.\n\nFor more information, see the CRAN Task View [High-Performance and Parallel Computing with R](https://CRAN.R-project.org/view=HighPerformanceComputing).\n\nThe \"Performant Code\" section of Hadley Wickham's [Advanced R](http://adv-r.had.co.nz/) is another great resource and provides a deeper dive into what is covered in this guide.\n\n------------------------------------------------------------------------\n\n# Update Your Installation\n\nOne of the easiest ways to improve the performance of R is to update R. In general, R will have a big annual release (e.g., 3.5.0) in the spring and around 3-4 smaller patch releases (e.g., 3.5.1) throughout the rest of the year. If the middle digit of your installation is behind the current release, you should consider updating.\n\nFor instance, R 3.5.0 implemented an improved method for reading text files. A 5GB file took over 5 minutes to read in 3.4.4:\n\n![](optimization/images/data-load-3-4.png){width=\"75%\"}\n\nWhile 3.5.0 took less than half the time:\n\n![](optimization/images/data-load-3-5.png){width=\"75%\"}\n\nTo see what the R-core development team is up to, check out the [NEWS](https://cran.r-project.org/doc/manuals/r-devel/NEWS.html) file from the R project.\n\n------------------------------------------------------------------------\n\n# Profiling & Benchmarking\n\nIn order to efficiently optimize your code, you'll first need to know where it's running slowest. The `profvis` package provides a nice way of visualizing the execution time and memory usage of your program.\n\n```{r profile-01}\nlibrary(profvis)\nlibrary(dplyr)\n\nprofvis({\n\tdiamonds <- read.csv(\"optimization/data/diamonds.csv\")\n\n\tdiamonds_by_cut <- diamonds %>%\n\t\tgroup_by(cut) %>%\n\t\tsummarise_if(is.numeric, mean)\n\n\twrite.csv(diamonds_by_cut, file = \"optimization/data/diamonds_by_cut.csv\")\n\n})\n\n```\n\nIn this toy example it looks like the `read.csv` function is the bottleneck, so\n\nwork on optimizing that first.\n\nOnce you find the bottleneck that needs to be optimized, it can be useful to\n\nbenchmark different potential solutions. The `microbenchmark` package can help\n\nyou choose between different options. Continuing with the simple example using\n\nthe `diamonds` dataset, compare the base `read.csv` function with `read_csv`\n\nfrom the `readr` package.\n\n```{r benchmark-01}\n\nlibrary(microbenchmark)\n\nmicrobenchmark(\n\n read.csv(\"optimization/data/diamonds.csv\"),\n\n readr::read_csv(\"optimization/data/diamonds.csv\")\n\n)\n\n```\n\nIn this case, `read_csv` is about twice as fast as the base R implementation.\n\n# Parallel Computing\n\nOften, time-intensive R code can be sped up by breaking the execution of\n\nthe job across additional cores of your computer. This is called parallel computing.\n\n## Learn `lapply`/`purrr::map`\n\nLearning the `lapply` (and variants) function from Base R or the `map` (and variants) function from the `purrr` package is the first step in learning to run R code in parallel. 
Once you understand how `lapply` and `map` work, running your code in parallel will be simple.\n\nSay you have a vector of numbers and want to find the square root of each one\n\n(ignore for now that `sqrt` is vectorized, which will be covered later).\n\nYou could write a for loop and iterate over each element of the vector:\n\n```{r apply-01}\n\nx <- c(1, 4, 9, 16)\n\nout <- vector(\"list\", length(x))\n\nfor (i in seq_along(x)) {\n\n out[[i]] <- sqrt(x[[i]])\n\n}\n\nunlist(out)\n\n```\n\nThe `lapply` function essentially handles the overhead of constructing a for\n\nloop for you. The syntax is:\n\n```{r apply-02, eval = FALSE}\n\nlapply(X, FUN, ...)\n\n```\n\n`lapply` will then take each element of `X` and apply the `FUN`ction to it.\n\nOur simple example then becomes:\n\n```{r apply-03}\n\nx <- c(1, 4, 9, 16)\n\nout <- lapply(x, sqrt)\n\nunlist(out)\n\n```\n\nThose working within the `tidyverse` may use `map` from the `purrr` package equivalently:\n\n```{r apply-04}\n\nlibrary(purrr)\n\nx <- c(1, 4, 9, 16)\n\nout <- map(x, sqrt)\n\nunlist(out)\n\n```\n\n## Motivating Example\n\nOnce you are comfortable with `lapply` and/or `map`, running the same code in\n\nparallel takes just an additional line of code.\n\nFor `lapply` users, the `future.apply` package contains an equivalent\n\n`future_lapply` function. Just be sure to call `plan(multisession)` beforehand,\n\nwhich will handle the back-end orchestration needed to run in parallel.\n\n```{r parallel-01}\n\n# install.packages(\"future.apply\")\n\nlibrary(future.apply)\n\nplan(multisession)\n\nout <- future_lapply(x, sqrt)\n\nunlist(out)\n```\n\nFor `purrr` users, the `furrr` (i.e., future purrr) package includes an\n\nequivalent `future_map` function:\n\n```{r parallel-02}\n\n# install.packages(\"furrr\")\n\nlibrary(furrr)\n\nplan(multisession)\n\ny <- future_map(x, sqrt)\n\nunlist(y)\n\n```\n\nHow much faster did this simple example run in parallel?\n\n```{r parallel-03}\n\nlibrary(future.apply)\n\nplan(multisession)\n\nx <- c(1, 4, 9, 16)\n\nmicrobenchmark::microbenchmark(\n\n sequential = lapply(x, sqrt),\n\n parallel = future_lapply(x, sqrt),\n\n unit = \"s\"\n\n)\n\n```\n\nParallelization was actually slower. In this case, the overhead of\n\nsetting the code to run in parallel far outweighed any performance gain. In\n\ngeneral, parallelization works well on long-running, compute-intensive jobs.\n\n
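A related convenience: like `purrr`, the `furrr` package has typed variants that return an atomic vector directly, which removes the `unlist` step. A minimal sketch (the chunk label `parallel-04` is ours, not part of the original examples):\n\n```{r parallel-04, eval = FALSE}\n\nlibrary(furrr)\n\nplan(multisession)\n\nx <- c(1, 4, 9, 16)\n\n# future_map_dbl returns a double vector instead of a list\n\nfuture_map_dbl(x, sqrt)\n\n```\n\n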
## A (somewhat) More Complex Example\n\nIn this example we'll use the `diamonds` dataset from `ggplot2` and perform\n\nk-means clustering. We'll use `lapply` to vary the number of clusters from 2 to\n\n5:\n\n```{r kmeans-01}\n\ndf <- ggplot2::diamonds\n\ndf <- dplyr::select(df, -c(cut, color, clarity))\n\ncenters <- 2:5\n\nsystem.time(\n\n lapply(centers,\n\n function(x) kmeans(df, centers = x, nstart = 500)\n\n )\n\n )\n\n```\n\nAnd now running the same code in parallel:\n\n```{r kmeans-02}\n\nlibrary(future.apply)\n\nplan(multisession)\n\nsystem.time(\n\n future_lapply(centers,\n\n function(x) kmeans(df, centers = x, nstart = 500)\n\n )\n\n )\n\n```\n\nWhile we didn't achieve perfect scaling, we still get a nice bump in execution\n\ntime.\n\n## Additional Packages\n\nFor the sake of ease and brevity, this guide focused on the `future` framework\n\nfor parallelization. However, you should be aware that there are a number of\n\nother ways to parallelize your code.\n\n### The `parallel` Package\n\nThe `parallel` package is included in your base R installation. It includes\n\nanalogues of the various `apply` functions:\n\n- `parLapply`\n\n- `mclapply` - not available on Windows\n\nThese functions generally require more setup, especially on Windows machines.\n\n### The `doParallel` Package\n\nThe `doParallel` package builds off of `parallel` and is\n\nuseful for code that uses for loops instead of `lapply`. Like the `parallel`\n\npackage, it generally requires more setup, especially on Windows machines.\n\n### Machine Learning - `caret`\n\nFor those running machine learning models, the `caret` package can easily\n\nleverage `doParallel` to speed up the execution of multiple models. Lifting\n\nthe example from the package documentation:\n\n```{r caret-01, eval = FALSE}\n\nlibrary(doParallel)\n\ncl <- makePSOCKcluster(5) # number of cores to use\n\nregisterDoParallel(cl)\n\n## All subsequent models are then run in parallel\n\nmodel <- train(y ~ ., data = training, method = \"rf\")\n\n## When you are done:\n\nstopCluster(cl)\n\n```\n\nBe sure to check out the full\n\n[documentation](http://topepo.github.io/caret/parallel-processing.html)\n\nfor more detail.\n\n------------------------------------------------------------------------\n\n# Big Data\n\nAs data collection and storage become easier and cheaper, it is relatively\n\nsimple to obtain large data files. An important point to keep in\n\nmind is that the size of your data will generally expand when it is read\n\nfrom a storage device into R. A general rule of thumb is that a file will take\n\nsomewhere around 3-4 times more space in memory than it does on disk.\n\nFor instance, compare the size of the `iris` data set when it is saved as a\n\n.csv file locally vs. the size of the object when it is read into an R session:\n\n```{r size-01, message = FALSE}\n\nfile.size(\"optimization/data/iris.csv\") / 1000\n\ndf <- readr::read_csv(\"optimization/data/iris.csv\")\n\npryr::object_size(df)\n\n```\n\nThis means that on a standard Urban Institute desktop, you may have issues\n\nreading in files that are larger than 4 GB.\n\n
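For files of this size, the `fread` function from the `data.table` package is another widely used fast reader. A minimal sketch (hedged: `fread` returns a `data.table`, so code that expects a plain data frame may need a conversion; the chunk label `size-05` is ours):\n\n```{r size-05, eval = FALSE}\n\nlibrary(data.table)\n\n# fread is a fast, multi-threaded file reader\n\ndiamonds_dt <- fread(\"optimization/data/diamonds.csv\")\n\n# convert if the rest of the pipeline expects a data.frame\n\ndiamonds_df <- as.data.frame(diamonds_dt)\n\n```\n\n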
## Object Size\n\nThe type of your data can have a big impact on the size of your data frame\n\nwhen you are dealing with larger files. There are four main types of atomic\n\nvectors in R:\n\n1. `logical`\n\n2. `integer`\n\n3. `double` (also called `numeric`)\n\n4. `character`\n\nEach of these data types occupies a different amount of space in memory.\n\n`logical` and `integer` vectors use 4 bytes per element, while a `double` will\n\noccupy 8 bytes. R uses a global string pool, so `character` vectors are hard\n\nto estimate, but will generally take up more space per element.\n\nConsider the following example:\n\n```{r size-02}\n\nx <- 1:100\n\npryr::object_size(x)\n\npryr::object_size(as.double(x))\n\npryr::object_size(as.character(x))\n\n```\n\nAn incorrect data type can easily cost you a lot of space in memory, especially\n\nat scale. This often happens when reading data from a text or csv file - data\n\nmay have a format such as `c(1.0, 2.0, 3.0)` and will be read in as a `numeric`\n\ncolumn, when `integer` is more appropriate and compact.\n\nYou may also be familiar with `factor` variables within R. Essentially a\n\n`factor` will represent your data as integers, and map them back to their\n\ncharacter representation. This can save memory when a long vector contains\n\nonly a small number of unique levels:\n\n```{r size-03}\n\nx <- sample(letters, 10000, replace = TRUE)\n\npryr::object_size(as.character(x))\n\npryr::object_size(as.factor(x))\n\n```\n\nHowever, if each element is unique, or if there is not much overlap among\n\nelements, then the overhead will make a factor larger than its character\n\nrepresentation:\n\n```{r size-04}\n\npryr::object_size(as.factor(letters))\n\npryr::object_size(as.character(letters))\n\n```\n\n## Cloud Computing\n\nSometimes, you will have data that are simply too large to ever fit on your\n\nlocal desktop machine. If that is the case, then the Elastic Cloud Computing\n\nEnvironment from the Office of Technology and Data Science can provide you with\n\neasy access to powerful analytic tools for computationally intensive projects.\n\nThe Elastic Cloud Computing Environment allows researchers to quickly spin up\n\nan Amazon Web Services (AWS) Elastic Cloud Compute (EC2) instance. These\n\ninstances offer increased memory to read in large datasets, along with\n\nadditional CPUs to provide the ability to process data in parallel at an\n\nimpressive scale.\n\n| Instance | CPU | Memory (GB) |\n|--------------|-----|-------------|\n| Desktop | 8 | 16 |\n| c5.4xlarge | 16 | 32 |\n| c5.9xlarge | 36 | 72 |\n| c5.18xlarge | 72 | 144 |\n| x1e.8xlarge | 32 | 976 |\n| x1e.16xlarge | 64 | 1952 |\n\nFeel free to contact Erika Tyagi (etyagi\\@urban.org) if this would be useful\n\nfor your project.\n\n------------------------------------------------------------------------\n\n# Common Pitfalls\n\n## For Loops and Vector Allocation\n\nA refrain you will often hear is that for loops in R are slow and need to be\n\navoided at all costs. This is not true! Rather, an improperly constructed loop\n\nin R can bring the execution of your program to a near standstill.\n\nA common for loop structure may look something like:\n\n```{r loop-01, eval = FALSE}\n\nx <- 1:100\n\nout <- c()\n\nfor (i in x) {\n\n out <- c(out, sqrt(i))\n\n}\n\n```\n\nThe bottleneck in this loop is the allocation of the vector `out`. Every\n\ntime we iterate over an item in `x` and append it to `out`, R makes a copy\n\nof all the items already in `out`. As the size of the loop grows, your code\n\nwill take longer and longer to run.\n\nA better practice is to pre-allocate `out` to be the correct length, and then\n\ninsert the results as the loop runs.\n\n```{r loop-03, eval = FALSE}\n\nx <- 1:100\n\nout <- rep(NA, length(x))\n\nfor (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n}\n\n```\n\nA quick benchmark shows how much more efficient a loop with a pre-allocated\n\nresults vector is:\n\n```{r loop-04}\n\nbad_loop <- function(x) {\n\n out <- c()\n\n for (i in x) {\n\n out <- c(out, sqrt(i))\n\n }\n\n}\n\ngood_loop <- function(x) {\n\n out <- rep(NA, length(x))\n\n for (i in seq_along(x)) {\n\n out[i] <- sqrt(x[i])\n\n }\n\n}\n\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(x),\n\n good_loop(x)\n\n)\n\n```\n\nAnd note how performance of the \"bad\" loop degrades as the loop size grows.\n\n```{r loop-05}\n\ny <- 1:250\n\nmicrobenchmark::microbenchmark(\n\n bad_loop(y),\n\n good_loop(y)\n\n)\n\n```\n\n## Vectorized Functions\n\nMany functions in R are vectorized, meaning they can accept an entire vector\n\n(and not just a single value) as input. 
The `sqrt` function from the\n\nprior examples is one:\n\n```{r vectorised-01}\n\nx <- c(1, 4, 9, 16)\n\nsqrt(x)\n\n```\n\nThis removes the need to use `lapply` or a for loop. Vectorized functions in\n\nR are generally written in a compiled language like C, C++, or FORTRAN, which\n\nmakes their implementation faster.\n\n```{r vectorised-02}\n\nx <- 1:100\n\nmicrobenchmark::microbenchmark(\n\n lapply(x, sqrt),\n\n sqrt(x)\n\n)\n\n```\n"
},"formats":{"html":{"identifier":{"display-name":"HTML","target-format":"html","base-format":"html"},"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","code_folding":"show","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"knitr"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"inline-includes":false,"preserve-yaml":false,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[],"notebook-links":true,"format-links":true},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"optimization.html"},"language":{"toc-title-document":"Table of contents","toc-title-website":"On this page","related-formats-title":"Other Formats","related-notebooks-title":"Notebooks","source-notebooks-prefix":"Source","section-title-abstract":"Abstract","section-title-appendices":"Appendices","section-title-footnotes":"Footnotes","section-title-references":"References","section-title-reuse":"Reuse","section-title-copyright":"Copyright","section-title-citation":"Citation","appendix-attribution-cite-as":"For attribution, please cite this work as:","appendix-attribution-bibtex":"BibTeX citation:","title-block-author-single":"Author","title-block-author-plural":"Authors","title-block-affiliation-single":"Affiliation","title-block-affiliation-plural":"Affiliations","title-block-published":"Published","title-block-modified":"Modified","callout-tip-title":"Tip","callout-note-title":"Note","callout-warning-title":"Warning","callout-important-title":"Important","callout-caution-title":"Caution","code-summary":"Code","code-tools-menu-caption":"Code","code-tools-show-all-code":"Show All Code","code-tools-hide-all-code":"Hide All Code","code-tools-view-source":"View Source","code-tools-source-code":"Source Code","code-line":"Line","code-lines":"Lines","copy-button-tooltip":"Copy to Clipboard","copy-button-tooltip-success":"Copied!","repo-action-links-edit":"Edit this page","repo-action-links-source":"View source","repo-action-links-issue":"Report an issue","back-to-top":"Back to top","search-no-results-text":"No results","search-matching-documents-text":"matching documents","search-copy-link-title":"Copy link to search","search-hide-matches-text":"Hide additional matches","search-more-match-text":"more match in this 
document","search-more-matches-text":"more matches in this document","search-clear-button-title":"Clear","search-detached-cancel-button-title":"Cancel","search-submit-button-title":"Submit","search-label":"Search","toggle-section":"Toggle section","toggle-sidebar":"Toggle sidebar navigation","toggle-dark-mode":"Toggle dark mode","toggle-reader-mode":"Toggle reader mode","toggle-navigation":"Toggle navigation","crossref-fig-title":"Figure","crossref-tbl-title":"Table","crossref-lst-title":"Listing","crossref-thm-title":"Theorem","crossref-lem-title":"Lemma","crossref-cor-title":"Corollary","crossref-prp-title":"Proposition","crossref-cnj-title":"Conjecture","crossref-def-title":"Definition","crossref-exm-title":"Example","crossref-exr-title":"Exercise","crossref-ch-prefix":"Chapter","crossref-apx-prefix":"Appendix","crossref-sec-prefix":"Section","crossref-eq-prefix":"Equation","crossref-lof-title":"List of Figures","crossref-lot-title":"List of Tables","crossref-lol-title":"List of Listings","environment-proof-title":"Proof","environment-remark-title":"Remark","environment-solution-title":"Solution","listing-page-order-by":"Order By","listing-page-order-by-default":"Default","listing-page-order-by-date-asc":"Oldest","listing-page-order-by-date-desc":"Newest","listing-page-order-by-number-desc":"High to Low","listing-page-order-by-number-asc":"Low to High","listing-page-field-date":"Date","listing-page-field-title":"Title","listing-page-field-description":"Description","listing-page-field-author":"Author","listing-page-field-filename":"File Name","listing-page-field-filemodified":"Modified","listing-page-field-subtitle":"Subtitle","listing-page-field-readingtime":"Reading Time","listing-page-field-categories":"Categories","listing-page-minutes-compact":"{0} min","listing-page-category-all":"All","listing-page-no-matches":"No matching items"},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.3.433"},"extensions":{"book":{"multiFile":true}}}},"projectFormats":["html"]} \ No newline at end of file diff --git a/.quarto/idx/resources.qmd.json b/.quarto/idx/resources.qmd.json index 6489c7e..66ed437 100644 --- a/.quarto/idx/resources.qmd.json +++ b/.quarto/idx/resources.qmd.json @@ -1 +1 @@ -{"title":"Free Books","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}}},"headingText":"Free Books","containsRefs":false,"markdown":"\n\n\n\n
\n\n
\n\n\n### Intro\n\n* [R for Data Science](https://r4ds.had.co.nz/) by Garrett Grolemund and Hadley Wickham\n\n### Data Viz\n\n* [ggplot2: Elegant Graphics for Data Analysis](https://ggplot2-book.org/) by Hadley Wickham\n* [Data Visualization - A practical introduction](http://socviz.co/index.html#preface) by Kieran Healy\n\n### *down\n\n* [R Markdown: The Definitive Guide](https://bookdown.org/yihui/rmarkdown/) by Yihui Xie, J. J. Allaire, and Garrett Grolemund\n* [blogdown: Creating Websites with R Markdown](https://bookdown.org/yihui/blogdown/) by Yihui Xie, Amber Thomas, and Alison Presmanes Hill\n* [bookdown: Authoring Books and Technical Documents with R Markdown](https://bookdown.org/yihui/bookdown/) by Yihui Xie\n\n### Statistics\n\n* [Learning Statistics with R](https://learningstatisticswithr.com/) by Danielle Navarro\n* [Introduction to Econometrics with R](https://www.econometrics-with-r.org/) by Christoph Hanck, Martin Arnold, Alexander Gerber and Martin Schmelzer\n* [An Introduction to Bayesian Thinking](https://statswithr.github.io/book/) by Merlise Clyde et. al.\n* [Statistical Inference via Data Science](https://moderndive.com/index.html) by Chester Ismay and Albert Y. Kim\n\n### Machine Learning\n\n* [Hands-On Machine Learning with R](https://bradleyboehmke.github.io/HOML/) by Bradley Boehmke & Brandon Greenwell\n* [Feature Engineering and Selection: A Practical Approach for Predictive Models](http://www.feat.engineering/) by Max Kuhn and Kjell Johnson\n\n### Mapping and Geospatial Analysis\n\n* [Geocomputation with R](https://geocompr.robinlovelace.net/) by Robin Lovelace, Jakub Nowosad, Jannes Muenchow\n\n### Text Analysis\n\n* [Text Mining with R A Tidy Approach](https://www.tidytextmining.com/) by Julia Silge and David Robinson\n\n### Programming\n\n* [Advanced R](https://adv-r.hadley.nz/) by Hadley Wickham\n* [R Packages](https://r-pkgs.org/) by Hadley Wickham\n* [Master Spark with R](https://therinspark.com/) by Javier Luraschi, Kevin Kuo, and Edgar Ruiz\n* [Functional programming and unit testing for data munging with R](https://b-rodrigues.github.io/fput/) by Bruno Rodrigues\n\n# Websites\n\n* [RStudio Essentials](https://resources.rstudio.com/)\n* [RStudio Education](https://education.rstudio.com/)\n* [R Cheat Sheets](https://rstudio.com/resources/cheatsheets/)\n* Andrew Heiss' free [Data Viz 
Course](https://datavizm20.classes.andrewheiss.com/)\n"},"formats":{"html":{"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"markdown"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[]},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"resources.html"},"language":{},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.2.269"},"extensions":{"book":{"multiFile":true}}}}} \ No newline at end of file +{"title":"Free Books","markdown":{"yaml":{"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}}},"headingText":"Free Books","containsRefs":false,"markdown":"\n\n\n\n
\n\n
\n\n\n### Intro\n\n* [R for Data Science](https://r4ds.had.co.nz/) by Garrett Grolemund and Hadley Wickham\n\n### Data Viz\n\n* [ggplot2: Elegant Graphics for Data Analysis](https://ggplot2-book.org/) by Hadley Wickham\n* [Data Visualization - A practical introduction](http://socviz.co/index.html#preface) by Kieran Healy\n\n### *down\n\n* [R Markdown: The Definitive Guide](https://bookdown.org/yihui/rmarkdown/) by Yihui Xie, J. J. Allaire, and Garrett Grolemund\n* [blogdown: Creating Websites with R Markdown](https://bookdown.org/yihui/blogdown/) by Yihui Xie, Amber Thomas, and Alison Presmanes Hill\n* [bookdown: Authoring Books and Technical Documents with R Markdown](https://bookdown.org/yihui/bookdown/) by Yihui Xie\n\n### Statistics\n\n* [Learning Statistics with R](https://learningstatisticswithr.com/) by Danielle Navarro\n* [Introduction to Econometrics with R](https://www.econometrics-with-r.org/) by Christoph Hanck, Martin Arnold, Alexander Gerber and Martin Schmelzer\n* [An Introduction to Bayesian Thinking](https://statswithr.github.io/book/) by Merlise Clyde et. al.\n* [Statistical Inference via Data Science](https://moderndive.com/index.html) by Chester Ismay and Albert Y. Kim\n\n### Machine Learning\n\n* [Hands-On Machine Learning with R](https://bradleyboehmke.github.io/HOML/) by Bradley Boehmke & Brandon Greenwell\n* [Feature Engineering and Selection: A Practical Approach for Predictive Models](http://www.feat.engineering/) by Max Kuhn and Kjell Johnson\n\n### Mapping and Geospatial Analysis\n\n* [Geocomputation with R](https://geocompr.robinlovelace.net/) by Robin Lovelace, Jakub Nowosad, Jannes Muenchow\n\n### Text Analysis\n\n* [Text Mining with R A Tidy Approach](https://www.tidytextmining.com/) by Julia Silge and David Robinson\n\n### Programming\n\n* [Advanced R](https://adv-r.hadley.nz/) by Hadley Wickham\n* [R Packages](https://r-pkgs.org/) by Hadley Wickham\n* [Master Spark with R](https://therinspark.com/) by Javier Luraschi, Kevin Kuo, and Edgar Ruiz\n* [Functional programming and unit testing for data munging with R](https://b-rodrigues.github.io/fput/) by Bruno Rodrigues\n\n# Websites\n\n* [RStudio Essentials](https://resources.rstudio.com/)\n* [RStudio Education](https://education.rstudio.com/)\n* [R Cheat Sheets](https://rstudio.com/resources/cheatsheets/)\n* Andrew Heiss' free [Data Viz Course](https://datavizm20.classes.andrewheiss.com/)\n","srcMarkdownNoYaml":"\n\n\n\n
\n\n
\n\n# Free Books\n\n### Intro\n\n* [R for Data Science](https://r4ds.had.co.nz/) by Garrett Grolemund and Hadley Wickham\n\n### Data Viz\n\n* [ggplot2: Elegant Graphics for Data Analysis](https://ggplot2-book.org/) by Hadley Wickham\n* [Data Visualization - A practical introduction](http://socviz.co/index.html#preface) by Kieran Healy\n\n### *down\n\n* [R Markdown: The Definitive Guide](https://bookdown.org/yihui/rmarkdown/) by Yihui Xie, J. J. Allaire, and Garrett Grolemund\n* [blogdown: Creating Websites with R Markdown](https://bookdown.org/yihui/blogdown/) by Yihui Xie, Amber Thomas, and Alison Presmanes Hill\n* [bookdown: Authoring Books and Technical Documents with R Markdown](https://bookdown.org/yihui/bookdown/) by Yihui Xie\n\n### Statistics\n\n* [Learning Statistics with R](https://learningstatisticswithr.com/) by Danielle Navarro\n* [Introduction to Econometrics with R](https://www.econometrics-with-r.org/) by Christoph Hanck, Martin Arnold, Alexander Gerber and Martin Schmelzer\n* [An Introduction to Bayesian Thinking](https://statswithr.github.io/book/) by Merlise Clyde et. al.\n* [Statistical Inference via Data Science](https://moderndive.com/index.html) by Chester Ismay and Albert Y. Kim\n\n### Machine Learning\n\n* [Hands-On Machine Learning with R](https://bradleyboehmke.github.io/HOML/) by Bradley Boehmke & Brandon Greenwell\n* [Feature Engineering and Selection: A Practical Approach for Predictive Models](http://www.feat.engineering/) by Max Kuhn and Kjell Johnson\n\n### Mapping and Geospatial Analysis\n\n* [Geocomputation with R](https://geocompr.robinlovelace.net/) by Robin Lovelace, Jakub Nowosad, Jannes Muenchow\n\n### Text Analysis\n\n* [Text Mining with R A Tidy Approach](https://www.tidytextmining.com/) by Julia Silge and David Robinson\n\n### Programming\n\n* [Advanced R](https://adv-r.hadley.nz/) by Hadley Wickham\n* [R Packages](https://r-pkgs.org/) by Hadley Wickham\n* [Master Spark with R](https://therinspark.com/) by Javier Luraschi, Kevin Kuo, and Edgar Ruiz\n* [Functional programming and unit testing for data munging with R](https://b-rodrigues.github.io/fput/) by Bruno Rodrigues\n\n# Websites\n\n* [RStudio Essentials](https://resources.rstudio.com/)\n* [RStudio Education](https://education.rstudio.com/)\n* [R Cheat Sheets](https://rstudio.com/resources/cheatsheets/)\n* Andrew Heiss' free [Data Viz 
},"formats":{"html":{"identifier":{"display-name":"HTML","target-format":"html","base-format":"html"},"execute":{"fig-width":7,"fig-height":5,"fig-format":"retina","fig-dpi":96,"df-print":"default","error":false,"eval":true,"cache":null,"freeze":false,"echo":true,"output":{"html_document":{"includes":{"in_header":"analytics.html"},"css":"styles.css","toc":true,"toc_float":true,"pandoc_args":"--tab-stop=2"}},"warning":true,"include":true,"keep-md":false,"keep-ipynb":false,"ipynb":null,"enabled":null,"daemon":null,"daemon-restart":false,"debug":false,"ipynb-filters":[],"engine":"markdown"},"render":{"keep-tex":false,"keep-source":false,"keep-hidden":false,"prefer-html":false,"output-divs":true,"output-ext":"html","fig-align":"default","fig-pos":null,"fig-env":null,"code-fold":"none","code-overflow":"scroll","code-link":false,"code-line-numbers":false,"code-tools":false,"tbl-colwidths":"auto","merge-includes":true,"inline-includes":false,"preserve-yaml":false,"latex-auto-mk":true,"latex-auto-install":true,"latex-clean":true,"latex-max-runs":10,"latex-makeindex":"makeindex","latex-makeindex-opts":[],"latex-tlmgr-opts":[],"latex-input-paths":[],"latex-output-dir":null,"link-external-icon":false,"link-external-newwindow":false,"self-contained-math":false,"format-resources":[],"notebook-links":true,"format-links":true},"pandoc":{"standalone":true,"wrap":"none","default-image-extension":"png","to":"html","include-in-header":["analytics.html"],"css":["styles.css"],"toc":true,"output-file":"resources.html"},"language":{"toc-title-document":"Table of contents","toc-title-website":"On this page","related-formats-title":"Other Formats","related-notebooks-title":"Notebooks","source-notebooks-prefix":"Source","section-title-abstract":"Abstract","section-title-appendices":"Appendices","section-title-footnotes":"Footnotes","section-title-references":"References","section-title-reuse":"Reuse","section-title-copyright":"Copyright","section-title-citation":"Citation","appendix-attribution-cite-as":"For attribution, please cite this work as:","appendix-attribution-bibtex":"BibTeX citation:","title-block-author-single":"Author","title-block-author-plural":"Authors","title-block-affiliation-single":"Affiliation","title-block-affiliation-plural":"Affiliations","title-block-published":"Published","title-block-modified":"Modified","callout-tip-title":"Tip","callout-note-title":"Note","callout-warning-title":"Warning","callout-important-title":"Important","callout-caution-title":"Caution","code-summary":"Code","code-tools-menu-caption":"Code","code-tools-show-all-code":"Show All Code","code-tools-hide-all-code":"Hide All Code","code-tools-view-source":"View Source","code-tools-source-code":"Source Code","code-line":"Line","code-lines":"Lines","copy-button-tooltip":"Copy to Clipboard","copy-button-tooltip-success":"Copied!","repo-action-links-edit":"Edit this page","repo-action-links-source":"View source","repo-action-links-issue":"Report an issue","back-to-top":"Back to top","search-no-results-text":"No results","search-matching-documents-text":"matching documents","search-copy-link-title":"Copy link to search","search-hide-matches-text":"Hide additional matches","search-more-match-text":"more match in this document","search-more-matches-text":"more matches in this document","search-clear-button-title":"Clear","search-detached-cancel-button-title":"Cancel","search-submit-button-title":"Submit","search-label":"Search","toggle-section":"Toggle 
section","toggle-sidebar":"Toggle sidebar navigation","toggle-dark-mode":"Toggle dark mode","toggle-reader-mode":"Toggle reader mode","toggle-navigation":"Toggle navigation","crossref-fig-title":"Figure","crossref-tbl-title":"Table","crossref-lst-title":"Listing","crossref-thm-title":"Theorem","crossref-lem-title":"Lemma","crossref-cor-title":"Corollary","crossref-prp-title":"Proposition","crossref-cnj-title":"Conjecture","crossref-def-title":"Definition","crossref-exm-title":"Example","crossref-exr-title":"Exercise","crossref-ch-prefix":"Chapter","crossref-apx-prefix":"Appendix","crossref-sec-prefix":"Section","crossref-eq-prefix":"Equation","crossref-lof-title":"List of Figures","crossref-lot-title":"List of Tables","crossref-lol-title":"List of Listings","environment-proof-title":"Proof","environment-remark-title":"Remark","environment-solution-title":"Solution","listing-page-order-by":"Order By","listing-page-order-by-default":"Default","listing-page-order-by-date-asc":"Oldest","listing-page-order-by-date-desc":"Newest","listing-page-order-by-number-desc":"High to Low","listing-page-order-by-number-asc":"Low to High","listing-page-field-date":"Date","listing-page-field-title":"Title","listing-page-field-description":"Description","listing-page-field-author":"Author","listing-page-field-filename":"File Name","listing-page-field-filemodified":"Modified","listing-page-field-subtitle":"Subtitle","listing-page-field-readingtime":"Reading Time","listing-page-field-categories":"Categories","listing-page-minutes-compact":"{0} min","listing-page-category-all":"All","listing-page-no-matches":"No matching items"},"metadata":{"lang":"en","fig-responsive":true,"quarto-version":"1.3.433"},"extensions":{"book":{"multiFile":true}}}},"projectFormats":["html"]} \ No newline at end of file diff --git a/docs/.DS_Store b/docs/.DS_Store deleted file mode 100644 index 5f93ba0..0000000 Binary files a/docs/.DS_Store and /dev/null differ diff --git a/docs/getting-data.html b/docs/getting-data.html index ac4dae6..c51eec7 100644 --- a/docs/getting-data.html +++ b/docs/getting-data.html @@ -2,7 +2,7 @@ - + @@ -11,10 +11,16 @@ @@ -112,7 +88,8 @@ "search-more-matches-text": "more matches in this document", "search-clear-button-title": "Clear", "search-detached-cancel-button-title": "Cancel", - "search-submit-button-title": "Submit" + "search-submit-button-title": "Submit", + "search-label": "Search" } } @@ -136,37 +113,48 @@
@@ -176,7 +164,7 @@ + @@ -272,6 +254,7 @@

On this page

  • Centroids
  • Saving Plots
  • +
  • Interactive Plots
  • urbnthemes
    • Overview
    • @@ -1672,6 +1655,45 @@

      Saving Plots

      ggsave(filename = "cars.pdf", plot = plot2, width = 6.5, height = 4, device = cairo_pdf)
      +
      +

      Interactive Plots

      +

We can make any of the previous plots interactive with the powerful and easy-to-use plotly library. All we have to do is wrap a ggplot object in the ggplotly() function. Note: you can't chain ggplotly() onto the end of a ggplot pipeline; instead, save the ggplot to a variable and then pass that variable to ggplotly(), as shown below.

      +

You can customize the tooltip text by mapping a text value inside aes() and then specifying tooltip = "text" in the ggplotly() call.

      +
      +
      library(plotly)
      +
      +stock_plot <- as_tibble(EuStockMarkets) %>% 
      +    mutate(date = time(EuStockMarkets)) %>% 
      +    gather(key = "key", value = "value", -date) %>% 
      +    ggplot(mapping = aes(x = date, y = value, color = key,
      +                                             # sometimes ggplotly messes with line charts,
      +                                             # adding a group value usually helps with that
      +                                             group = key,
      +                                             # customize the tooltip with the text aes
      +                                             text = paste0("Value: ", round(value, 2), "<br>",
      +                                                                        "Date: ", round(date, 3), "<br>",
      +                                                                        "Key: ", key))
      +                                             ) +
      +    geom_line() +
      +  scale_x_continuous(expand = expansion(mult = c(0.002, 0)), 
      +                     limits = c(1991, 1999), 
      +                     breaks = c(1991, 1993, 1995, 1997, 1999)) +
      +  scale_y_continuous(expand = expansion(mult = c(0, 0.002)), 
      +                     breaks = 0:4 * 2500,
      +                     labels = scales::dollar, 
      +                     limits = c(0, 10000)) +  
      +    labs(x = "Date",
      +             y = "Value")
      +
+# make interactive with ggplotly
+# Uncomment the pipe to hide the interactive toolbar in the top right
      +ggplotly(stock_plot, tooltip = "text")  # %>%  config(displayModeBar = FALSE)
      +
      +
      + +
      +
      +
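For reference, a minimal version of the same pattern, stripped of the styling above (this sketch assumes only that ggplot2 and plotly are installed):

library(ggplot2)
library(plotly)

# save the ggplot object first...
p <- ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point()

# ...then wrap the saved object; default tooltips show the mapped aesthetics
ggplotly(p)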

      urbnthemes

      @@ -1683,17 +1705,17 @@

      Overview

      Usage

      Use set_urbn_defaults(style = "print") to set the default styles. scatter_grid(), remove_ticks(), add_axis(), and remove_axis() can all be used to improve graphics.

library(ggplot2)
library(urbnthemes)

set_urbn_defaults(style = "print")

ggplot(data = mtcars, mapping = aes(factor(cyl))) +
  geom_bar() + 
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(x = "Number of Cylinders",
       y = "Count") +
  remove_ticks()
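The bar chart above uses remove_ticks(); the other helpers named above slot in the same way. A sketch using scatter_grid() on a scatter plot (the plot content here is illustrative, not from the guide):

library(ggplot2)
library(urbnthemes)

set_urbn_defaults(style = "print")

ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  # add grid lines suited to scatter plots
  scatter_grid()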

      @@ -1711,48 +1733,48 @@

      Combining elements

    • urbn_geofacet
library(ggplot2)
library(urbnthemes)

set_urbn_defaults(style = "print")

plot <- ggplot(data = mtcars, mapping = aes(factor(cyl))) +
  geom_bar() + 
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(x = "Number of Cylinders",
       y = "Count") +
  remove_ticks()

urbn_plot(plot, urbn_logo_text(), ncol = 1, heights = c(30, 1))

Sometimes it's important to set the y-axis title horizontally above the plot. urbn_y_title() can be used for this task. The following example goes one step further and adds the title between the legend and the plot.

library(ggplot2)
library(urbnthemes)

set_urbn_defaults()

plot <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = factor(cyl))) +
  geom_point() + 
  scale_x_continuous(expand = c(0, 0),
                     limits = c(0, 8)) +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, 40)) +
  remove_ticks() +
  labs("") +
  scatter_grid()

urbn_plot(get_legend(plot),
          urbn_y_title("Miles per gallon"),
          remove_legend(plot), 
          urbn_logo_text(), 
          ncol = 1, 
          heights = c(3, 1, 30, 1))

    @@ -1777,7 +1799,7 @@

    Palettes

    Use view_palette() to see the palette:

view_palette(palette_urbn_magenta)
    [1] "c(#351123, #761548, #af1f6b, #e90989, #e54096, #e46aa7, #eb99c2, #f5cbdf)"
    @@ -1787,14 +1809,14 @@

    Palettes

    The vectors can be subset using base R syntax. This allows for the quick selection of specific colors from a palette.

palette_urbn_main[1:4]
         cyan    yellow     black      gray 
     "#1696d2" "#fdbf11" "#000000" "#d2d2d2" 
palette_urbn_spacegray[1:5]
    [1] "#d5d5d4" "#adabac" "#848081" "#5c5859" "#332d2f"
    @@ -1828,7 +1850,7 @@

Bibliography and references

Winston Chang (2014). extrafont: Tools for using fonts. R package version 0.17. https://CRAN.R-project.org/package=extrafont

    Yihui Xie (2018). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.19.

sessionInfo()
    R version 4.2.2 (2022-10-31)
     Platform: aarch64-apple-darwin20 (64-bit)
attached base packages:
[1] stats     graphics  grDevices datasets  utils     methods   base     

other attached packages:
 [1] plotly_4.10.4         AmesHousing_0.0.4     gghighlight_0.4.0    
 [4] fivethirtyeight_0.6.2 ggsankey_0.0.99999    ggridges_0.5.4       
 [7] ggbeeswarm_0.7.1      ggrepel_0.9.2         gapminder_0.3.0      
[10] urbnthemes_0.0.2      forcats_1.0.0         stringr_1.5.1        
[13] dplyr_1.1.4           purrr_1.0.2           readr_2.1.3          
[16] tidyr_1.3.1           tibble_3.2.1          ggplot2_3.5.0        
[19] tidyverse_1.3.2       knitr_1.40           

loaded via a namespace (and not attached):
 [1] nlme_3.1-160        fs_1.5.2            lubridate_1.9.0    
 [4] bit64_4.0.5         httr_1.4.4          tools_4.2.2        
 [7] backports_1.4.1     utf8_1.2.4          R6_2.5.1           
[10] vipor_0.4.5         lazyeval_0.2.2      DBI_1.1.3          
[13] mgcv_1.8-41         colorspace_2.1-0    withr_3.0.0        
[16] tidyselect_1.2.1    gridExtra_2.3       bit_4.0.5          
[19] curl_4.3.3          compiler_4.2.2      extrafontdb_1.0    
[22] cli_3.6.2           rvest_1.0.3         xml2_1.3.3         
[25] labeling_0.4.3      scales_1.3.0        hexbin_1.28.2      
[28] digest_0.6.30       rmarkdown_2.18      pkgconfig_2.0.3    
[31] htmltools_0.5.4     extrafont_0.18      dbplyr_2.2.1       
[34] fastmap_1.1.0       htmlwidgets_1.6.1   rlang_1.1.3        
[37] readxl_1.4.1        rstudioapi_0.14     farver_2.1.1       
[40] generics_0.1.3      jsonlite_1.8.3      crosstalk_1.2.0    
[43] vroom_1.6.0         googlesheets4_1.0.1 magrittr_2.0.3     
[46] Matrix_1.5-1        Rcpp_1.0.9          munsell_0.5.1      
[49] fansi_1.0.6         lifecycle_1.0.4     stringi_1.8.3      
[52] yaml_2.3.6          grid_4.2.2          parallel_4.2.2     
[55] crayon_1.5.2        lattice_0.20-45     splines_4.2.2      
[58] haven_2.5.1         hms_1.1.2           pillar_1.9.0       
[61] urbnmapr_0.0.0.9002 reprex_2.0.2        glue_1.7.0         
[64] evaluate_0.18       data.table_1.14.4   remotes_2.4.2      
[67] renv_0.16.0         modelr_0.1.10       vctrs_0.6.5        
[70] tzdb_0.3.0          Rttf2pt1_1.3.11     cellranger_1.1.0   
[73] gtable_0.3.4        assertthat_0.2.1    xfun_0.34          
[76] broom_1.0.1         viridisLite_0.4.2   googledrive_2.0.0  
[79] gargle_1.2.1        beeswarm_0.4.0      timechange_0.1.1   
[82] ellipsis_0.3.2     

    Cleaning Addresses

f_address                                                     Type of address
123 Troy Drive, Pillowtown, CO, 92432                         residential address
789 Abed Avenue, Apt 666, Blanketsburg, CO, 92489             residential apartment address
Shirley Boulevard and Britta Drive, Blanketsburg, CO, 92489   street intersection
Pillowtown, CO                                                city
92489, CO                                                     zip code
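The guide's actual parsing code is not shown in this diff, but a rough sketch of flagging these five patterns with stringr might look like the following (the regular expressions are illustrative assumptions, not the guide's):

library(dplyr)
library(stringr)

addresses <- tibble(
  f_address = c(
    "123 Troy Drive, Pillowtown, CO, 92432",
    "789 Abed Avenue, Apt 666, Blanketsburg, CO, 92489",
    "Shirley Boulevard and Britta Drive, Blanketsburg, CO, 92489",
    "Pillowtown, CO",
    "92489, CO"
  )
)

addresses %>%
  mutate(type = case_when(
    # a standalone "and" between street names marks an intersection
    str_detect(f_address, "\\band\\b") ~ "street intersection",
    # an apartment or unit designator marks an apartment address
    str_detect(f_address, regex("apt|unit", ignore_case = TRUE)) ~ "residential apartment address",
    # a leading house number marks a street address
    str_detect(f_address, "^\\d+\\s") ~ "residential address",
    # a leading five-digit code marks a zip code
    str_detect(f_address, "^\\d{5},") ~ "zip code",
    TRUE ~ "city"
  ))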


    Profiling & Benchmarking

    })

In this toy example it looks like the read.csv function is the bottleneck, so that's the first place to look for a faster alternative.

    @@ -288,12 +287,12 @@

    Profiling & Benchmarking

    )
    Unit: milliseconds
    -                                              expr      min       lq     mean
    -        read.csv("optimization/data/diamonds.csv") 64.77053 66.14555 71.04118
    - readr::read_csv("optimization/data/diamonds.csv") 32.04847 33.20362 37.12877
    -   median       uq      max neval
    - 68.25725 69.80660 134.2046   100
    - 34.07731 35.69267 158.6962   100
+                                              expr       min        lq
+        read.csv("optimization/data/diamonds.csv") 103.14624 111.61502
+ readr::read_csv("optimization/data/diamonds.csv")  55.57689  59.80873
+      mean    median        uq      max neval
+ 135.10956 115.71928 127.93492 453.5855   100
+  75.97688  63.64992  71.88532 372.4557   100

In this case, read_csv is about twice as fast as the base R implementation.
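The microbenchmark call that produced this output is cut off in the diff above; it was presumably along these lines (file path as used throughout the chapter):

library(microbenchmark)

# compare base R and readr on the same file; each expression runs 100 times
microbenchmark(
  read.csv("optimization/data/diamonds.csv"),
  readr::read_csv("optimization/data/diamonds.csv")
)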

    @@ -411,12 +410,12 @@

    Motivating Example

    )
    Unit: seconds
    -       expr         min         lq          mean       median          uq
    - sequential 0.000001066 0.00000123 0.00000177858 0.0000016195 0.000002132
    -   parallel 0.014658443 0.01518220 0.01828126942 0.0157106875 0.017450646
    +       expr         min           lq         mean       median          uq
    + sequential 0.000001763 0.0000020705 0.0000029848 0.0000029315 0.000003772
    +   parallel 0.026585548 0.0282111980 0.0333569452 0.0291357070 0.030628517
              max neval
    - 0.000006847   100
    - 0.198878864   100
+ 0.000009799   100
+ 0.337903181   100

Parallelization was actually slower. In this case, the overhead of setting up and coordinating the parallel workers outweighs any gains from splitting up such a trivial computation.
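A minimal sketch of that effect using the base parallel package (the chapter's code may use a different framework; this is an assumed reproduction, not the original):

library(parallel)
library(microbenchmark)

x <- 1:100

microbenchmark(
  # trivial work: the loop body costs almost nothing
  sequential = lapply(x, sqrt),
  # forking and coordinating worker processes costs milliseconds,
  # dwarfing the microseconds of actual computation
  # (mclapply forks, so this sketch assumes macOS or Linux)
  parallel = mclapply(x, sqrt, mc.cores = 2)
)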

    @@ -446,7 +445,7 @@

)
       user  system elapsed 
    - 21.846   0.663  22.535 
+  35.229   2.688  42.309 

And now running the same code in parallel:

    @@ -466,7 +465,7 @@

)
       user  system elapsed 
    -  0.376   0.121  11.164 
+   0.876   0.210  21.655 

While we didn't achieve perfect scaling, we still get a nice bump in execution speed.

    @@ -709,9 +708,9 @@

)
    Unit: microseconds
    -         expr     min       lq       mean    median        uq       max neval
    -  bad_loop(x) 896.465 967.5590 2027.18719 1054.7250 1132.6660 55959.588   100
    - good_loop(x)   4.346   4.7355   21.39134    5.8425    7.9745  1437.009   100
+         expr      min        lq       mean    median        uq       max neval
+  bad_loop(x) 1042.179 1267.577 1891.78264 1328.5640 1446.9720 10173.125   100
+ good_loop(x)    6.191    6.437   32.23338    6.7035   11.2545  2366.725   100

    And note how performance of the “bad” loop degrades as the loop size grows.
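The two functions are defined earlier in the chapter and not shown in this diff; the classic pattern they contrast looks something like this (bodies assumed for illustration):

# grows the result with c() on every iteration, so R reallocates and
# copies the whole vector each time -- roughly quadratic in length(x)
bad_loop <- function(x) {
  out <- c()
  for (i in seq_along(x)) {
    out <- c(out, x[i] + 1)
  }
  out
}

# preallocates the full result once and fills it in place
good_loop <- function(x) {
  out <- numeric(length(x))
  for (i in seq_along(x)) {
    out[i] <- x[i] + 1
  }
  out
}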

    @@ -727,9 +726,9 @@

)
    Unit: microseconds
    -         expr       min         lq        mean     median         uq       max
    -  bad_loop(y) 13175.473 17043.3310 18404.84383 17790.6995 18655.1230 65857.726
    - good_loop(y)     9.717    10.2705    14.36558    11.3775    16.7485    35.711
    +         expr       min         lq       mean    median        uq       max
    +  bad_loop(y) 19249.582 22663.9595 24695.7231 23492.774 24909.407 81335.882
    + good_loop(y)    14.022    14.5345    21.2626    23.329    26.486    64.616
      neval
        100
        100
    @@ -764,9 +763,9 @@

    Vectorized Functions< )

    Unit: nanoseconds
    -            expr   min    lq     mean median    uq   max neval
    - lapply(x, sqrt) 14801 15047 15325.39  15170 15334 21279   100
    -         sqrt(x)   205   246   341.53    287   369  1107   100
+            expr   min    lq     mean median      uq   max neval
+ lapply(x, sqrt) 20172 20418 20847.27  20541 20725.5 37228   100
+         sqrt(x)   287   328   397.70    369   369.0  2296   100
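The lesson generalizes: when a function is already vectorized, call it on the whole vector instead of looping over elements. For example:

x <- rnorm(1e5)

# one call into compiled code for the whole vector
vectorized <- sqrt(x)

# one R-level function call per element, plus a flattening step
elementwise <- unlist(lapply(x, sqrt))

all.equal(vectorized, elementwise)  # TRUE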