.Rhistory

prop.table(output)[, 1:5]                 # Proportionality of "regions" 1-5
hist(parks$Attendance)
hist(parks$Year)
hist(parks$Attendance)
hist(parks$Attendance, breaks = 20)
hist(parks$Attendance, breaks = 30)
hist(parks$Attendance, breaks = 40)
hist(parks$Attendance, breaks = 50)
hist(parks$Attendance, breaks = 100)
boxplot(parks$Region, parks$Attendance)
boxplot(x = parks$Region, y = parks$Attendance)
boxplot(x = parks[ , c("Region", "Attendance")])
boxplot(x = parks$Region)
boxplot(x = parks$Region, y = parks$Attendance)
boxplot(x = factor(parks$Region), y = parks$Attendance)
boxplot(x = as.factor(parks$Region), y = parks$Attendance)
?boxplot
boxplot(Region ~ Attendance, data = parks)
parks
boxplot(parks$Region ~ parks$Attendance)
boxplot(parks$Attendance ~ parks$Region)
pairs(parks)
pairs(parks$Year)
pairs(parks$Year, parks$County)
pairs(parks$Year, parks$Attendance)
pairs(c(parks$Year, parks$Attendance))
?pairs
pairs(iris[1:4], main = "Anderson's Iris Data -- 3 species",
pch = 21, bg = c("red", "green3", "blue")[unclass(iris$Species)])
plot(x = parks$Year, y = parks$Attendance)
plot(x = parks$Year, y = parks$Attendance)
plot(x = parks$Year, y = parks$Attendance)
plot(x = as.numeric(parks$Year), y = parks$Attendance)
plot(x = parks$Year, y = parks$Attendance)
library(ggplot2)
library(ggplot2)
plot(x = parks$Year, y = parks$Attendance)
plot(x = economics$psavert, y = economics$uempmed)
plot(x = economics$psavert, y = economics$unemploy)
plot(x = economics$uempmed, y = economics$unemploy)
?economics
library(GGally)
?ggpairs
pairs(x = economics)
library(ggplot2)
pairs(x = parks)
pairs(x = iris)
library(ggplot2)
pairs(x = economics)
model <- lm(Attendance ~ Year + Region,
data = parks)                   # Create linear model: "model"
model
plot(model)
par(mfrow = c(2, 2))                        # Specify dimensions in `par()`
plot(model)
library(GGally)
ggpairs(economics)
ggpairs(iris,
aes(color = Species), diag = FALSE)
ggpairs(iris,
aes(color = Species))
ggpairs(iris,
aes(color = Species)) +
theme_minimal()
ggpairs(iris,
aes(color = Species)) +
theme_light()
ggpairs(iris,
aes(color = Species)) +
theme_classic()
ggpairs(iris,
aes(color = Species)) +
theme_minmal()
ggpairs(iris,
aes(color = Species)) +
theme_minimal()
library(ggplot2)
library(GGally)
ggpairs(iris,
aes(color = Species)) +
theme_minimal()
---
output:
html_document:
theme: readable
highlight: tango
self_contained: false
css: textbook.css
---
# Summarizing Vectors
<br>
<br>
<div class="tip">
## Key Concepts
In this chapter, we'll explore the following key concepts:
* Table counts with function `table()`
* Table Proportionality: `prop.table()`
* Table Margins: `margin.table()`
* Functions for Descriptive Stats:
- `min()`
- `max()`
- `mean()`
- `summary()`
- `quantile()`
* Combining Functions `summary()` & `data.frame()`
* Creating Tables of Descriptives: Factor & Numeric Variables
## New Packages
This chapter uses the following packages:
* Package A
## Key Takeaways
Too long; didn't read? Here's what you need to know:
* Item A
* Item B
* Item C
<br>
<br>
<br>
</div>
```{r echo=F}
# ATTENTION : GLOBAL CHUNK DEFAULTS
knitr::opts_chunk$set(message = FALSE,
warning = FALSE)
```
```{r include=F}
tutorial::go_interactive(greedy = FALSE)
```
<br>
<br>
## Exploratory Data Analysis
**Exploratory Data Analysis** (**EDA**) is the implementation of exploratory techniques to better understand *new data*. Typically, **EDA** uses visualizing and summarizing functions to detect patterns and anomalies in data beyond initial hypotheses and research questions.
<br>
**Practice Data:** To demonstrate, we'll use [state park annual attendance](https://data.ny.gov/Recreation/State-Park-Annual-Attendance-Figures-by-Facility-B/8f3n-xj78) from the State of New York's *Office for Parks, Recreation, and Historic Preservation* (OPRHP).
```{r cache=T}
library(readr)
url <- paste0("https://data.ny.gov/api/views/8f3n",
"-xj78/rows.csv?accessType=DOWNLOAD")   # Assign URL: "url"
parks <- read_csv(url)                                # Read in data: "parks"
```
<br>
<br>
### Common Initial Analysis Techniques
**Base R** has a litany of functions commonly used in **Initial Data Analysis**, or **IDA**.
* **IDA** is the opening salvo of functions in **Exploratory Data Analysis**.
* **IDA** techniques aid in understanding the nuances of your data.
<br>
**Data Structure:** Function `str()` is a go-to function for understanding:
* The *class* of the dataset, e.g. **matrix** or **data.frame**
* The *dimensions* of a dataset (rows and columns)
* The *class* of each variable in the dataset
* The first several values of each variable
* The **levels** in each **factor** variable
```{r}
str(parks)
```
<br>
**Dimensions:** Like measuring width and height, we can do the same with datasets:
* Function `nrow()` prints the total number of rows
* Function `ncol()` prints the total number of columns
* Function `dim()` prints the total number of rows and columns
Recall that in R, dimensions are printed or specified with rows first, then columns.
```{r}
nrow(parks)     # Print total rows
ncol(parks)     # Print total columns
dim(parks)      # Print rows and columns
```
<br>
**Length:** Function `length()` prints the number of values for a single variable or vector.
```{r}
length(parks$Facility)
```
<br>
**Row & Column Names:** Three functions are ideal for printing row and column names:
* Function `rownames()` prints the names of each row, though rows are rarely named
* Function `colnames()` prints the names of each column (i.e. variable)
* Function `names()` also prints the names of each variable
* Rename variables by assigning new names to their output
```{r}
rownames(parks)[1:5]                              # Print row names 1-5
colnames(parks)                                   # Print variable names
names(parks)                                      # Print variable names
names(parks) <- c("Year", "Region", "County",
"Facility", "Attendance")       # Reassign new names
names(parks)                                      # Print new names
```
<br>
**Classes:** We can determine the class of any object using function `class()`.
* Determine classes of entire datasets
* Determine classes of individual variables
* Determine classes of other objects, e.g. models
```{r}
class(parks)                                      # Dataset class
class(parks$Year)                                 # Variable class
model <- lm(Attendance ~ Year + Region,
data = parks)                         # Assign linear model
class(model)                                      # Model class
```
<br>
**Categorical Levels:** Print each category ("level") of **factor** variables with `levels()`:
```{r}
fctr <- as.factor(parks$Region)                   # Coerce to "factor"
levels(fctr)                                      # Print levels
```
<br>
**First & Last Observations**: Functions `head()` and `tail()` print first and last rows:
* Function `head()` prints the first rows of your data
* Function `tail()` prints the last rows of your data
* Specify the number of rows with argument `n =`
* By default, six rows are printed
```{r}
head(parks, n = 3)      # Print first 3 rows
tail(parks, n = 3)      # Print last 3 rows
```
<br>
**Summaries:** Function `summary()` describes individual variables according to their class:
* Class **numeric**, **integer**, or **double** prints descriptive statistics
* Class **character** includes total values and missing values
* Class **factor** tallies the total occurrences in each level
```{r}
summary(parks)
```
<br>
**View Interactively:** In RStudio, function `View()` presents data in an interactive table.
```{r eval=F}
View(parks)
```
<center>
```{r echo=F, fig.align="center", fig.cap="*An interactive table resulting from function `View()` in RStudio's IDE.*", out.width="90%"}
knitr::include_graphics("~/Projects/DS4PS/DS4PS Repo/dp4ss-textbook/figures/function_view.jpg")
```
</center>
<br>
**Documentation:** If data are from an R package, `?` or `help()` opens documentation.
```{r eval=F}
library(ggplot2)            # Load package containing data
?economics                  # Open documentation with `?`
help(economics)             # Open documentation with help()
```
<center>
```{r echo=F, fig.align="center", fig.cap="*Interactive documentation in RStudio using `?` or `help()`.*", out.width="90%"}
knitr::include_graphics("~/Projects/DS4PS/DS4PS Repo/dp4ss-textbook/figures/help_documentation.jpg")
```
</center>
<br>
<br>
### Techniques for Tallies & Proportions
Many functions allow tallying frequencies and proportions for **character** and **factor** variables.
<br>
**Contingency Tables:** Function `table()` prints the total number of occurrences of each qualitative value.
These tables are also called **Contingency Tables**.
```{r}
table(parks$Region)
```
<br>
**Proportionality:** Function `prop.table()`, with `table()` output, shows proportionality.
```{r}
regions <- table(parks$Region)      # Assign `table()` output: "regions"
prop.table(regions)                 # Print proportionality
```
<br>
Functions `table()` or `prop.table()` can also weigh variables against each other.
```{r}
# Cross-tabulate "Year" against "Region", then express the counts as
# proportions of all records. The two-column data frame is given its own
# name so that base R's `subset()` function is not masked.
year_region <- parks[, c("Year", "Region")]   # Subset two variables
output <- table(year_region)                  # Assign `table()` output
output[, 1:5]                                 # Frequency of "regions" 1-5
prop.table(output)[, 1:5]                     # Proportionality of "regions" 1-5
```
<br>
<br>
### Initial Analysis Techniques from Packages
Many R packages are helpful in **Initial Data Analysis**, e.g. **DescTools** and **dplyr**.
<br>
**Advanced Summaries:** In **DescTools**, function `Desc()` is an enhanced `summary()`.
```{r}
library(DescTools)
Desc(parks$Year)          # Function `Desc()` on a quantitative variable
Desc(parks$Region)        # Function `Desc()` on a qualitative variable
```
<br>
**Advanced Structures:** In **dplyr**, function `glimpse()` is a more organized `str()`.
```{r}
library(dplyr)
glimpse(parks)
```
<br>
<br>
## Exploratory Data Visualization
**Exploratory Data Visualization** or **EDV** is critical to exploratory analyses.
* Allows "quick and dirty" visualizations of your new data's variables
* Used internally to benefit yourself, collaborators, or specialized audiences
* Assists analysts in decoding and identifying patterns and anomalies in new data
<br>
<br>
### Common Exploratory Visualization Techniques
Several functions exist for exploring data visually in base R.
<br>
**Histograms:** Quickly view the distribution of quantitative variables with `hist()`.
* Histograms are univariate and show the frequency of a range of numeric values
* Increase their resolution by increasing the number of ranges (`breaks =`)
```{r}
hist(parks$Attendance,        # Specify a single variable
breaks = 100)            # Specify number of breaks and "bins"
```
<br>
**Box Plots:** View several distributions across categorical variables with `boxplot()`.
* The beginning and end of boxplots represent the first and third quartiles, resp.
* The width of the box, itself, is the **Interquartile Range**, or **IQR**
* The middle of each boxplot represents the median (50%)
* "Whiskers" extend to the most extreme values within `1.5 * IQR` of the box
* Outliers are demarcated beyond whiskers
* Both variables are separated with `~`
```{r}
boxplot(parks$Attendance ~ parks$Region)
```
<br>
**Scatter Plots:** View relationships between quantitative variables with `plot()`.
Since `parks` only contains one quantitative variable, we use `economics` from **ggplot2**.
```{r}
library(ggplot2)
plot(x = economics$uempmed,     # Median duration of unemployment, in weeks
y = economics$unemploy)    # Number of unemployed, in thousands
```
<br>
**Pairs Plots:** Pairs plots create a matrix of small multiples for each variable.
* Small multiples allow multiple side-by-side comparisons of plots on common axes
* Depending on the **class** of each variable, different plot methods are used
Again, for want of class **numeric** variables, we use `economics` from **ggplot2**.
```{r}
library(ggplot2)
pairs(x = economics)
```
<br>
**Model Summaries:** Function `plot()`, used with a model, produces four summary plots.
* By adjusting the global graphics parameters of base R, we can print all four
* In function `par()`, specify total rows and columns in function `c()`
* Argument `mfrow =` accepts these two values in function `par()`
```{r}
# Fit a linear model of attendance on year and region, then draw the
# diagnostic plots produced by `plot()` in a 2 x 2 grid.
model <- lm(Attendance ~ Year + Region,
            data = parks)                   # Create linear model: "model"
old_par <- par(mfrow = c(2, 2))             # Specify dimensions in `par()`, saving prior settings
plot(model)                                 # Call `plot()` on model
par(old_par)                                # Restore prior graphics settings for later plots
```
<br>
**Advanced Pairs Plots:** Use package **ggplot2** extension **GGally** and `ggpairs()`.
As a more colorful example, we'll use base R dataset `iris`.
```{r eval=F}
library(ggplot2)
library(GGally)                     # Load packages
ggpairs(iris,                       # Specify dataset
aes(color = Species)) +     # Map colors to variable "Species"
theme_minimal()                   # Preset theme cleans output
```
```{r include=F, cache=T}
# Build the pairs plot once in a cached, hidden chunk. The object is named
# `iris_pairs` rather than `plot` so it does not shadow base R's `plot()`.
library(ggplot2)
library(GGally)
iris_pairs <- ggpairs(iris,
                      aes(color = Species)) +
  theme_minimal()
```
```{r echo=F}
# Print the plot object built in the previous chunk without echoing code.
library(ggplot2)
library(GGally)
iris_pairs
```
<br>
<br>
## Descriptive Statistics
**Descriptive** or **Summary Statistics** concisely *describe* datasets or individual variables with summary information, e.g. mean, median, mode, minimum value, maximum value, variance, and more.
While **descriptive statistics** can be the be-all and end-all of a descriptive analysis, they're also integral to **exploratory data analysis**.
<br>
<br>
### Common Functions for Descriptive Statistics
Again, base R has no shortage of functions for **descriptive** or **summary statistics**.
<br>
**Averages:** The average or **mean** value of quantitative data is calculated with `mean()`.
```{r}
mean(parks$Attendance)    # Mean of variable "Attendance"
```
mean(parks$Attendance)
mode(parks$Attendance)
mode(parks)
max(parks$Attendance)
min(parks$Attendance)
var(parks$Attendance)
summary(parks$Attendance)
as.data.frame(summary(parks$Attendance))
data.frame(summary(parks$Attendance))
data.frame(summary(parks$Attendance), row.names = c("1", "2", "3", "4", "5", "6"))
quantile(parks$Attendance)
quantile(x = parks$Attendance, type = 4)
quantile(x = parks$Attendance, type = 8)
quantile(parks$Attendance)
data.frame(summary(parks))
data.frame(parks)
sumstats[, 2:3]
sumstats <- summary(parks)          # Assign summary() output: "sumstats"
sumstats <- data.frame(sumstats)    # Coerce to data frame
sumstats[, 2:3]
?ftable
ftable(x = parks$Region)
data.frame(ftable(x = parks$Region))
prop.table(ftable(x = parks$Region))
data.frame(prop.table(ftable(x = parks$Region)))
ftable(parks$Region)
region_freq <- ftable(parks$Region)
reg_freq <- ftable(parks$Region)
data.frame(reg_freq)
ftable(x = parks, row.vars = "Region", col.vars = c("Year", "Attendance"))
ftable(row.vars = parks$Region, col.vars = c("Year", "Attendance"))
ftable(row.vars = parks$Region, col.vars = parks[, c(1,5)])
ftable(row.vars = parks[, 2], col.vars = parks[, c(1,5)])
ftable(parks, row.vars = parks[, 2], col.vars = parks[, c(1,5)])
table(parks, row.vars = parks[, 2], col.vars = parks[, c(1,5)])
table(parks, row.vars = parks[, 2], col.vars = parks[, 1])
table(c = parks, row.vars = parks[, 2], col.vars = parks[, 1])
table(c = parks$Region, row.vars = parks[, 2], col.vars = parks[, 1])
table(c = parks$Region, row.vars = 2], col.vars = 1)
table(c = parks$Region, row.vars = 2, col.vars = 1)
table(c = parks$Region, row.vars = c(2,3), col.vars = 1)
table(c = parks$Region, row.vars = c(2,3), col.vars = c(1))
table(x = parks$Region, row.vars = c(2,3), col.vars = c(1))
table(x = parks$Region, row.vars = c(2,3), col.vars = 1)
names(parks)
table(x = parks$Region, row.vars = c(1,5), col.vars = 2)
table(x = parks, row.vars = c(1,5), col.vars = 2)
table(x = parks, row.vars = parks[, c(1,5)], col.vars = parks[, 2])
ftable(x = parks, row.vars = parks[, c(1,5)], col.vars = parks[, 2])
ftable(x = parks, row.vars = parks[, c(1,5)])
ftable(x = parks, row.vars = parks[, c(5)], col.vars = parks[, 2])
ftable(x = parks, row.vars = parks[, 5], col.vars = parks[, 2])
<- ftable(parks$Region)
ftable(Titanic, row.vars = 1:3)
ftable(Titanic, row.vars = 1:2, col.vars = "Survived")
ftable(Titanic, row.vars = 2:1, col.vars = "Survived")
ftable(parks, row.vars = c(1, 5))
ftable(parks, row.vars = c(1))
ftable(parks, row.vars = c(5))
parks
ftable(parks, row.vars = 2)
ftable(parks, row.vars = 2, col.vars = 5)
ftable(parks, row.vars = 1:2)
parks
unique(parks$Facility)
parks %>%
select(Year, Region, Attendance)
select(Year, Region, Attendance)
parks %>%  select(Year, Region, Attendance)
parks %>%                                 # Invoke "parks"
group_by(Year) %>%                      # Group by "Year"
summarize(Average = mean(Attendance))   # Create "Average" with `mean()`
parks %>%                                 # Invoke "parks"
group_by(Year) %>%                      # Group by "Year"
summarize(Average = mean(Attendance))   # Create "Average" with `mean()`
parks %>%
group_by(Year) %>%
summarize(Mean = mean(Attendance),
Median = median(Attendance),
Maximum = max(Attendance),
Records = n(),)
parks %>%
group_by(Year) %>%
summarize(Mean = mean(Attendance),
Median = median(Attendance),
Maximum = max(Attendance),
Records = n())
parks %>%
group_by(Year) %>%
summarize(Mean = mean(Attendance),
Median = median(Attendance),
Maximum = max(Attendance),
Records = n() / nrow(parks))
parks %>%
group_by(Year) %>%
summarize(Mean = mean(Attendance),
Median = median(Attendance),
Maximum = max(Attendance),
`Proportion of Records` = n() / nrow(parks))
parks %>%
group_by(Year) %>%
summarize(Mean = mean(Attendance),
Median = median(Attendance),
Maximum = max(Attendance),
Records = n())
parks %>%
group_by(Year, Region) %>%              # Group on variables "Year", "Region"
summarize(Mean = mean(Attendance),
Median = median(Attendance))
parks %>%
group_by(Year) %>%
summarize(Mean = mean(Attendance),
Total = sum(Attendance),
Proportion = Total / sum(parks$Attendance))
t <- parks %>%
group_by(Year) %>%
summarize(Mean = mean(Attendance),
Total = sum(Attendance),
Proportion = Total / sum(parks$Attendance))
sum(t$Proportion)
install.packages("tutorial")
install.packages("skimr")
install.packages("pander")
knitr::opts_chunk$set(echo = TRUE, message=F, warning=F)
library( dplyr )
library( pander )
# Randomly arrange two goats and one car.
#
# Takes no arguments. Returns a character vector of length 3 containing
# "goat", "goat" and "car" in random order.
createGame <- function() {
  # Sampling all three values without replacement yields a permutation of
  # the vector. `FALSE` is spelled out because `F` is an ordinary variable
  # that can be reassigned.
  sample(x = c("goat", "goat", "car"), size = 3, replace = FALSE)
}
# try three times to see randomization
createGame()
createGame()
createGame()