04_Group_1_Final_Report.Rmd

---
title: "Group 1 Covid"
author: "Shoshana Farber and Christopher Stewart"
date: "6/23/2022"
output:
  html_document: default
  pdf_document: default
---

```{r setup, include=FALSE}
# Packages used throughout the report.
library(tidycensus)
library(tidyverse)
library(tigris)
library(sf)
library(dplyr)
library(modelr)
library(scales)

# Cache shapefiles downloaded by tigris. This has to be a separate call:
# written as the second argument of library(), options(...) is bound to the
# `help` parameter, which library() does not evaluate when a package name is
# supplied, so the option was never actually set.
options(tigris_use_cache = TRUE)

# install.packages("sf")
```

# Load the ACS Data

```{r}
# Restore the objects saved in the RData file; the code below expects it to
# provide `master`.
load("Data/master.RData")

# Drop GEOID 11005, which is treated as an outlier zipcode.
master <- filter(master, GEOID != 11005)
```

# Figure 1

### Proportion of 18-64 Year Olds Uninsured

```{r}
# One row per area with the share of 18-64 year olds who are uninsured:
# the 18-34 and 35-64 brackets are summed for both the totals and the
# uninsured counts before dividing.
uninsured_18to64 <- master %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  transmute(
    GEOID, Borough, NAME, geometry,
    total = total_18to34 + total_35to64,
    uninsured = uninsured_18to34 + uninsured_35to64,
    prop = uninsured / total
  )

# Median share across areas, ignoring missing values.
uninsured_18to64 %>%
  pull(prop) %>%
  median(na.rm = TRUE)
```

```{r}
# Choropleth of the uninsured share, filled by `prop`.
ggplot(data = uninsured_18to64) +
  geom_sf(mapping = aes(geometry = geometry, fill = prop)) +
  scale_fill_distiller(direction = 1, palette = "YlOrRd") +
  labs(title = "Proportion of 18 to 64 year olds uninsured") +
  theme_void()
```

### Median Income

```{r}
# Keep only the median-income rows and add the estimate divided by one
# million as `income_mill`.
income <- master %>%
  filter(variable == "median_income") %>%
  mutate(income_mill = estimate / 1e6)

# Median of the unscaled income estimates, ignoring missing values.
income %>%
  pull(estimate) %>%
  median(na.rm = TRUE)
```

```{r}
# Choropleth of median income, filled by `income_mill`.
ggplot(data = income) +
  geom_sf(mapping = aes(geometry = geometry, fill = income_mill)) +
  scale_fill_distiller(direction = 1, palette = "YlGn") +
  labs(title = "Median income (in millions)") +
  theme_void()
```

### Proportion of Population Self-Identifying as White

```{r}
# One row per area with the share of the population identifying as white
# (identify_white divided by total_race).
white <- master %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  transmute(
    GEOID, Borough, NAME, geometry,
    total = total_race,
    white = identify_white,
    prop = white / total
  )

# Median share across areas, ignoring missing values.
white %>%
  pull(prop) %>%
  median(na.rm = TRUE)
```

```{r}
# Choropleth of the share identifying as white, filled by `prop`.
ggplot(data = white) +
  geom_sf(mapping = aes(geometry = geometry, fill = prop)) +
  scale_fill_distiller(direction = 1, palette = "Purples") +
  labs(title = "Proportion of people who self-identify as white") +
  theme_void()
```

### Proportion in Households of 4 or More

```{r}
# One row per area with the share of households that have four or more
# members: family and non-family household counts of size 4, 5, 6 and 7+
# are summed and divided by the household total.
household <- master %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  transmute(
    GEOID, Borough, NAME, geometry,
    total = total_household,
    four_plus = fam_four_household + fam_five_household +
      fam_six_household + fam_seven_plus_household +
      nonfam_four_household + nonfam_five_household +
      nonfam_six_household + nonfam_seven_plus_household,
    prop = four_plus / total
  )

# Median share across areas, ignoring missing values.
household %>%
  pull(prop) %>%
  median(na.rm = TRUE)
```

```{r}
# Choropleth of the 4+ person household share, filled by `prop`.
ggplot(data = household) +
  geom_sf(mapping = aes(geometry = geometry, fill = prop)) +
  scale_fill_distiller(direction = 1, palette = "YlOrRd") +
  labs(title = "Proportion in household of 4 or more") +
  theme_void()
```

### Proportion that Commutes by Bus

```{r}
# One row per area with the share of commuters who travel by bus
# (bus divided by total_transport).
bus <- master %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  transmute(
    GEOID, Borough, NAME, geometry,
    total = total_transport,
    bus,
    prop = bus / total
  )

# Median share across areas, ignoring missing values.
bus %>%
  pull(prop) %>%
  median(na.rm = TRUE)
```

```{r}
# Choropleth of the bus-commuter share, filled by `prop`.
ggplot(data = bus) +
  geom_sf(mapping = aes(geometry = geometry, fill = prop)) +
  scale_fill_distiller(direction = 1, palette = "YlOrRd") +
  labs(title = "Proportion of population that commuted by bus") +
  theme_void()
```

### Proportion 65 and Older

```{r}
# One row per area with the share of the population aged 65 and over:
# the male and female age brackets from 65-66 up to 85+ are summed and
# divided by total_age.
elderly <- master %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  transmute(
    GEOID, Borough, NAME, geometry,
    total = total_age,
    elderly = male_65to66 + male_67to69 + male_70to74 + male_75to79 +
      male_80to84 + male_85plus +
      female_65to66 + female_67to69 + female_70to74 + female_75to79 +
      female_80to84 + female_85plus,
    prop = elderly / total
  ) %>%
  # Keeps only rows whose share is below 0.8; rows with a missing share are
  # dropped by filter() as well.
  filter(prop < 0.8)

# Median share across the remaining areas.
elderly %>%
  pull(prop) %>%
  median(na.rm = TRUE)
```

```{r}
# Choropleth of the 65+ share, filled by `prop`.
ggplot(data = elderly) +
  geom_sf(mapping = aes(geometry = geometry, fill = prop)) +
  scale_fill_distiller(direction = 1, palette = "YlOrRd") +
  labs(title = "Proportion of population 65+ years in age") +
  theme_void()
```

# Figure 2

### Loading the Data

```{r}
# Case counts: add a character GEOID built from MODZCTA (so it can be joined
# to the other tables) and the positivity share Positive / Total.
april_1_cases <- read_csv("Data/april-1-cases.csv", show_col_types = FALSE) %>%
  mutate(
    GEOID = as.character(MODZCTA),
    prop_pos = Positive / Total
  )

# Restores the saved objects; the code below expects a `safegraph` table.
load('/data/safegraph/safegraph.Rdata')

# Zip code list, keyed by a character GEOID built from ZipCode.
zipcodes <- read_csv("Data/nyc-zip-codes.csv") %>%
  mutate(GEOID = as.character(ZipCode))

# Restrict the safegraph rows to postal codes present in the zip code list.
mobility <- safegraph %>%
  mutate(GEOID = as.character(postal_code)) %>%
  inner_join(zipcodes, by = "GEOID")

# NOTE: despite its name, this holds the 2020-03-01 to 2020-04-30 window.
feb_mobility <- mobility %>%
  filter(date >= "2020-03-01", date <= "2020-04-30")
```

### February Baseline for Mobility

```{r}
# Baseline mobility per postal code: the median of avg_visits_per_day over
# February 2020. The upper bound is written as "before March 1" so that the
# leap day (2020-02-29) is part of the baseline; the previous
# `<= "2020-02-28"` bound left it out.
median_daily_visits <- safegraph %>%
  filter(date >= "2020-02-01" & date < "2020-03-01") %>%
  group_by(postal_code) %>%
  summarize(median_visits = median(avg_visits_per_day))
```

### Change in Mobility

```{r}
# Relative change of each daily observation against its postal code's
# baseline median: (observed - baseline) / baseline.
change_in_mobility <- feb_mobility %>%
  left_join(median_daily_visits, by = "postal_code") %>%
  mutate(
    change_in_mobility = (avg_visits_per_day - median_visits) / median_visits
  )

# Per-date median and quartiles. These are computed before the range filter
# below, so they are based on all observations.
median_change_in_mobility <- change_in_mobility %>%
  group_by(date) %>%
  summarise(
    median_change_in_mobility = median(change_in_mobility, na.rm = TRUE),
    Q1 = quantile(change_in_mobility, probs = 0.25, na.rm = TRUE),
    Q3 = quantile(change_in_mobility, probs = 0.75, na.rm = TRUE)
  )

# Keep only observations whose relative change is within +/- 1.25
# (exclusive); rows with a missing change are dropped too.
change_in_mobility <- change_in_mobility %>%
  filter(abs(change_in_mobility) < 1.25)
```

### Plotting Change in Mobility

```{r}
# One horizontal violin per date for the observations below 1, overlaid with
# the per-date median and Q1-Q3 range taken from median_change_in_mobility.
change_in_mobility %>%
  filter(change_in_mobility < 1) %>%
  ggplot() +
  geom_violin(
    mapping = aes(x = change_in_mobility, y = as.factor(date)),
    color = "orange"
  ) +
  geom_pointrange(
    mapping = aes(
      x = median_change_in_mobility,
      xmin = Q1,
      xmax = Q3,
      y = as.factor(date)
    ),
    data = median_change_in_mobility,
    color = "red"
  ) +
  labs(x = "Change in mobility relative to baseline", y = "Date") +
  xlim(-1, 1.25)
```

# April 1, 2020 Cases

# Individual Regressions

### 4 Plus Person Household

```{r}
# Pair each area's 4+ person household share with its positivity share.
april_1_cases_household <- household %>%
  inner_join(april_1_cases, by = "GEOID") %>%
  transmute(GEOID, Borough, prop_four_plus = prop, prop_pos)

# Simple linear regression of positivity on the household share.
household_model <- lm(
  prop_pos ~ prop_four_plus,
  data = april_1_cases_household
)

# Attach fitted values (column `pred`) and keep the model's R-squared.
april_1_cases_household <- add_predictions(
  april_1_cases_household,
  household_model
)

household_r_square <- summary(household_model)[["r.squared"]]
```

### Uninsured 18 to 64

```{r}
# Pair each area's uninsured share with its positivity share.
april_1_cases_uninsured <- uninsured_18to64 %>%
  inner_join(april_1_cases, by = "GEOID") %>%
  transmute(GEOID, Borough, prop_uninsured = prop, prop_pos)

# Simple linear regression of positivity on the uninsured share.
uninsured_model <- lm(
  prop_pos ~ prop_uninsured,
  data = april_1_cases_uninsured
)

# Attach fitted values (column `pred`) and keep the model's R-squared.
april_1_cases_uninsured <- add_predictions(
  april_1_cases_uninsured,
  uninsured_model
)

uninsured_r_square <- summary(uninsured_model)[["r.squared"]]
```

### Self-Identifying as White

```{r}
# Pair each area's share identifying as white with its positivity share.
april_1_cases_white <- white %>%
  inner_join(april_1_cases, by = "GEOID") %>%
  transmute(GEOID, Borough, prop_white = prop, prop_pos)

# Simple linear regression of positivity on the white share.
white_model <- lm(prop_pos ~ prop_white, data = april_1_cases_white)

# Attach fitted values (column `pred`) and keep the model's R-squared.
april_1_cases_white <- add_predictions(april_1_cases_white, white_model)

white_r_square <- summary(white_model)[["r.squared"]]
```

### Median Income

```{r}
# Pair each area's median income (in millions) with its positivity share.
april_1_cases_income <- income %>%
  inner_join(april_1_cases, by = "GEOID") %>%
  transmute(GEOID, Borough, income_mill, prop_pos)

# Simple linear regression of positivity on median income.
income_model <- lm(prop_pos ~ income_mill, data = april_1_cases_income)

# Attach fitted values (column `pred`) and keep the model's R-squared.
april_1_cases_income <- add_predictions(april_1_cases_income, income_model)

income_r_square <- summary(income_model)[["r.squared"]]
```

### Mobility

```{r}
# Mobility change observed on 2020-03-23, keyed by a character GEOID built
# from postal_code.
mar_23_mobility <- change_in_mobility %>%
  filter(date == "2020-03-23") %>%
  mutate(GEOID = as.character(postal_code))

# Pair that day's mobility change with each area's positivity share.
april_1_cases_mobility <- mar_23_mobility %>%
  inner_join(april_1_cases, by = "GEOID") %>%
  transmute(GEOID, date, prop_pos, change_in_mobility)

# Simple linear regression of positivity on the mobility change.
mobility_model <- lm(
  prop_pos ~ change_in_mobility,
  data = april_1_cases_mobility
)

# Attach fitted values (column `pred`) and keep the model's R-squared.
april_1_cases_mobility <- add_predictions(
  april_1_cases_mobility,
  mobility_model
)

mobility_r_square <- summary(mobility_model)[["r.squared"]]
```

### Using Bus for Commute

```{r}
# Pair each area's bus-commuter share with its positivity share.
april_1_cases_bus <- bus %>%
  inner_join(april_1_cases, by = "GEOID") %>%
  transmute(GEOID, Borough, prop_bus = prop, prop_pos)

# Simple linear regression of positivity on the bus-commuter share.
bus_model <- lm(prop_pos ~ prop_bus, data = april_1_cases_bus)

# Attach fitted values (column `pred`) and keep the model's R-squared.
april_1_cases_bus <- add_predictions(april_1_cases_bus, bus_model)

bus_r_square <- summary(bus_model)[["r.squared"]]
```

### Elderly

```{r}
# Pair each area's 65+ share with its positivity share.
april_1_cases_elderly <- elderly %>%
  inner_join(april_1_cases, by = "GEOID") %>%
  transmute(GEOID, Borough, prop_elderly = prop, prop_pos)

# Simple linear regression of positivity on the 65+ share.
elderly_model <- lm(prop_pos ~ prop_elderly, data = april_1_cases_elderly)

# Attach fitted values (column `pred`) and keep the model's R-squared.
april_1_cases_elderly <- add_predictions(
  april_1_cases_elderly,
  elderly_model
)

elderly_r_square <- summary(elderly_model)[["r.squared"]]
```

## Comparing R^2^ Values

|Variable|Published| Our Estimate|
|:--:|:--:|:--:|
|Household|41%|`r round(household_r_square * 100, 0) `%|
|Uninsured|38%|`r round(uninsured_r_square * 100, 0) `%|
|White|34%|`r round(white_r_square * 100, 0) `%|
|Income|32%|`r round(income_r_square * 100, 0) `%|
|Mobility|19%|`r round(mobility_r_square * 100, 0) `%|
|Bus|13%|`r round(bus_r_square * 100, 0) `%|
|Elderly|3%|`r round(elderly_r_square * 100, 0) `%|

# Table 1 Multi-Variable Regression

### Joining the Tables

```{r}
# Build the four-predictor table by joining the per-variable tables on GEOID.
# After every join the columns are trimmed back to the predictors collected
# so far, so the Borough / prop_pos / pred columns each table carries do not
# collide; Borough, prop_pos and geometry are taken from the income table.
april_1_cases_table1 <- april_1_cases_household %>%
  select(GEOID, prop_four_plus) %>%
  inner_join(april_1_cases_uninsured, by = "GEOID") %>%
  select(GEOID, prop_four_plus, prop_uninsured) %>%
  inner_join(april_1_cases_white, by = "GEOID") %>%
  select(GEOID, prop_four_plus, prop_uninsured, prop_white) %>%
  inner_join(april_1_cases_income, by = "GEOID") %>%
  select(
    GEOID, Borough,
    prop_four_plus, prop_uninsured, prop_white, income_mill,
    prop_pos, geometry
  )
```

```{r}
# Build the seven-predictor table by joining the per-variable tables on
# GEOID. After every join the columns are trimmed back to the predictors
# collected so far (plus geometry once the income table has supplied it), so
# the Borough / prop_pos / pred columns each table carries do not collide.
# prop_pos is taken from the last table joined (mobility).
april_1_cases_table2 <- april_1_cases_household %>%
  select(GEOID, prop_four_plus) %>%
  inner_join(april_1_cases_uninsured, by = "GEOID") %>%
  select(GEOID, prop_four_plus, prop_uninsured) %>%
  inner_join(april_1_cases_white, by = "GEOID") %>%
  select(GEOID, prop_four_plus, prop_uninsured, prop_white) %>%
  inner_join(april_1_cases_income, by = "GEOID") %>%
  select(
    GEOID, prop_four_plus, prop_uninsured, prop_white, income_mill, geometry
  ) %>%
  inner_join(april_1_cases_elderly, by = "GEOID") %>%
  select(
    GEOID, prop_four_plus, prop_uninsured, prop_white, income_mill,
    prop_elderly, geometry
  ) %>%
  inner_join(april_1_cases_bus, by = "GEOID") %>%
  select(
    GEOID, prop_four_plus, prop_uninsured, prop_white, income_mill,
    prop_elderly, prop_bus, geometry
  ) %>%
  inner_join(april_1_cases_mobility, by = "GEOID") %>%
  select(
    GEOID, prop_four_plus, prop_uninsured, prop_white, income_mill,
    prop_pos, prop_elderly, prop_bus, change_in_mobility, geometry
  )
```

### Multi-Variable Regression - Table 1

```{r}
# Regress positivity on household size, uninsured share, white share and
# median income together.
multi_model <- lm(
  prop_pos ~ prop_four_plus + prop_uninsured + prop_white + income_mill,
  data = april_1_cases_table1
)

# Attach fitted values (column `pred`).
april_1_cases_table1 <- add_predictions(april_1_cases_table1, multi_model)

# Print the coefficient table, then keep the R-squared for the comparison.
summary(multi_model)

multi_r_square <- summary(multi_model)[["r.squared"]]
```

### Multi-Variable Regression - Table 2

```{r}
# Six-predictor model (no mobility term).
multi_model_2 <- lm(prop_pos ~ prop_elderly + prop_bus + income_mill + prop_white + prop_uninsured + prop_four_plus, april_1_cases_table2)

# Store these fitted values under their own column name. Both
# add_predictions() calls in this chunk previously wrote to the default
# `pred` column, so the second call silently overwrote this model's
# predictions.
april_1_cases_table2 <- april_1_cases_table2 %>%
  add_predictions(multi_model_2, var = "pred_no_mobility")

summary(multi_model_2)

multi2_r_square <- summary(multi_model_2)$r.squared

# Same model with the mobility change added as a seventh predictor.
multi_model_mobility <- lm(prop_pos ~ prop_elderly + prop_bus + income_mill + prop_white + prop_uninsured + prop_four_plus + change_in_mobility, april_1_cases_table2)

# Fitted values of the mobility model stay in `pred`, exactly as before.
april_1_cases_table2 <- april_1_cases_table2 %>%
  add_predictions(multi_model_mobility)

summary(multi_model_mobility)

multi_mobility_r_square <- summary(multi_model_mobility)$r.squared
```

## Comparing R^2^ Values

|Variable|Published| Our Estimate|
|:--:|:--:|:--:|
|Multi Table 1|56%|`r round(multi_r_square * 100, 0) `%|
|Multi Table 2|57%|`r round(multi2_r_square * 100, 0) `%|
|Multi Table 2 + Mobility|57%|`r round(multi_mobility_r_square * 100, 0) `%|

# Extension

### Influence of Proportion of People who Take Public Transportation on COVID Positivity

```{r}
# One row per area with the share of commuters using public transport
# (total_public divided by total_transport).
public <- master %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  transmute(
    GEOID, Borough, NAME, geometry,
    public = total_public,
    total = total_transport,
    prop_public = public / total
  )

# Median share across areas, ignoring missing values.
public %>%
  pull(prop_public) %>%
  median(na.rm = TRUE)
```

```{r}
# Choropleth of the public-transport commuter share, filled by `prop_public`.
ggplot(data = public) +
  geom_sf(mapping = aes(geometry = geometry, fill = prop_public)) +
  scale_fill_distiller(direction = 1, palette = "RdPu") +
  labs(title = "Proportion of Public Commuters") +
  theme_void()
```

### Graphing Positivity Based on Zip

```{r}
# Collapse master to one row per GEOID (with a row count); the geometry
# column of the result is used below to draw the map.
zips_with_geom <- master %>%
  group_by(GEOID) %>%
  summarise(count = n())

# Attach geometry to the case counts. This overwrites `april_1_cases` and,
# because of the inner join, keeps only GEOIDs that also appear in master.
april_1_cases <- april_1_cases %>%
  inner_join(zips_with_geom, by = "GEOID") %>%
  select(GEOID, Positive, Total, prop_pos, geometry)

# Choropleth of the positivity share, filled by `prop_pos`.
ggplot(data = april_1_cases) +
  geom_sf(mapping = aes(geometry = geometry, fill = prop_pos)) +
  scale_fill_distiller(direction = 1, palette = "RdPu") +
  labs(title = "Proportion of Covid Positivity") +
  theme_void()
```

The biggest proportion of positivity is in Corona, Queens. This makes sense, as Corona is the most populous zipcode. 

There is also greater positivity in areas of Queens near JFK airport, as these areas are also greatly populated. 

We would expect to see greater positivity in Manhattan based on the proportion of public transport. However, it's possible many people in Manhattan went away during COVID, or the nature of job in Manhttan, such as those in finance and law, etc., are easier to work from home as opposed to those in lower income communities. These positions may also not be seen as "essential" in the same way as those of people in other zipcodes. Further analysis for this can be done if there is data on the proportion of people in these areas working from home during April. 

# Regression Analysis

```{r}
# Pair each area's public-transport commuter share with its positivity share.
april_1_cases_public <- public %>%
  inner_join(april_1_cases, by = "GEOID") %>%
  transmute(GEOID, Borough, prop_public, prop_pos)

# Simple linear regression of positivity on the public-transport share.
public_model <- lm(prop_pos ~ prop_public, data = april_1_cases_public)

# Attach fitted values (column `pred`) and keep the model's R-squared.
april_1_cases_public <- add_predictions(april_1_cases_public, public_model)

public_r_square <- summary(public_model)[["r.squared"]]
```

### Comparing Bus to All Public Transport

|Bus|Public|
|:--:|:--:|
|`r round(bus_r_square * 100, 0) `%|`r round(public_r_square * 100, 0) `%|

The R^2^ value for all public transport is much smaller than the R^2^ for bus commuters. This would probably not be used in a multi-variate regression.