datacollection.rmd

---
title: "Research Assistantship - Data Collection"
author: "Federico Vicentini"
date: "01/05/2023"
output: pdf_document
bibliography: references.bib
header-includes:
  - \usepackage{longtable}
---


```{r setup, include=FALSE}
library(knitr)
knitr::opts_chunk$set(echo = FALSE)
options(warn = -1)
```

```{r p0, message=FALSE, echo=FALSE}

# Clear the variables
rm(list = ls())


# Install packages
packages <- c("eurostat", "stargazer", "fredr", "kableExtra",
              "here", "knitr", "tinytex", "rmarkdown", "xfun", "pandoc",
              "wbstats", "Rilostat", "sandwich", "dplyr", "urca", "lmtest",
              "haven")
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if (length(new_packages)) install.packages(new_packages, repos="https://cran.mirror.garr.it/CRAN/")
invisible(lapply(packages, library, character.only = TRUE))

# Set the working directory to source file location with
# setwd(here::here())
```

# DATA COLLECTION

## Real GDP Data

\par In this first part, we download nominal GDP data and GDP deflator data from 1980 
to 2020 and then we divide it into the 2 section (1980s and 2010s) in order to calculate
means and thus compare it to the data we got in the TTD Presentation.


```{r p1, message=FALSE, echo=FALSE}
library(fredr)
fredr_set_key("5946a6a1c79f3fe49bea4be0ef8e82e8")
gdp = fredr(
  series_id = "GDP",
  observation_start = as.Date("1981-01-01"),
  observation_end = as.Date("2020-12-31"),
  frequency = "a",
  units = "pc1",
  aggregation_method = "eop"
)

gdp1 = fredr(
  series_id = "GDP",
  observation_start = as.Date("1981-01-01"),
  observation_end = as.Date("1990-12-31"),
  frequency = "a",
  units = "pc1",
  aggregation_method = "eop"
)

gdp2 = fredr(
  series_id = "GDP",
  observation_start = as.Date("2011-01-01"),
  observation_end = as.Date("2019-12-31"),
  frequency = "a",
  units = "pc1",
  aggregation_method = "eop"
)

gdp = data.frame(gdp)
gdp1 = data.frame(gdp1)
gdp2 = data.frame(gdp2)

deflator = fredr(
  series_id = "A191RI1Q225SBEA",
  observation_start = as.Date("1981-01-01"),
  observation_end = as.Date("2020-12-31"),
  frequency = "a",
  units="lin",
  aggregation_method = "eop"
)

deflator1 = fredr(
  series_id = "A191RI1Q225SBEA",
  observation_start = as.Date("1981-01-01"),
  observation_end = as.Date("1990-12-31"),
  frequency = "a",
  units="lin",
  aggregation_method = "eop"
)

deflator2 = fredr(
  series_id = "A191RI1Q225SBEA",
  observation_start = as.Date("2011-01-01"),
  observation_end = as.Date("2019-12-31"),
  frequency = "a",
  units="lin",
  aggregation_method = "eop"
)

deflator = data.frame(deflator)
deflator1 = data.frame(deflator1)
deflator2 = data.frame(deflator2)

realgdp = gdp
realgdp$value = realgdp$value - deflator$value
#mean(realgdp$value)

realgdp1 = gdp1
realgdp1$value = realgdp1$value - deflator1$value
mean80s = mean(realgdp1$value)

realgdp2 = gdp2
realgdp2$value = realgdp2$value - deflator2$value
mean10s=mean(realgdp2$value)


```

```{r p1.1, message=FALSE}
plot(realgdp$date, realgdp$value, type="l",
      xlab="Time", ylab= "Real GDP growth")
plot(realgdp1$date, realgdp1$value, type="l",
      xlab="Time", ylab= "Real GDP growth")
plot(realgdp2$date, realgdp2$value, type="l",
      xlab="Time", ylab= "Real GDP growth")
```

## Labor Share of Output

### "Naive" Labor Share (LS1)

First attempt here is to download the laborshare timeseries from fred.
You can see from the values of the plots that the level of the labor share is 
underestimated if we use LS1. In fact, this approach neglects some important 
aspects, such as the role of self-employed workers and correction to value added 
in the forms of indirect taxes and consumption of fixed capital.

```{r p2.0}
labshare = fredr(
  series_id = "LABSHPUSA156NRUG",
  observation_start = as.Date("1981-01-01"),
  observation_end = as.Date("2020-12-31"),
  frequency = "a"
)

labshare1 = fredr(
  series_id = "LABSHPUSA156NRUG",
  observation_start = as.Date("1981-01-01"),
  observation_end = as.Date("1990-12-31"),
  frequency = "a"
)

labshare2 = fredr(
  series_id = "LABSHPUSA156NRUG",
  observation_start = as.Date("2011-01-01"),
  observation_end = as.Date("2019-12-31"),
  frequency = "a"
)


labshare = data.frame(labshare)

labshare1 = data.frame(labshare1)

labshare2 = data.frame(labshare2)


plot(labshare$date, labshare$value, type="l",
      xlab="Time", ylab= "LS1")
plot(labshare1$date,labshare1$value, type="l",
      xlab="Time", ylab= "LS1")
plot(labshare2$date,labshare2$value, type= "l",
      xlab="Time", ylab= "LS1")

```


### Guerriero Index

This is precisely why we tried to replicate the laborshare measure provided by 
{@guerriero2019}, defined as:

$$LS6 = \frac{compensation\;of\;employees * 
\left(\frac{workforce-employers}{employees}\right)}
{value\;added-ind.\;taxes-fixed\;cap.\;cons.}$$


Data is retrieved from Ilostat R package API and (@dataunorg)


```{r p2.1, message=FALSE, echo=FALSE}
#API key for bls is: 7b39505b098044f886d056883d2bd925
#library(blsAPI)

#s=wb_search("compensation of employees")
comp=wb_data(country="US","GC.XPN.COMP.CN")

#s=fredr_series_search_text("compensation")
#View(s)

labforce1=read.csv("civilianlaborforce.csv", sep=",")
labforce1=subset(labforce1, labforce1$Period=="M12")

nomgdp = fredr(
  series_id = "GDP",
  observation_start = as.Date("1981-01-01"),
  observation_end = as.Date("2020-12-31"),
  frequency = "a",
  units = "lin",
  aggregation_method = "eop"
)


#This should be the correct indicator to use for the labor force level
labforce=get_ilostat("EAP_TEAP_SEX_AGE_NB_A")
labforce=subset(labforce, labforce$indicator=="EAP_TEAP_SEX_AGE_NB"&
                          labforce$ref_area=="USA"&
                          labforce$classif1=="AGE_AGGREGATE_TOTAL"&
                          labforce$sex=="SEX_T"&
                          labforce$time>=1980)
toc=get_ilostat_toc()


# Assume your dataframe is called "my_df" and your time column is called "time"
# Sort the dataframe in ascending order of time
labforce <- labforce %>% arrange(labforce$time)
#plot(labforce1$Value-labforce$obs_value)

#Download and filter the dataset with all employed by status
database=get_ilostat("EMP_TEMP_SEX_AGE_STE_NB_A")


empl=subset(database, database$indicator=="EMP_TEMP_SEX_AGE_STE_NB"&
                  database$ref_area=="USA"&
                  database$classif1=="AGE_AGGREGATE_TOTAL"&
                  database$classif2 == "STE_AGGREGATE_TOTAL"&
                  database$sex=="SEX_T"&
                  database$time>=1980)

#To get only databaseoyees, filter for STE_ICSE93_1
emplee=subset(database, database$indicator=="EMP_TEMP_SEX_AGE_STE_NB"&
                      database$ref_area=="USA"&
                      database$classif1=="AGE_AGGREGATE_TOTAL"&
                      database$classif2 == "STE_ICSE93_1"&
                      database$sex=="SEX_T"&
                      database$time>=1980)

#Filter for STE_ICSE93_3
icse93_3=subset(database, database$indicator=="EMP_TEMP_SEX_AGE_STE_NB"&
                      database$ref_area=="USA"&
                      database$classif1=="AGE_AGGREGATE_TOTAL"&
                      database$classif2 == "STE_ICSE93_3"&
                      database$sex=="SEX_T"&
                      database$time>=1980)

#Filter for STE_ICSE93_5
icse93_5=subset(database, database$indicator=="EMP_TEMP_SEX_AGE_STE_NB"&
                      database$ref_area=="USA"&
                      database$classif1=="AGE_AGGREGATE_TOTAL"&
                      database$classif2 == "STE_ICSE93_5"&
                      database$sex=="SEX_T"&
                      database$time>=1980)

#Filter for STE_AGGREGATE_SLF
aggslf=subset(database, database$indicator=="EMP_TEMP_SEX_AGE_STE_NB"&
                      database$ref_area=="USA"&
                      database$classif1=="AGE_AGGREGATE_TOTAL"&
                      database$classif2 == "STE_AGGREGATE_SLF"&
                      database$sex=="SEX_T"&
                      database$time>=1980)


#empl = select(empl, time, obs_value)
#names(empl)[2]="empl"

emplee = select(emplee, time, obs_value)
names(emplee)[2]="emplee"

icse93_3 = select(icse93_3, time, obs_value)
names(icse93_3)[2]="icse93_3"

icse93_5 = select(icse93_5, time, obs_value)
names(icse93_5)[2]="icse93_5"

aggslf = select(aggslf, time, obs_value)
names(aggslf)[2]="aggslf"

labforce = select(labforce, time, obs_value)
names(labforce)[2]="labforce"

mergemp <- empl %>%
            merge(labforce, by = "time") %>%
            merge(emplee, by = "time") %>%
            merge(icse93_3, by = "time") %>%
            merge(icse93_5, by = "time") %>%
            merge(aggslf, by = "time")
            
toc=get_ilostat_toc()

# Calculate the number of employers as aggslf - icse93_3 - icse93_5
# this is under the assumption that icse93_4 is negligible
# in fact is around 8k workers for the US economy as a whole

#mergemp$emplrs = mergemp$aggslf - mergemp$icse93_3 - mergemp$icse93_5

# This is not a feasible way to do it, let's stick to LS5 indicator
# instead of LS6


```

For lack of available data on the structure of the labor force in the US
prior to the introduction of the ICSE93 standard, we will use LS5 instead of
the LS6 index, defined as:

$$LS6 = \frac{compensation\;of\;employees * 
\left(\frac{workforce}{employees}\right)}
{value\;added-ind.\;taxes-fixed\;cap.\;cons.}$$

Below you can find also an example of the result we would get by trying to 
compute LS6 too.

Note: since (@guerriero2019) says that "Data on the composition of the workforce is not 
always available for every year. When absent, it is assumed to be the same as in the 
previous year (Gollin 2002). This is a realistic assumption (Askenazy 2003), given 
that the composition of the workforce is relatively constant over time.", and since we
have data on the composition of the workforce from 1994 onward, we will assume that 
in years preceding 1994, workforce composition is the same. Below, you can find a plot
of workforce/employees for 1994 onward, where you can see that the figure is pretty stable.
However, instead of taking 1994 value and projecting backward, I found it better to 
make an average for the period 1994-2000 and then project that figure backward.

Note2: since there are 2 different time series for SNA data, coming from two different
accounting standards (SNA93 and SNA08), we tried 2 approaches. First, we tried to find
a conversion factor between those 2 standards (using first lags in order not to have 
spurious regressions), but this approach leads to a big jump in the data around the
cutoff year. Thus, the second method was just to keep the newer standard for years after
1995, and the old one for years prior. This is probably what @guerriero2019 did in the paper,
since the results look similar (albeit different since we use LS5 and not LS6). This
is the method I selected in the end.

```{r p2.12, message=FALSE}
# For lack of available data, we will use  LS5 and not LS6
#Now, calculate the multiplier of the compensation for every year

mergemp=mergemp[-c(28:29),]

mergemp$mult = (mergemp$labforce)/mergemp$emplee
mergemp$mult6 = (mergemp$labforce-mergemp$icse93_3)/mergemp$emplee

plot(mergemp$time,mergemp$mult,
  main = "Total Workforce / Number of Employees",
  typ="l",
  xlab="Time", 
  ylab="Multiplier")

# Now, add the other years by:

multtrain=mergemp$mult[c(1:7)]
simmult=mean(multtrain)
simmult=rep(simmult, 14)
mult=append(simmult, mergemp$mult)

mult6train=mergemp$mult6[c(1:7)]
simmult6=mean(mult6train)
simmult6=rep(simmult6, 14)
mult6=append(simmult6, mergemp$mult6)

#plot(mult)

#Now retrieve data from the un on aggregates
un = read.csv("un-sna-aggregates.txt", sep=";")
names(un)[6]="time"
un = un[order(un$time), ]


#Use the time period of the interregnum (1995-2011), and maybe cut in two (1995-2008)
#Regression with sna2008 as dependent var and sna1993 as regressor
#Coefficient will be our conversion factor.
#Evaluate the model, then check if it can replicate the other series
#and check for violations of assumptions

#Apply to convert the vintage period


findconversion = function(s1,s2){
    lag_s1 <- lag(s1, n=1)
    model <- lm(s1 ~ lag_s1)
    # Check coefficient of lagged variable
    resid <- residuals(model)
    # Perform the Augmented Dickey-Fuller (ADF) test on residuals
    adf_test <- ur.df(resid, type = "drift")
    # Print the test results
    #print(summary(adf_test))
    lag_s2 <- lag(s2, n=1)
    #stargazer(model, type="text")
    model <- lm(s2 ~ lag_s2)
    # Check coefficient of lagged variable
    resid <- residuals(model)
    # Perform the Augmented Dickey-Fuller (ADF) test on residuals
    adf_test <- ur.df(resid, type = "drift")
    # Print the test results
    #print(summary(adf_test))
    #stargazer(model, type="text")
    s1=s1-lag(s1, n=1)
    s2=s2-lag(s2, n=1)
    train = lm(s1 ~ s2)
    #stargazer(train, type = "text")
    coef = train$coefficients
    return(coef)
}

# First is the fixed cap cons, then gross value added
# then compensation and lastly indirect taxes


codelist = c("K.1", "B.1*g", "D.1", "D.2-D.3")

codenames = c("time","fcc", "gdp", "wages", "ind_tax")

ts=data.frame(seq(1980,2020,1))


for(i in 1:length(codelist)){
  ts93 = subset(un, un$SNA93.Item.Code == codelist[i] &
                      un$Series == 100 & un$time <= 2011 & un$time >= 1995)
  ts08 = subset(un, un$SNA93.Item.Code == codelist[i] &
                      un$Series == 1000 & un$time <= 2011 & un$time >= 1995)
  tsold = subset(un, un$SNA93.Item.Code == codelist[i] &
                      un$Series == 100 & un$time < 1995)
  tsnew = subset(un, un$SNA93.Item.Code == codelist[i] &
                      un$Series == 1000 & un$time >= 1995)
  coef = findconversion(ts08$Value, ts93$Value)
  simts08 = tsold$Value * coef[2] + coef[1]
  addts=c(simts08, tsnew$Value)
  ts[,i+1]=addts
  #plot(ts[,1], addts, type = "o")
}

tsb=data.frame(seq(1980,2020,1))


for(i in 1:length(codelist)){
  tsnew = subset(un, un$SNA93.Item.Code == codelist[i] &
                      un$Series == 1000 & un$time >= 1995)
  tsold = subset(un, un$SNA93.Item.Code == codelist[i] &
                      un$Series == 100 & un$time < 1995)
  addts=c(tsold$Value, tsnew$Value)
  tsb[,i+1]=addts
}


names(ts) = codenames
names(tsb) = codenames

ts$mult=mult
tsb$mult=mult
tsb$mult6=mult6

ts$labshare=(ts$wages*ts$mult)/(ts$gdp-ts$ind_tax-ts$fcc)
tsb$labshare=(tsb$wages*tsb$mult)/(tsb$gdp-tsb$ind_tax-tsb$fcc)
tsb$labshare6=(tsb$wages*tsb$mult6)/(tsb$gdp-tsb$ind_tax-tsb$fcc)
#plot(ts$time, ts$labshare)
plot(tsb$time, tsb$labshare, main="Labor Share of Output (LS5)",
                             type="l",
                             xlab="Time",
                             ylab="LS5")
plot(tsb$time, tsb$labshare6, main="Labor Share of Output (LS6)",
                              type="l",
                              xlab="Time",
                              ylab="LS6")

period1=tsb[c(2:11),]
labshmean1=mean(period1$labshare)*100


labsh6mean1=mean(period1$labshare6)*100


period2=tsb[c(32:40),]
labshmean2=mean(period2$labshare)*100

labsh6mean2=mean(period2$labshare6)*100

labsh11=mean(tsb$labshare[c(32)])*100

labsh611=mean(tsb$labshare6[c(32)])*100

#(labshmean2-labshmean1)

#(labsh6mean2-labsh6mean1)

```


## Price Markup

Price markup is computed according to @deloecker2020, using the replication
file related to the paper and extracting the time series of aggregate markup
presented as the main finding of the paper itself.

```{r mk,  message=FALSE}
# try to add the sectoral markups if you find them!!!


mkts=read.csv("markup.csv", sep=",")

mkts = subset(mkts, year %in% c(1980:2016))

mkts80 = subset(mkts,  year %in% c(1981:1990) )

avg80 = mean(mkts80$MARKUP_spec1)
#avg80

mkts10 = subset(mkts,  year %in% c(2011:2016) )

avg10 = mean(mkts10$MARKUP_spec1)
#avg10

plot(mkts$year,mkts$MARKUP_spec1, type="l", col="blue",
      xlab = "Time", ylab = "Markup")
```

# Market Concentration

Here we present a slideshow of graphs of various concentration measures:

\begin{itemize}
  \item CR4
  \item CR20
  \item Herfindal-Hirschman (HH) Index
\end{itemize}

All measures are computed using Compustat firm-level data, so they rely on data
exclusively on US publicly traded firms. Compustat data has already been cleaned in
Stata (the file is saved as "compustatclean.do"). 
Concentration measures are computes firstly at the Naics 4-digit level, then at the
3-digit one. Aggregation by sector is done according to the sectorization provided in @autor2020.
For every SIC code employed by Autor, we tracked down the corresponding Naics code and replicated the
procedure.
In the end, the best results came from CR4 and HHI computed at the Naics 3 level.
Nonetheless, the upward trend is not that clear in all sectors: that could be due to multiple factors:
\begin{itemize}
  \item We deal only with publicly traded firms, thus it is possible that concentration measures
        computed are higher, since the denominator is actually smaller than the real size of them
        corresponding industrial sector if privately owned companies are accounted for.
  \item The second reason is related to the first one. Since in the 1980s the stock market was 
        smaller than today, both in absolute value and as a share of the total economy, it is 
        possible that the concentration measures are biased upward more at the start of the 
        time series than they are at the end.
  \item Specifically when using highly detailed sectorization like naics4, in our sample the population
        of firms in the sector shrinks down to even less than 4 in some cases. It is clear that in those
        cases, the concentration measure is equal to 1, thus biasing our weighted mean upward. Computing
        measures at the naics3 level partially accounts for this bias, but not for all sectors, since
        economic sectors like agriculture are not widely represented in the sample of publicly traded firms.
\end{itemize}

These are just some of the possible reasons why our upward trend seem to be not so strong in 
every sector, and not so strong even at the aggregated level for some of the indices.

```{r mktconc, message=FALSE, warning=FALSE}
compustat <- data.frame(read_dta("leancompustat.dta"))
compustat <- filter(compustat, fyear>=1980 & fyear<=2019)
compustat$sector=as.factor(compustat$sector)
compustat$naics3=as.factor(compustat$naics3)
library(dplyr)

# Use the same aggregation scheme as Autor et al. and trasnfer the mapping into NAICS codes: 
# what we keep out of analysis are naics:
# 11 23 56 61 62 71 72 81 and 99, as well as everything but 211 and 213 in 21 and 
#  everything but 221 in 22 , and same for 517 in 51. they are all non relevant 
# or semipublic sectors, or constructions and accomodation which are particularly 
# full of SMEs not covered by Compustat Data


# Create a new column "Classification" with initial value as "Other"
compustat$AutorClass <- "Other"

# Update the "Classification" column based on the conditions
compustat$AutorClass[compustat$naics2 %in% c(31, 32, 33) & compustat$naics3 != 325] <- "Manufacturing"
compustat$AutorClass[compustat$naics2 %in% c(48, 49) & compustat$naics3 != 486] <- "Transportation"
compustat$AutorClass[compustat$naics2 %in% c(44, 45)] <- "Retail"
compustat$AutorClass[compustat$naics2 %in% c(42)] <- "Wholesale"
compustat$AutorClass[compustat$naics3 %in% c(211, 213, 486)] <- "Petroleum & Gas"
compustat$AutorClass[compustat$naics2 %in% c(52)] <- "Finance & Insurance"
compustat$AutorClass[compustat$naics3 %in% c(221, 517)] <- "TelCo & Utilities"
compustat$AutorClass[compustat$naics3 == 325] <- "Chemicals"
compustat$AutorClass[compustat$naics2 %in% c(54)] <- "Business Services"

compustat$AutorClass <- as.factor(compustat$AutorClass)

# Step 1: Calculate cr4, cr20 and HHI with naics4 sector sales and empl
compustat <- compustat %>%
  group_by(fyear) %>%
  mutate(total_sales = sum(sale))%>%
  mutate(total_emp = sum(emp))%>%
  group_by(fyear, naics4) %>%
  mutate(sector_sales = sum(sale)) %>%
  mutate(sector_emp = sum(emp)) %>%
  mutate(ratiosal = (sale/sector_sales)*100) %>%
  mutate(ratioemp = (emp/sector_emp)*100) %>%
  mutate(sectorweight4 = sector_sales/total_sales)%>%
  mutate(sectorweight4emp = sector_emp / total_emp) %>%
  mutate(rank4 = dense_rank(desc(sale))) %>%
  mutate(rank4emp = dense_rank(desc(emp))) %>%
  mutate(population4 = n()) %>%
  mutate(n4cr4sal = sum(ifelse(rank4<=4, sale, 0))/sector_sales) %>%
  mutate(n4cr20sal = sum(ifelse(rank4<=20, sale, 0))/sector_sales) %>%
  mutate(n4cr4emp = sum(ifelse(rank4emp<=4, emp, 0))/sector_emp) %>%
  mutate(n4cr20emp = sum(ifelse(rank4emp<=20, emp, 0))/sector_emp) %>%
  mutate(n4hhisal = sum(ratiosal^2)) %>%
  mutate(n4hhiemp = sum(ratioemp^2)) %>%
  group_by(fyear, AutorClass) %>%
  mutate(Autorsales = sum(sale)) %>%
  mutate(Autorweightsal = Autorsales/total_sales) %>%
  mutate(Autoremp = sum(emp)) %>%
  mutate(Autorweightemp = Autoremp/total_emp)

# Step 2: Calculate cr4 and cr20 with naics3 sector sales and empl
compustat <- compustat %>%
  group_by(fyear, naics3) %>%
  mutate(sector_sales3 = sum(sale)) %>%
  mutate(sector_emp3 = sum(emp)) %>%
  mutate(ratiosal3 = (sale/sector_sales3)*100) %>%
  mutate(ratioemp3 = (emp/sector_emp3)*100) %>%
  mutate(sectorweight3 = sector_sales3/total_sales)%>%
  mutate(sectorweight3emp = sector_emp3 / total_emp) %>%
  mutate(rank3 = dense_rank(desc(sale))) %>%
  mutate(rank3emp = dense_rank(desc(emp))) %>%
  mutate(population3 = n()) %>%
  mutate(n3cr4sal = sum(ifelse(rank3<=4, sale, 0))/sector_sales3) %>%
  mutate(n3cr20sal = sum(ifelse(rank3<=20, sale, 0))/sector_sales3) %>%
  mutate(n3cr4emp = sum(ifelse(rank3emp<=4, emp, 0))/sector_emp3) %>%
  mutate(n3cr20emp = sum(ifelse(rank3emp<=20, emp, 0))/sector_emp3) %>%
  mutate(n3hhisal = sum(ratiosal3^2)) %>%
  mutate(n3hhiemp = sum(ratioemp3^2))

  
# Aggregate database with cr4 by naics2 with naics4 aggregation and sales
# Normalize the weights within each naics4 and fyear group
n4aggcr <- compustat %>%
  group_by(fyear, naics4) %>%
  summarize(cr4sal = n4cr4sal, cr20sal = n4cr20sal, cr4emp = n4cr4emp, cr20emp = n4cr20emp,
            naics1 = naics1, naics2 = naics2, sectorweight = sectorweight4, 
            sectorweightemp = sectorweight4emp, sector = sector,
            hhiemp = n4hhiemp, hhisal = n4hhisal, AutorClass = AutorClass, 
            Autorweightsal = Autorweightsal, Autorweightemp = Autorweightemp) %>%
  group_by(fyear, AutorClass) %>%
  mutate(normalweightsal = sectorweight / sum(sectorweight)) %>%
  mutate(normalweightemp = sectorweightemp / sum(sectorweightemp)) %>%
  summarize(cr4sal = weighted.mean(cr4sal, normalweightsal),
            cr20sal = weighted.mean(cr20sal, normalweightsal),
            cr4emp = weighted.mean(cr4emp, normalweightemp),
            cr20emp = weighted.mean(cr20emp, normalweightemp),
            hhisal = weighted.mean(hhisal, normalweightsal),
            hhiemp = weighted.mean(hhiemp, normalweightemp), 
            sectorweightsum = sum(normalweightsal), 
            sectorweightempsum = sum(normalweightemp), 
            Autorweightsal = mean(Autorweightsal), 
            Autorweightemp = mean(Autorweightemp)) %>%
  mutate( cr4salagg = weighted.mean(cr4sal, Autorweightsal), 
          cr4empagg = weighted.mean(cr4emp, Autorweightemp),
          cr20salagg = weighted.mean(cr20sal, Autorweightsal),
          cr20empagg = weighted.mean(cr20emp, Autorweightemp),
          hhisalagg = weighted.mean(hhisal, Autorweightsal),
          hhiempagg = weighted.mean(hhiemp, Autorweightemp))


n3aggcr <- compustat %>%
  group_by(fyear, naics3) %>%
  summarize(cr4sal = n3cr4sal,cr20sal=n3cr20sal, cr4emp=n3cr4emp, cr20emp=n3cr20emp,
            naics1=naics1, naics2 = naics2, sectorweight = sectorweight3, 
            sectorweightemp=sectorweight3emp, sector = sector, 
            hhiemp=n3hhiemp, hhisal=n3hhisal, AutorClass = AutorClass, 
            Autorweightsal = Autorweightsal, Autorweightemp = Autorweightemp) %>%
  group_by(fyear, AutorClass) %>%
  mutate(normalweightsal = sectorweight / sum(sectorweight)) %>%
  mutate(normalweightemp = sectorweightemp / sum(sectorweightemp)) %>%
  summarize(cr4sal = weighted.mean(cr4sal, normalweightsal),
            cr20sal = weighted.mean(cr20sal, normalweightsal),
            cr4emp = weighted.mean(cr4emp, normalweightemp),
            cr20emp = weighted.mean(cr20emp, normalweightemp),
            hhisal = weighted.mean(hhisal, normalweightsal),
            hhiemp = weighted.mean(hhiemp, normalweightemp), 
            sectorweightsum = sum(normalweightsal), 
            sectorweightempsum = sum(normalweightemp), 
            Autorweightsal = mean(Autorweightsal), 
            Autorweightemp = mean(Autorweightemp)) %>%
  mutate( cr4salagg = weighted.mean(cr4sal, Autorweightsal), 
          cr4empagg = weighted.mean(cr4emp, Autorweightemp),
          cr20salagg = weighted.mean(cr20sal, Autorweightsal),
          cr20empagg = weighted.mean(cr20emp, Autorweightemp),
          hhisalagg = weighted.mean(hhisal, Autorweightsal),
          hhiempagg = weighted.mean(hhiemp, Autorweightemp))


attach(compustat)

library(ggplot2)

#Position of the legend
pos = c(0.85, 0.1)

# Plot of CR4 with naics4 aggregation

ggplot(n4aggcr, aes(x = fyear, color = AutorClass)) +
  geom_line(aes(y = cr4sal, linetype = "CR4 Sal")) +
  geom_line(aes(y = cr4emp, linetype = "CR4 Emp")) +
  facet_wrap(~ AutorClass, scales = "free_y") +
  labs( x = "Year", y = "CR4", 
        title = "Time Series of CR4 by Sector (aggregation by Naics 4)", linetype = "") +
  scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
  theme_minimal() +
  guides(color = "none") +
  theme(legend.position = pos, legend.box = "horizontal")

#Plot of CR20 with naics4 aggregation

ggplot(n4aggcr, aes(x = fyear, color = AutorClass)) +
  geom_line(aes(y = cr20sal, linetype = "CR20 Sal")) +
  geom_line(aes(y = cr20emp, linetype = "CR20 Emp")) +
  facet_wrap(~ AutorClass, scales = "free_y") +
  labs( x = "Year", y = "CR20", 
        title = "Time Series of CR20 by Sector (aggregation by Naics 4)", linetype = "") +
  scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
  theme_minimal() +
  guides(color = "none") +
  theme(legend.position = pos, legend.box = "horizontal")


#Plot of HHI with naics4 aggregation

ggplot(n4aggcr, aes(x = fyear, color = AutorClass)) +
  geom_line(aes(y = hhisal, linetype = "HHI Sal")) +
  geom_line(aes(y = hhiemp, linetype = "HHI Emp")) +
  facet_wrap(~ AutorClass, scales = "free_y") +
  labs( x = "Year", y = "HHI", 
        title = "Time Series of HHI by Sector (aggregation by Naics 4)", linetype = "") +
  scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
  theme_minimal() +
  guides(color = "none") +
  theme(legend.position = pos, legend.box = "horizontal")

#Plot of AGGREGATED CR4 with naics4 aggregation

ggplot(n4aggcr, aes(x = fyear, color = AutorClass)) +
  geom_line(aes(y = cr4salagg, linetype = "CR4 Sal")) +
  geom_line(aes(y = cr4empagg, linetype = "CR4 Emp")) +
  labs( x = "Year", y = "CR4", 
        title = "Time Series of Aggregated CR4 (Naics 4 aggregation)", linetype = "") +
  scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
  theme_minimal() +
  guides(color = "none") +
  theme(legend.position = pos, legend.box = "horizontal")

#Plot of AGGREGATED CR20 with naics4 aggregation

ggplot(n4aggcr, aes(x = fyear, color = AutorClass)) +
  geom_line(aes(y = cr20salagg, linetype = "CR20 Sal")) +
  geom_line(aes(y = cr20empagg, linetype = "CR20 Emp")) +
  labs( x = "Year", y = "CR20", 
        title = "Time Series of Aggregated CR20 (Naics 4 aggregation)", linetype = "") +
  scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
  theme_minimal() +
  guides(color = "none") +
  theme(legend.position = pos, legend.box = "horizontal")

#Plot of AGGREGATED HHI with naics4 aggregation

ggplot(n4aggcr, aes(x = fyear, color = AutorClass)) +
  geom_line(aes(y = hhisalagg, linetype = "HHI Sal")) +
  geom_line(aes(y = hhiempagg, linetype = "HHI Emp")) +
  labs( x = "Year", y = "HHI", 
        title = "Time Series of Aggregated HHI (Naics 4 aggregation)", linetype = "") +
  scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
  theme_minimal() +
  guides(color = "none") +
  theme(legend.position = pos, legend.box = "horizontal")


# Plot of CR4 with naics3 aggregation

ggplot(n3aggcr, aes(x = fyear, color = AutorClass)) +
  geom_line(aes(y = cr4sal, linetype = "CR4 Sal")) +
  geom_line(aes(y = cr4emp, linetype = "CR4 Emp")) +
  facet_wrap(~ AutorClass, scales = "free_y") +
  labs( x = "Year", y = "CR4", 
        title = "Time Series of CR4 by Sector (aggregation by Naics 3)", linetype = "") +
  scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
  theme_minimal() +
  guides(color = "none") +
  theme(legend.position = pos, legend.box = "horizontal")

#Plot of CR20 with naics3 aggregation

ggplot(n3aggcr, aes(x = fyear, color = AutorClass)) +
  geom_line(aes(y = cr20sal, linetype = "CR20 Sal")) +
  geom_line(aes(y = cr20emp, linetype = "CR20 Emp")) +
  facet_wrap(~ AutorClass, scales = "free_y") +
  labs( x = "Year", y = "CR20", 
        title = "Time Series of CR20 by Sector (aggregation by Naics 3)", linetype = "") +
  scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
  theme_minimal() +
  guides(color = "none") +
  theme(legend.position = pos, legend.box = "horizontal")

#Plot of HHI with naics3 aggregation

ggplot(n3aggcr, aes(x = fyear, color = AutorClass)) +
  geom_line(aes(y = hhisal, linetype = "HHI Sal")) +
  geom_line(aes(y = hhiemp, linetype = "HHI Emp")) +
  facet_wrap(~ AutorClass, scales = "free_y") +
  labs( x = "Year", y = "HHI", 
        title = "Time Series of HHI by Sector (aggregation by Naics 3)", linetype = "") +
  scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
  theme_minimal() +
  guides(color = "none") +
  theme(legend.position = pos, legend.box = "horizontal")

#Plot of AGGREGATED CR4 with naics3 aggregation

ggplot(n3aggcr, aes(x = fyear, color = AutorClass)) +
  geom_line(aes(y = cr4salagg, linetype = "CR4 Sal")) +
  geom_line(aes(y = cr4empagg, linetype = "CR4 Emp")) +
  labs( x = "Year", y = "CR4", 
        title = "Time Series of Aggregated CR4 (Naics 3 aggregation)", linetype = "") +
  scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
  theme_minimal() +
  guides(color = "none") +
  theme(legend.position = pos, legend.box = "horizontal")

#Plot of AGGREGATED CR20 with naics3 aggregation

ggplot(n3aggcr, aes(x = fyear, color = AutorClass)) +
  geom_line(aes(y = cr20salagg, linetype = "CR20 Sal")) +
  geom_line(aes(y = cr20empagg, linetype = "CR20 Emp")) +
  labs( x = "Year", y = "CR20", 
        title = "Time Series of Aggregated CR20 (Naics 3 aggregation)", linetype = "") +
  scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
  theme_minimal() +
  guides(color = "none") +
  theme(legend.position = pos, legend.box = "horizontal")

#Plot of AGGREGATED HHI with naics3 aggregation

ggplot(n3aggcr, aes(x = fyear, color = AutorClass)) +
  geom_line(aes(y = hhisalagg, linetype = "HHI Sal")) +
  geom_line(aes(y = hhiempagg, linetype = "HHI Emp")) +
  labs( x = "Year", y = "HHI", 
        title = "Time Series of Aggregated HHI (Naics 3 aggregation)", linetype = "") +
  scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
  theme_minimal() +
  guides(color = "none") +
  theme(legend.position = pos, legend.box = "horizontal")


# add concentration at the aggregated level and confront sectoral level
# between markup and concentration

# I selected the time series of aggregated cr4 and HHi with naics3 aggregation, since they're 
# the ones where the increase between the 1980s and the 2010s is more pronounced 

datatoadd <- n3aggcr %>%
  group_by(fyear) %>%
  summarize(cr4salagg = mean(cr4salagg),
            hhisalagg = mean(hhisalagg))

datatoadd80 = subset(datatoadd,  fyear %in% c(1981:1990) )

datatoadd80 <- datatoadd80 %>%
  summarize(cr4salagg = mean(cr4salagg),
            hhisalagg = mean(hhisalagg))

cr4mean80 = datatoadd80$cr4salagg * 100
hhimean80 = datatoadd80$hhisalagg

datatoadd10 = subset(datatoadd,  fyear %in% c(2011:2019) )

datatoadd10 <- datatoadd10 %>%
  summarize(cr4salagg = mean(cr4salagg),
            hhisalagg = mean(hhisalagg))

cr4mean10 = datatoadd10$cr4salagg * 100
hhimean10 = datatoadd10$hhisalagg


```

# Finalised Output

In this table you can find reported all the average values for the two time frames
analyzed (1980s and 2010s) for each variable considered. I reported HHI too for concentration,
but I do not know what is the computed HHI for the model, so, i left it blank.

```{r output}
output <- data.frame(
  Variable = c("GDP growth", "CR4","HHI", "Markup", "Labor share"),
  `E.1980s` = c(mean80s, cr4mean80, hhimean80 ,avg80, labsh6mean1),
  `E.2010s` = c(mean10s, cr4mean10, hhimean10,avg10, labsh6mean2),
  `E.Delta` = c(mean10s-mean80s, cr4mean10-cr4mean80, hhimean10-hhimean80 ,avg10-avg80, labsh6mean2-labsh6mean1),
  `S.1980s` = c(3.97, 7.73, 0, 1.33, 77.96),
  `S.2010s` = c(2.28, 17.27, 0,1.47, 74.14),
  `S.Delta` = c(-1.69, 9.54, 0, 0.15, -3.82)
)

# Approximate values to 2 significant digits
output[, -1] <- lapply(output[, -1], signif, digits = 3)

# Create the table and apply styling
styled_table <- kable(output, format = "latex", caption = "Empirical and Simulated Data", digits=2) %>%
  kable_styling(
    latex_options = c("striped", "hold_position"),
    font_size = 14,
    position = "center"
  )

# Print the table in your R Markdown document
styled_table
```


# References