-
Notifications
You must be signed in to change notification settings - Fork 0
/
datacollection.rmd
940 lines (731 loc) · 33.2 KB
/
datacollection.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
---
title: "Research Assistantship - Data Collection"
author: "Federico Vicentini"
date: "01/05/2023"
output: pdf_document
bibliography: references.bib
header-includes:
- \usepackage{longtable}
---
```{r setup, include=FALSE}
library(knitr)
knitr::opts_chunk$set(echo = FALSE)
options(warn = -1)
```
```{r p0, message=FALSE, echo=FALSE}
# Clear the variables
rm(list = ls())
# Install packages
packages <- c("eurostat", "stargazer", "fredr", "kableExtra",
"here", "knitr", "tinytex", "rmarkdown", "xfun", "pandoc",
"wbstats", "Rilostat", "sandwich", "dplyr", "urca", "lmtest",
"haven")
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if (length(new_packages)) install.packages(new_packages, repos="https://cran.mirror.garr.it/CRAN/")
invisible(lapply(packages, library, character.only = TRUE))
# Set the working directory to source file location with
# setwd(here::here())
```
# DATA COLLECTION
## Real GDP Data
\par In this first part, we download nominal GDP data and GDP deflator data from 1980
to 2020 and then we divide it into the 2 section (1980s and 2010s) in order to calculate
means and thus compare it to the data we got in the TTD Presentation.
```{r p1, message=FALSE, echo=FALSE}
library(fredr)
fredr_set_key("5946a6a1c79f3fe49bea4be0ef8e82e8")
gdp = fredr(
series_id = "GDP",
observation_start = as.Date("1981-01-01"),
observation_end = as.Date("2020-12-31"),
frequency = "a",
units = "pc1",
aggregation_method = "eop"
)
gdp1 = fredr(
series_id = "GDP",
observation_start = as.Date("1981-01-01"),
observation_end = as.Date("1990-12-31"),
frequency = "a",
units = "pc1",
aggregation_method = "eop"
)
gdp2 = fredr(
series_id = "GDP",
observation_start = as.Date("2011-01-01"),
observation_end = as.Date("2019-12-31"),
frequency = "a",
units = "pc1",
aggregation_method = "eop"
)
gdp = data.frame(gdp)
gdp1 = data.frame(gdp1)
gdp2 = data.frame(gdp2)
deflator = fredr(
series_id = "A191RI1Q225SBEA",
observation_start = as.Date("1981-01-01"),
observation_end = as.Date("2020-12-31"),
frequency = "a",
units="lin",
aggregation_method = "eop"
)
deflator1 = fredr(
series_id = "A191RI1Q225SBEA",
observation_start = as.Date("1981-01-01"),
observation_end = as.Date("1990-12-31"),
frequency = "a",
units="lin",
aggregation_method = "eop"
)
deflator2 = fredr(
series_id = "A191RI1Q225SBEA",
observation_start = as.Date("2011-01-01"),
observation_end = as.Date("2019-12-31"),
frequency = "a",
units="lin",
aggregation_method = "eop"
)
deflator = data.frame(deflator)
deflator1 = data.frame(deflator1)
deflator2 = data.frame(deflator2)
realgdp = gdp
realgdp$value = realgdp$value - deflator$value
#mean(realgdp$value)
realgdp1 = gdp1
realgdp1$value = realgdp1$value - deflator1$value
mean80s = mean(realgdp1$value)
realgdp2 = gdp2
realgdp2$value = realgdp2$value - deflator2$value
mean10s=mean(realgdp2$value)
```
```{r p1.1, message=FALSE}
plot(realgdp$date, realgdp$value, type="l",
xlab="Time", ylab= "Real GDP growth")
plot(realgdp1$date, realgdp1$value, type="l",
xlab="Time", ylab= "Real GDP growth")
plot(realgdp2$date, realgdp2$value, type="l",
xlab="Time", ylab= "Real GDP growth")
```
## Labor Share of Output
### "Naive" Labor Share (LS1)
First attempt here is to download the laborshare timeseries from fred.
You can see from the values of the plots that the level of the labor share is
underestimated if we use LS1. In fact, this approach neglects some important
aspects, such as the role of self-employed workers and correction to value added
in the forms of indirect taxes and consumption of fixed capital.
```{r p2.0}
labshare = fredr(
series_id = "LABSHPUSA156NRUG",
observation_start = as.Date("1981-01-01"),
observation_end = as.Date("2020-12-31"),
frequency = "a"
)
labshare1 = fredr(
series_id = "LABSHPUSA156NRUG",
observation_start = as.Date("1981-01-01"),
observation_end = as.Date("1990-12-31"),
frequency = "a"
)
labshare2 = fredr(
series_id = "LABSHPUSA156NRUG",
observation_start = as.Date("2011-01-01"),
observation_end = as.Date("2019-12-31"),
frequency = "a"
)
labshare = data.frame(labshare)
labshare1 = data.frame(labshare1)
labshare2 = data.frame(labshare2)
plot(labshare$date, labshare$value, type="l",
xlab="Time", ylab= "LS1")
plot(labshare1$date,labshare1$value, type="l",
xlab="Time", ylab= "LS1")
plot(labshare2$date,labshare2$value, type= "l",
xlab="Time", ylab= "LS1")
```
### Guerriero Index
This is precisely why we tried to replicate the laborshare measure provided by
{@guerriero2019}, defined as:
$$LS6 = \frac{compensation\;of\;employees *
\left(\frac{workforce-employers}{employees}\right)}
{value\;added-ind.\;taxes-fixed\;cap.\;cons.}$$
Data is retrieved from Ilostat R package API and (@dataunorg)
```{r p2.1, message=FALSE, echo=FALSE}
#API key for bls is: 7b39505b098044f886d056883d2bd925
#library(blsAPI)
#s=wb_search("compensation of employees")
comp=wb_data(country="US","GC.XPN.COMP.CN")
#s=fredr_series_search_text("compensation")
#View(s)
labforce1=read.csv("civilianlaborforce.csv", sep=",")
labforce1=subset(labforce1, labforce1$Period=="M12")
nomgdp = fredr(
series_id = "GDP",
observation_start = as.Date("1981-01-01"),
observation_end = as.Date("2020-12-31"),
frequency = "a",
units = "lin",
aggregation_method = "eop"
)
#This should be the correct indicator to use for the labor force level
labforce=get_ilostat("EAP_TEAP_SEX_AGE_NB_A")
labforce=subset(labforce, labforce$indicator=="EAP_TEAP_SEX_AGE_NB"&
labforce$ref_area=="USA"&
labforce$classif1=="AGE_AGGREGATE_TOTAL"&
labforce$sex=="SEX_T"&
labforce$time>=1980)
toc=get_ilostat_toc()
# Assume your dataframe is called "my_df" and your time column is called "time"
# Sort the dataframe in ascending order of time
labforce <- labforce %>% arrange(labforce$time)
#plot(labforce1$Value-labforce$obs_value)
#Download and filter the dataset with all employed by status
database=get_ilostat("EMP_TEMP_SEX_AGE_STE_NB_A")
empl=subset(database, database$indicator=="EMP_TEMP_SEX_AGE_STE_NB"&
database$ref_area=="USA"&
database$classif1=="AGE_AGGREGATE_TOTAL"&
database$classif2 == "STE_AGGREGATE_TOTAL"&
database$sex=="SEX_T"&
database$time>=1980)
#To get only databaseoyees, filter for STE_ICSE93_1
emplee=subset(database, database$indicator=="EMP_TEMP_SEX_AGE_STE_NB"&
database$ref_area=="USA"&
database$classif1=="AGE_AGGREGATE_TOTAL"&
database$classif2 == "STE_ICSE93_1"&
database$sex=="SEX_T"&
database$time>=1980)
#Filter for STE_ICSE93_3
icse93_3=subset(database, database$indicator=="EMP_TEMP_SEX_AGE_STE_NB"&
database$ref_area=="USA"&
database$classif1=="AGE_AGGREGATE_TOTAL"&
database$classif2 == "STE_ICSE93_3"&
database$sex=="SEX_T"&
database$time>=1980)
#Filter for STE_ICSE93_5
icse93_5=subset(database, database$indicator=="EMP_TEMP_SEX_AGE_STE_NB"&
database$ref_area=="USA"&
database$classif1=="AGE_AGGREGATE_TOTAL"&
database$classif2 == "STE_ICSE93_5"&
database$sex=="SEX_T"&
database$time>=1980)
#Filter for STE_AGGREGATE_SLF
aggslf=subset(database, database$indicator=="EMP_TEMP_SEX_AGE_STE_NB"&
database$ref_area=="USA"&
database$classif1=="AGE_AGGREGATE_TOTAL"&
database$classif2 == "STE_AGGREGATE_SLF"&
database$sex=="SEX_T"&
database$time>=1980)
#empl = select(empl, time, obs_value)
#names(empl)[2]="empl"
emplee = select(emplee, time, obs_value)
names(emplee)[2]="emplee"
icse93_3 = select(icse93_3, time, obs_value)
names(icse93_3)[2]="icse93_3"
icse93_5 = select(icse93_5, time, obs_value)
names(icse93_5)[2]="icse93_5"
aggslf = select(aggslf, time, obs_value)
names(aggslf)[2]="aggslf"
labforce = select(labforce, time, obs_value)
names(labforce)[2]="labforce"
mergemp <- empl %>%
merge(labforce, by = "time") %>%
merge(emplee, by = "time") %>%
merge(icse93_3, by = "time") %>%
merge(icse93_5, by = "time") %>%
merge(aggslf, by = "time")
toc=get_ilostat_toc()
# Calculate the number of employers as aggslf - icse93_3 - icse93_5
# this is under the assumption that icse93_4 is negligible
# in fact is around 8k workers for the US economy as a whole
#mergemp$emplrs = mergemp$aggslf - mergemp$icse93_3 - mergemp$icse93_5
# This is not a feasible way to do it, let's stick to LS5 indicator
# instead of LS6
```
For lack of available data on the structure of the labor force in the US
prior to the introduction of the ICSE93 standard, we will use LS5 instead of
the LS6 index, defined as:
$$LS6 = \frac{compensation\;of\;employees *
\left(\frac{workforce}{employees}\right)}
{value\;added-ind.\;taxes-fixed\;cap.\;cons.}$$
Below you can find also an example of the result we would get by trying to
compute LS6 too.
Note: since (@guerriero2019) says that "Data on the composition of the workforce is not
always available for every year. When absent, it is assumed to be the same as in the
previous year (Gollin 2002). This is a realistic assumption (Askenazy 2003), given
that the composition of the workforce is relatively constant over time.", and since we
have data on the composition of the workforce from 1994 onward, we will assume that
in years preceding 1994, workforce composition is the same. Below, you can find a plot
of workforce/employees for 1994 onward, where you can see that the figure is pretty stable.
However, instead of taking 1994 value and projecting backward, I found it better to
make an average for the period 1994-2000 and then project that figure backward.
Note2: since there are 2 different time series for SNA data, coming from two different
accounting standards (SNA93 and SNA08), we tried 2 approaches. First, we tried to find
a conversion factor between those 2 standards (using first lags in order not to have
spurious regressions), but this approach leads to a big jump in the data around the
cutoff year. Thus, the second method was just to keep the newer standard for years after
1995, and the old one for years prior. This is probably what @guerriero2019 did in the paper,
since the results look similar (albeit different since we use LS5 and not LS6). This
is the method I selected in the end.
```{r p2.12, message=FALSE}
# For lack of available data, we will use LS5 and not LS6
#Now, calculate the multiplier of the compensation for every year
mergemp=mergemp[-c(28:29),]
mergemp$mult = (mergemp$labforce)/mergemp$emplee
mergemp$mult6 = (mergemp$labforce-mergemp$icse93_3)/mergemp$emplee
plot(mergemp$time,mergemp$mult,
main = "Total Workforce / Number of Employees",
typ="l",
xlab="Time",
ylab="Multiplier")
# Now, add the other years by:
multtrain=mergemp$mult[c(1:7)]
simmult=mean(multtrain)
simmult=rep(simmult, 14)
mult=append(simmult, mergemp$mult)
mult6train=mergemp$mult6[c(1:7)]
simmult6=mean(mult6train)
simmult6=rep(simmult6, 14)
mult6=append(simmult6, mergemp$mult6)
#plot(mult)
#Now retrieve data from the un on aggregates
un = read.csv("un-sna-aggregates.txt", sep=";")
names(un)[6]="time"
un = un[order(un$time), ]
#Use the time period of the interregnum (1995-2011), and maybe cut in two (1995-2008)
#Regression with sna2008 as dependent var and sna1993 as regressor
#Coefficient will be our conversion factor.
#Evaluate the model, then check if it can replicate the other series
#and check for violations of assumptions
#Apply to convert the vintage period
findconversion = function(s1,s2){
lag_s1 <- lag(s1, n=1)
model <- lm(s1 ~ lag_s1)
# Check coefficient of lagged variable
resid <- residuals(model)
# Perform the Augmented Dickey-Fuller (ADF) test on residuals
adf_test <- ur.df(resid, type = "drift")
# Print the test results
#print(summary(adf_test))
lag_s2 <- lag(s2, n=1)
#stargazer(model, type="text")
model <- lm(s2 ~ lag_s2)
# Check coefficient of lagged variable
resid <- residuals(model)
# Perform the Augmented Dickey-Fuller (ADF) test on residuals
adf_test <- ur.df(resid, type = "drift")
# Print the test results
#print(summary(adf_test))
#stargazer(model, type="text")
s1=s1-lag(s1, n=1)
s2=s2-lag(s2, n=1)
train = lm(s1 ~ s2)
#stargazer(train, type = "text")
coef = train$coefficients
return(coef)
}
# First is the fixed cap cons, then gross value added
# then compensation and lastly indirect taxes
codelist = c("K.1", "B.1*g", "D.1", "D.2-D.3")
codenames = c("time","fcc", "gdp", "wages", "ind_tax")
ts=data.frame(seq(1980,2020,1))
for(i in 1:length(codelist)){
ts93 = subset(un, un$SNA93.Item.Code == codelist[i] &
un$Series == 100 & un$time <= 2011 & un$time >= 1995)
ts08 = subset(un, un$SNA93.Item.Code == codelist[i] &
un$Series == 1000 & un$time <= 2011 & un$time >= 1995)
tsold = subset(un, un$SNA93.Item.Code == codelist[i] &
un$Series == 100 & un$time < 1995)
tsnew = subset(un, un$SNA93.Item.Code == codelist[i] &
un$Series == 1000 & un$time >= 1995)
coef = findconversion(ts08$Value, ts93$Value)
simts08 = tsold$Value * coef[2] + coef[1]
addts=c(simts08, tsnew$Value)
ts[,i+1]=addts
#plot(ts[,1], addts, type = "o")
}
tsb=data.frame(seq(1980,2020,1))
for(i in 1:length(codelist)){
tsnew = subset(un, un$SNA93.Item.Code == codelist[i] &
un$Series == 1000 & un$time >= 1995)
tsold = subset(un, un$SNA93.Item.Code == codelist[i] &
un$Series == 100 & un$time < 1995)
addts=c(tsold$Value, tsnew$Value)
tsb[,i+1]=addts
}
names(ts) = codenames
names(tsb) = codenames
ts$mult=mult
tsb$mult=mult
tsb$mult6=mult6
ts$labshare=(ts$wages*ts$mult)/(ts$gdp-ts$ind_tax-ts$fcc)
tsb$labshare=(tsb$wages*tsb$mult)/(tsb$gdp-tsb$ind_tax-tsb$fcc)
tsb$labshare6=(tsb$wages*tsb$mult6)/(tsb$gdp-tsb$ind_tax-tsb$fcc)
#plot(ts$time, ts$labshare)
plot(tsb$time, tsb$labshare, main="Labor Share of Output (LS5)",
type="l",
xlab="Time",
ylab="LS5")
plot(tsb$time, tsb$labshare6, main="Labor Share of Output (LS6)",
type="l",
xlab="Time",
ylab="LS6")
period1=tsb[c(2:11),]
labshmean1=mean(period1$labshare)*100
labsh6mean1=mean(period1$labshare6)*100
period2=tsb[c(32:40),]
labshmean2=mean(period2$labshare)*100
labsh6mean2=mean(period2$labshare6)*100
labsh11=mean(tsb$labshare[c(32)])*100
labsh611=mean(tsb$labshare6[c(32)])*100
#(labshmean2-labshmean1)
#(labsh6mean2-labsh6mean1)
```
## Price Markup
Price markup is computed according to @deloecker2020, using the replication
file related to the paper and extracting the time series of aggregate markup
presented as the main finding of the paper itself.
```{r mk, message=FALSE}
# try to add the sectoral markups if you find them!!!
mkts=read.csv("markup.csv", sep=",")
mkts = subset(mkts, year %in% c(1980:2016))
mkts80 = subset(mkts, year %in% c(1981:1990) )
avg80 = mean(mkts80$MARKUP_spec1)
#avg80
mkts10 = subset(mkts, year %in% c(2011:2016) )
avg10 = mean(mkts10$MARKUP_spec1)
#avg10
plot(mkts$year,mkts$MARKUP_spec1, type="l", col="blue",
xlab = "Time", ylab = "Markup")
```
# Market Concentration
Here we present a slideshow of graphs of various concentration measures:
\begin{itemize}
\item CR4
\item CR20
\item Herfindal-Hirschman (HH) Index
\end{itemize}
All measures are computed using Compustat firm-level data, so they rely on data
exclusively on US publicly traded firms. Compustat data has already been cleaned in
Stata (the file is saved as "compustatclean.do").
Concentration measures are computes firstly at the Naics 4-digit level, then at the
3-digit one. Aggregation by sector is done according to the sectorization provided in @autor2020.
For every SIC code employed by Autor, we tracked down the corresponding Naics code and replicated the
procedure.
In the end, the best results came from CR4 and HHI computed at the Naics 3 level.
Nonetheless, the upward trend is not that clear in all sectors: that could be due to multiple factors:
\begin{itemize}
\item We deal only with publicly traded firms, thus it is possible that concentration measures
computed are higher, since the denominator is actually smaller than the real size of them
corresponding industrial sector if privately owned companies are accounted for.
\item The second reason is related to the first one. Since in the 1980s the stock market was
smaller than today, both in absolute value and as a share of the total economy, it is
possible that the concentration measures are biased upward more at the start of the
time series than they are at the end.
\item Specifically when using highly detailed sectorization like naics4, in our sample the population
of firms in the sector shrinks down to even less than 4 in some cases. It is clear that in those
cases, the concentration measure is equal to 1, thus biasing our weighted mean upward. Computing
measures at the naics3 level partially accounts for this bias, but not for all sectors, since
economic sectors like agriculture are not widely represented in the sample of publicly traded firms.
\end{itemize}
These are just some of the possible reasons why our upward trend seem to be not so strong in
every sector, and not so strong even at the aggregated level for some of the indices.
```{r mktconc, message=FALSE, warning=FALSE}
compustat <- data.frame(read_dta("leancompustat.dta"))
compustat <- filter(compustat, fyear>=1980 & fyear<=2019)
compustat$sector=as.factor(compustat$sector)
compustat$naics3=as.factor(compustat$naics3)
library(dplyr)
# Use the same aggregation scheme as Autor et al. and trasnfer the mapping into NAICS codes:
# what we keep out of analysis are naics:
# 11 23 56 61 62 71 72 81 and 99, as well as everything but 211 and 213 in 21 and
# everything but 221 in 22 , and same for 517 in 51. they are all non relevant
# or semipublic sectors, or constructions and accomodation which are particularly
# full of SMEs not covered by Compustat Data
# Create a new column "Classification" with initial value as "Other"
compustat$AutorClass <- "Other"
# Update the "Classification" column based on the conditions
compustat$AutorClass[compustat$naics2 %in% c(31, 32, 33) & compustat$naics3 != 325] <- "Manufacturing"
compustat$AutorClass[compustat$naics2 %in% c(48, 49) & compustat$naics3 != 486] <- "Transportation"
compustat$AutorClass[compustat$naics2 %in% c(44, 45)] <- "Retail"
compustat$AutorClass[compustat$naics2 %in% c(42)] <- "Wholesale"
compustat$AutorClass[compustat$naics3 %in% c(211, 213, 486)] <- "Petroleum & Gas"
compustat$AutorClass[compustat$naics2 %in% c(52)] <- "Finance & Insurance"
compustat$AutorClass[compustat$naics3 %in% c(221, 517)] <- "TelCo & Utilities"
compustat$AutorClass[compustat$naics3 == 325] <- "Chemicals"
compustat$AutorClass[compustat$naics2 %in% c(54)] <- "Business Services"
compustat$AutorClass <- as.factor(compustat$AutorClass)
# Step 1: Calculate cr4, cr20 and HHI with naics4 sector sales and empl
compustat <- compustat %>%
group_by(fyear) %>%
mutate(total_sales = sum(sale))%>%
mutate(total_emp = sum(emp))%>%
group_by(fyear, naics4) %>%
mutate(sector_sales = sum(sale)) %>%
mutate(sector_emp = sum(emp)) %>%
mutate(ratiosal = (sale/sector_sales)*100) %>%
mutate(ratioemp = (emp/sector_emp)*100) %>%
mutate(sectorweight4 = sector_sales/total_sales)%>%
mutate(sectorweight4emp = sector_emp / total_emp) %>%
mutate(rank4 = dense_rank(desc(sale))) %>%
mutate(rank4emp = dense_rank(desc(emp))) %>%
mutate(population4 = n()) %>%
mutate(n4cr4sal = sum(ifelse(rank4<=4, sale, 0))/sector_sales) %>%
mutate(n4cr20sal = sum(ifelse(rank4<=20, sale, 0))/sector_sales) %>%
mutate(n4cr4emp = sum(ifelse(rank4emp<=4, emp, 0))/sector_emp) %>%
mutate(n4cr20emp = sum(ifelse(rank4emp<=20, emp, 0))/sector_emp) %>%
mutate(n4hhisal = sum(ratiosal^2)) %>%
mutate(n4hhiemp = sum(ratioemp^2)) %>%
group_by(fyear, AutorClass) %>%
mutate(Autorsales = sum(sale)) %>%
mutate(Autorweightsal = Autorsales/total_sales) %>%
mutate(Autoremp = sum(emp)) %>%
mutate(Autorweightemp = Autoremp/total_emp)
# Step 2: Calculate cr4 and cr20 with naics3 sector sales and empl
compustat <- compustat %>%
group_by(fyear, naics3) %>%
mutate(sector_sales3 = sum(sale)) %>%
mutate(sector_emp3 = sum(emp)) %>%
mutate(ratiosal3 = (sale/sector_sales3)*100) %>%
mutate(ratioemp3 = (emp/sector_emp3)*100) %>%
mutate(sectorweight3 = sector_sales3/total_sales)%>%
mutate(sectorweight3emp = sector_emp3 / total_emp) %>%
mutate(rank3 = dense_rank(desc(sale))) %>%
mutate(rank3emp = dense_rank(desc(emp))) %>%
mutate(population3 = n()) %>%
mutate(n3cr4sal = sum(ifelse(rank3<=4, sale, 0))/sector_sales3) %>%
mutate(n3cr20sal = sum(ifelse(rank3<=20, sale, 0))/sector_sales3) %>%
mutate(n3cr4emp = sum(ifelse(rank3emp<=4, emp, 0))/sector_emp3) %>%
mutate(n3cr20emp = sum(ifelse(rank3emp<=20, emp, 0))/sector_emp3) %>%
mutate(n3hhisal = sum(ratiosal3^2)) %>%
mutate(n3hhiemp = sum(ratioemp3^2))
# Aggregate database with cr4 by naics2 with naics4 aggregation and sales
# Normalize the weights within each naics4 and fyear group
n4aggcr <- compustat %>%
group_by(fyear, naics4) %>%
summarize(cr4sal = n4cr4sal, cr20sal = n4cr20sal, cr4emp = n4cr4emp, cr20emp = n4cr20emp,
naics1 = naics1, naics2 = naics2, sectorweight = sectorweight4,
sectorweightemp = sectorweight4emp, sector = sector,
hhiemp = n4hhiemp, hhisal = n4hhisal, AutorClass = AutorClass,
Autorweightsal = Autorweightsal, Autorweightemp = Autorweightemp) %>%
group_by(fyear, AutorClass) %>%
mutate(normalweightsal = sectorweight / sum(sectorweight)) %>%
mutate(normalweightemp = sectorweightemp / sum(sectorweightemp)) %>%
summarize(cr4sal = weighted.mean(cr4sal, normalweightsal),
cr20sal = weighted.mean(cr20sal, normalweightsal),
cr4emp = weighted.mean(cr4emp, normalweightemp),
cr20emp = weighted.mean(cr20emp, normalweightemp),
hhisal = weighted.mean(hhisal, normalweightsal),
hhiemp = weighted.mean(hhiemp, normalweightemp),
sectorweightsum = sum(normalweightsal),
sectorweightempsum = sum(normalweightemp),
Autorweightsal = mean(Autorweightsal),
Autorweightemp = mean(Autorweightemp)) %>%
mutate( cr4salagg = weighted.mean(cr4sal, Autorweightsal),
cr4empagg = weighted.mean(cr4emp, Autorweightemp),
cr20salagg = weighted.mean(cr20sal, Autorweightsal),
cr20empagg = weighted.mean(cr20emp, Autorweightemp),
hhisalagg = weighted.mean(hhisal, Autorweightsal),
hhiempagg = weighted.mean(hhiemp, Autorweightemp))
n3aggcr <- compustat %>%
group_by(fyear, naics3) %>%
summarize(cr4sal = n3cr4sal,cr20sal=n3cr20sal, cr4emp=n3cr4emp, cr20emp=n3cr20emp,
naics1=naics1, naics2 = naics2, sectorweight = sectorweight3,
sectorweightemp=sectorweight3emp, sector = sector,
hhiemp=n3hhiemp, hhisal=n3hhisal, AutorClass = AutorClass,
Autorweightsal = Autorweightsal, Autorweightemp = Autorweightemp) %>%
group_by(fyear, AutorClass) %>%
mutate(normalweightsal = sectorweight / sum(sectorweight)) %>%
mutate(normalweightemp = sectorweightemp / sum(sectorweightemp)) %>%
summarize(cr4sal = weighted.mean(cr4sal, normalweightsal),
cr20sal = weighted.mean(cr20sal, normalweightsal),
cr4emp = weighted.mean(cr4emp, normalweightemp),
cr20emp = weighted.mean(cr20emp, normalweightemp),
hhisal = weighted.mean(hhisal, normalweightsal),
hhiemp = weighted.mean(hhiemp, normalweightemp),
sectorweightsum = sum(normalweightsal),
sectorweightempsum = sum(normalweightemp),
Autorweightsal = mean(Autorweightsal),
Autorweightemp = mean(Autorweightemp)) %>%
mutate( cr4salagg = weighted.mean(cr4sal, Autorweightsal),
cr4empagg = weighted.mean(cr4emp, Autorweightemp),
cr20salagg = weighted.mean(cr20sal, Autorweightsal),
cr20empagg = weighted.mean(cr20emp, Autorweightemp),
hhisalagg = weighted.mean(hhisal, Autorweightsal),
hhiempagg = weighted.mean(hhiemp, Autorweightemp))
attach(compustat)
library(ggplot2)
#Position of the legend
pos = c(0.85, 0.1)
# Plot of CR4 with naics4 aggregation
ggplot(n4aggcr, aes(x = fyear, color = AutorClass)) +
geom_line(aes(y = cr4sal, linetype = "CR4 Sal")) +
geom_line(aes(y = cr4emp, linetype = "CR4 Emp")) +
facet_wrap(~ AutorClass, scales = "free_y") +
labs( x = "Year", y = "CR4",
title = "Time Series of CR4 by Sector (aggregation by Naics 4)", linetype = "") +
scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
theme_minimal() +
guides(color = "none") +
theme(legend.position = pos, legend.box = "horizontal")
#Plot of CR20 with naics4 aggregation
ggplot(n4aggcr, aes(x = fyear, color = AutorClass)) +
geom_line(aes(y = cr20sal, linetype = "CR20 Sal")) +
geom_line(aes(y = cr20emp, linetype = "CR20 Emp")) +
facet_wrap(~ AutorClass, scales = "free_y") +
labs( x = "Year", y = "CR20",
title = "Time Series of CR20 by Sector (aggregation by Naics 4)", linetype = "") +
scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
theme_minimal() +
guides(color = "none") +
theme(legend.position = pos, legend.box = "horizontal")
#Plot of HHI with naics4 aggregation
ggplot(n4aggcr, aes(x = fyear, color = AutorClass)) +
geom_line(aes(y = hhisal, linetype = "HHI Sal")) +
geom_line(aes(y = hhiemp, linetype = "HHI Emp")) +
facet_wrap(~ AutorClass, scales = "free_y") +
labs( x = "Year", y = "HHI",
title = "Time Series of HHI by Sector (aggregation by Naics 4)", linetype = "") +
scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
theme_minimal() +
guides(color = "none") +
theme(legend.position = pos, legend.box = "horizontal")
#Plot of AGGREGATED CR4 with naics4 aggregation
ggplot(n4aggcr, aes(x = fyear, color = AutorClass)) +
geom_line(aes(y = cr4salagg, linetype = "CR4 Sal")) +
geom_line(aes(y = cr4empagg, linetype = "CR4 Emp")) +
labs( x = "Year", y = "CR4",
title = "Time Series of Aggregated CR4 (Naics 4 aggregation)", linetype = "") +
scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
theme_minimal() +
guides(color = "none") +
theme(legend.position = pos, legend.box = "horizontal")
#Plot of AGGREGATED CR20 with naics4 aggregation
ggplot(n4aggcr, aes(x = fyear, color = AutorClass)) +
geom_line(aes(y = cr20salagg, linetype = "CR20 Sal")) +
geom_line(aes(y = cr20empagg, linetype = "CR20 Emp")) +
labs( x = "Year", y = "CR20",
title = "Time Series of Aggregated CR20 (Naics 4 aggregation)", linetype = "") +
scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
theme_minimal() +
guides(color = "none") +
theme(legend.position = pos, legend.box = "horizontal")
#Plot of AGGREGATED HHI with naics4 aggregation
ggplot(n4aggcr, aes(x = fyear, color = AutorClass)) +
geom_line(aes(y = hhisalagg, linetype = "HHI Sal")) +
geom_line(aes(y = hhiempagg, linetype = "HHI Emp")) +
labs( x = "Year", y = "HHI",
title = "Time Series of Aggregated HHI (Naics 4 aggregation)", linetype = "") +
scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
theme_minimal() +
guides(color = "none") +
theme(legend.position = pos, legend.box = "horizontal")
# Plot of CR4 with naics3 aggregation
ggplot(n3aggcr, aes(x = fyear, color = AutorClass)) +
geom_line(aes(y = cr4sal, linetype = "CR4 Sal")) +
geom_line(aes(y = cr4emp, linetype = "CR4 Emp")) +
facet_wrap(~ AutorClass, scales = "free_y") +
labs( x = "Year", y = "CR4",
title = "Time Series of CR4 by Sector (aggregation by Naics 3)", linetype = "") +
scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
theme_minimal() +
guides(color = "none") +
theme(legend.position = pos, legend.box = "horizontal")
#Plot of CR20 with naics3 aggregation
ggplot(n3aggcr, aes(x = fyear, color = AutorClass)) +
geom_line(aes(y = cr20sal, linetype = "CR20 Sal")) +
geom_line(aes(y = cr20emp, linetype = "CR20 Emp")) +
facet_wrap(~ AutorClass, scales = "free_y") +
labs( x = "Year", y = "CR20",
title = "Time Series of CR20 by Sector (aggregation by Naics 3)", linetype = "") +
scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
theme_minimal() +
guides(color = "none") +
theme(legend.position = pos, legend.box = "horizontal")
#Plot of HHI with naics3 aggregation
ggplot(n3aggcr, aes(x = fyear, color = AutorClass)) +
geom_line(aes(y = hhisal, linetype = "HHI Sal")) +
geom_line(aes(y = hhiemp, linetype = "HHI Emp")) +
facet_wrap(~ AutorClass, scales = "free_y") +
labs( x = "Year", y = "HHI",
title = "Time Series of HHI by Sector (aggregation by Naics 3)", linetype = "") +
scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
theme_minimal() +
guides(color = "none") +
theme(legend.position = pos, legend.box = "horizontal")
#Plot of AGGREGATED CR4 with naics3 aggregation
ggplot(n3aggcr, aes(x = fyear, color = AutorClass)) +
geom_line(aes(y = cr4salagg, linetype = "CR4 Sal")) +
geom_line(aes(y = cr4empagg, linetype = "CR4 Emp")) +
labs( x = "Year", y = "CR4",
title = "Time Series of Aggregated CR4 (Naics 3 aggregation)", linetype = "") +
scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
theme_minimal() +
guides(color = "none") +
theme(legend.position = pos, legend.box = "horizontal")
#Plot of AGGREGATED CR20 with naics3 aggregation
ggplot(n3aggcr, aes(x = fyear, color = AutorClass)) +
geom_line(aes(y = cr20salagg, linetype = "CR20 Sal")) +
geom_line(aes(y = cr20empagg, linetype = "CR20 Emp")) +
labs( x = "Year", y = "CR20",
title = "Time Series of Aggregated CR20 (Naics 3 aggregation)", linetype = "") +
scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
theme_minimal() +
guides(color = "none") +
theme(legend.position = pos, legend.box = "horizontal")
#Plot of AGGREGATED HHI with naics3 aggregation
ggplot(n3aggcr, aes(x = fyear, color = AutorClass)) +
geom_line(aes(y = hhisalagg, linetype = "HHI Sal")) +
geom_line(aes(y = hhiempagg, linetype = "HHI Emp")) +
labs( x = "Year", y = "HHI",
title = "Time Series of Aggregated HHI (Naics 3 aggregation)", linetype = "") +
scale_linetype_manual(values = c("solid", "dotted"), labels = c("Sales", "Employment")) +
theme_minimal() +
guides(color = "none") +
theme(legend.position = pos, legend.box = "horizontal")
# add concentration at the aggregated level and confront sectoral level
# between markup and concentration
# I selected the time series of aggregated cr4 and HHi with naics3 aggregation, since they're
# the ones where the increase between the 1980s and the 2010s is more pronounced
datatoadd <- n3aggcr %>%
group_by(fyear) %>%
summarize(cr4salagg = mean(cr4salagg),
hhisalagg = mean(hhisalagg))
datatoadd80 = subset(datatoadd, fyear %in% c(1981:1990) )
datatoadd80 <- datatoadd80 %>%
summarize(cr4salagg = mean(cr4salagg),
hhisalagg = mean(hhisalagg))
cr4mean80 = datatoadd80$cr4salagg * 100
hhimean80 = datatoadd80$hhisalagg
datatoadd10 = subset(datatoadd, fyear %in% c(2011:2019) )
datatoadd10 <- datatoadd10 %>%
summarize(cr4salagg = mean(cr4salagg),
hhisalagg = mean(hhisalagg))
cr4mean10 = datatoadd10$cr4salagg * 100
hhimean10 = datatoadd10$hhisalagg
```
# Finalised Output
In this table you can find reported all the average values for the two time frames
analyzed (1980s and 2010s) for each variable considered. I reported HHI too for concentration,
but I do not know what is the computed HHI for the model, so, i left it blank.
```{r output}
output <- data.frame(
Variable = c("GDP growth", "CR4","HHI", "Markup", "Labor share"),
`E.1980s` = c(mean80s, cr4mean80, hhimean80 ,avg80, labsh6mean1),
`E.2010s` = c(mean10s, cr4mean10, hhimean10,avg10, labsh6mean2),
`E.Delta` = c(mean10s-mean80s, cr4mean10-cr4mean80, hhimean10-hhimean80 ,avg10-avg80, labsh6mean2-labsh6mean1),
`S.1980s` = c(3.97, 7.73, 0, 1.33, 77.96),
`S.2010s` = c(2.28, 17.27, 0,1.47, 74.14),
`S.Delta` = c(-1.69, 9.54, 0, 0.15, -3.82)
)
# Approximate values to 2 significant digits
output[, -1] <- lapply(output[, -1], signif, digits = 3)
# Create the table and apply styling
styled_table <- kable(output, format = "latex", caption = "Empirical and Simulated Data", digits=2) %>%
kable_styling(
latex_options = c("striped", "hold_position"),
font_size = 14,
position = "center"
)
# Print the table in your R Markdown document
styled_table
```
# References