# File: 05_NonFarm_Business_Income.R
# =================== Non-farm business Income SAS Data - 77th Round of NSSO =================== #
# Author: Sethu C. A.
# License: GNU GPLv3
# This is script 5 of 7
# This work is inspired by Deepak Johnson's work here: https://github.com/deepakjohnson91/NSSO-77-Round-SAS/
# Documentation on and data/readme files available at https://www.mospi.gov.in/unit-level-data-report-nss-77-th-round-schedule-331-january-2019-%E2%80%93-december-2019land-and-livestock
# One level - 13 - deals with the non-farm business income.
# The estimation in the NSSO report for reporting the total income (table 23-A) is done at the level of agricultural households.
# NSSO collects data for the last 30 days.
# Rows with serial no. 99 hold the household total, so we keep only those rows to get one total per household.
# The report gives the NBI as Rs 641 per month for Visit 1 and Rs 638 per month for Visit 2; the combined monthly figure is also Rs 641.
# Start from an empty environment. This matters for the save step at the end of
# the script, which collects objects with ls() and writes every data frame it finds.
rm(list = ls()) # clear the environment
# Load packages
library(readxl) # for reading excel files
library(readr) # for reading fixed width files in a fast and consistent manner compared to the 'foreign' library
library(dplyr) # tidyverse package for data manipulation
library(tidyr) # tidyverse package for data cleaning
library(Hmisc) # for weighted mean (wtd.mean), etc.
library(data.table) # for exporting data in a fast manner (fwrite)
# Set working directory
# As written, "." keeps the current directory; all relative paths below
# ("Output/...", "Raw data/...", "List_Level_Codes.xlsx") are resolved against it.
setwd(".") # change this path to your specific directory before running the script if you downloaded all the code instead of cloning the repo.
# ---- Inputs prepared by earlier scripts ----
# Household-level data frames saved as .Rdata files in the Output folder.
load("Output/All_HH_Basic.Rdata")
load("Output/Common_HH_Basic.Rdata")
load("Output/AH_Common_HH_Basic.Rdata")

# Layout of the Level 13 records: the `Name` and `Length` columns of this sheet
# supply the column names and the fixed-width byte lengths used below.
Level13Codes <- read_excel("List_Level_Codes.xlsx", sheet = "Level13")

# ---- Level 13, Visit 1 ----
# Parse the fixed-width file using the byte lengths from the level codes sheet.
# Every column is read as a number except column 12 (X12), which is kept as
# text -- presumably for a reason explained in an earlier script of this series.
L13_V1 <- read_fwf(
  "Raw data/r77s331v1L13.txt",
  fwf_widths(widths = Level13Codes$Length),
  col_types = cols(.default = col_number(), X12 = col_character())
)

# Replace the default X1, X2, ... names with syntactically valid versions of
# the documented column names.
names(L13_V1) <- make.names(Level13Codes$Name)

# Build the common household ID as per the documentation: FSU serial no.,
# second stage stratum no. and sample household no., joined with a literal "0".
L13_V1 <- mutate(
  L13_V1,
  HH_ID = paste(
    FSU.Serial.No.,
    Second.stage.stratum.no.,
    Sample.hhld..No.,
    sep = "0"
  )
)
# Task: build the household-level Non-Farm Business Income table for Visit 1.
# We need the net receipts from the Level 13 data.
# Only the rows holding the total over all businesses of a household are
# wanted, i.e. one observation per household (Sl. no. 99 in the questionnaire
# block), so we keep only the rows with serial no. 99.
# From those rows we need just the household ID and the net receipts column,
# which we join onto the basic household information prepared earlier.
# Naming logic: Non-farm Business Income from Visit 1.
NBI_V1 <- All_HH_Basic %>%
  left_join(
    L13_V1 %>%
      # Refer to the column by its bare name so the condition is evaluated on
      # the data flowing through the pipe, not on the outer L13_V1 object.
      filter(Serial.no. == 99) %>%
      select(HH_ID, Net.receipts.col.5...col.4.),
    by = "HH_ID"
  )
# Households with no serial no. 99 row get NA from the left join; treat them as
# having zero net receipts.
# NOTE(review): this replaces every NA in the table, not only the net receipts
# column -- confirm that this is intended for the columns of All_HH_Basic.
NBI_V1[is.na(NBI_V1)] <- 0
# ---- Level 13, Visit 2 ----
# Same layout as Visit 1: widths come from the level codes sheet, every column
# is numeric except column 12 (X12), which is read as text.
L13_V2 <- read_fwf(
  "Raw data/r77s331v2L13.txt",
  fwf_widths(widths = Level13Codes$Length),
  col_types = cols(.default = col_number(), X12 = col_character())
)

# Apply the documented column names, sanitized into valid R names.
names(L13_V2) <- make.names(Level13Codes$Name)

# Common household ID: the three identifying fields joined with a literal "0".
L13_V2 <- mutate(
  L13_V2,
  HH_ID = paste(
    FSU.Serial.No.,
    Second.stage.stratum.no.,
    Sample.hhld..No.,
    sep = "0"
  )
)
# Build the Visit 2 table the same way, but start from Common_HH_Basic instead
# of All_HH_Basic because Common_HH_Basic has visit 2 data alone.
NBI_V2 <- Common_HH_Basic %>%
  left_join(
    L13_V2 %>%
      # Bare column name: evaluated on the piped data rather than on the outer
      # L13_V2 object.
      filter(Serial.no. == 99) %>%
      select(HH_ID, Net.receipts.col.5...col.4.),
    by = "HH_ID"
  )
# Households with no serial no. 99 row get NA from the left join; set to zero.
# NOTE(review): this replaces every NA in the table, not only the net receipts
# column -- confirm that this is intended for the columns of Common_HH_Basic.
NBI_V2[is.na(NBI_V2)] <- 0
# ---- Combine both visits ----
# Attach three columns of the Visit 2 table (positions 1, 11 and 12) to the
# Visit 1 table by household ID. The net receipts column exists on both sides,
# so after the join it appears with the suffixes .x (Visit 1) and .y (Visit 2).
# NOTE(review): the selection is positional -- presumably HH_ID, the Visit 2
# weight and the net receipts; verify against the layout of Common_HH_Basic.
NBI <- NBI_V1 %>%
  left_join(select(NBI_V2, c(1, 11, 12)), by = "HH_ID")

# Households without a Visit 2 match get NA from the join; set those to zero.
NBI[is.na(NBI)] <- 0

# Monthly figure: each visit reports the last 30 days; Visit 1 is weighted as
# 8 months and Visit 2 as 4 months of the year.
NBI$MonthlyNBI <- with(
  NBI,
  (8 * Net.receipts.col.5...col.4..x + 4 * Net.receipts.col.5...col.4..y) / 12
)

# Agricultural households only: keep the households that also appear in
# AH_Common_HH_Basic.
AH_NBI <- filter(NBI, HH_ID %in% AH_Common_HH_Basic$HH_ID)
# ---- Checks against the published report ----
# Combined monthly figure, weighted with the Visit 2 weights.
with(AH_NBI, wtd.mean(MonthlyNBI, weights = Weights_V2))
# Expected: 640.5397, which rounds to the reported 641

# Visit 2 net receipts, weighted with the Visit 2 weights.
with(AH_NBI, wtd.mean(Net.receipts.col.5...col.4..y, weights = Weights_V2))
# Expected: 637.76, matching the 638 in the report

# Visit 1 net receipts, weighted with the Visit 1 weights.
with(AH_NBI, wtd.mean(Net.receipts.col.5...col.4..x, weights = Weights_V1))
# Expected: 640.9731, matching the 641 in the report
# End
# Time to save the files
# This block goes through all relevant data frames and saves each one both as
# an RData file and as a csv file.
# First we define an output folder
output_dir <- "Output"
# Next we get the names of all objects in the environment at this moment.
# NOTE: this takes every object currently in the environment, so objects left
# over from other scripts would be picked up as well (the script clears the
# environment at the top for this reason).
obj_list <- ls()
# Decide, for each name in obj_list, whether the object should be saved.
# An object is saved when it is a data frame AND its name neither starts with
# "Level" nor ends with "_list".
# mget() fetches the objects by name directly, so no function argument can
# shadow an object of the same name; vapply() guarantees a logical vector with
# one element per name.
tobesaved <- vapply(mget(obj_list), is.data.frame, logical(1)) &
  !startsWith(obj_list, "Level") &
  !endsWith(obj_list, "_list")
# Now loop over the selected names and write each object to both formats.
for (i in obj_list[tobesaved]) {
  save(list = i, file = file.path(output_dir, paste0(i, ".Rdata")))
  # fwrite is preferred over write.csv2 because it is faster for big files.
  fwrite(get(i), file = file.path(output_dir, paste0(i, ".csv")))
}
# The End