Skip to content

Commit

Permalink
Adjust age_rescaling script for LADs
Browse files Browse the repository at this point in the history
  • Loading branch information
sgreenbury committed Sep 12, 2023
1 parent 21a9dfc commit 0a9d597
Showing 1 changed file with 88 additions and 43 deletions.
131 changes: 88 additions & 43 deletions scripts/data_prep/age_rescaling/age_rescaling.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# Set-up
# Options and imports
options(error = traceback)
library(tidyverse)
library(parallel)
library(stringr)
library(tidyverse)
library(rgdal)
library(readxl)

# Set seed
set.seed(790)

# Positional args
args <- commandArgs(TRUE)
Expand All @@ -8,38 +17,43 @@ args <- commandArgs(TRUE)
print(args)

# Single run args
lad <- args[1]
date <- as.integer(args[2])

# File paths
folderInOT <- args[3]
spenserInput <- args[4]
folderOut <- args[5]

# Alias
folderIn <- folderInOT

# Age ref
# Mid-points of Age categories in:
# "Age Group Table 6.5a Hourly pay - Gross 2020.xls"
# 16-17b, 18-21, 22-29, 30-39, 40-49, 50-59, 60+
ageRef <- c(16.5, 19.5, 25.5, 34.5, 44.5, 54.5, 63)

# Read data
# -------------------
##  L326-L341:
## - https://github.com/alan-turing-institute/uatk-spc/blob/31dd8b05e2a67fb73447d13581f6107d39a56820/scripts/data_prep/raw_to_prepared_Income.R#L326-L341

# Loop over all counties
i <- countyList[1]
temp <- addToData(i, lookUp, coefFFT, coefFPT, coefMFT, coefMPT)
write.table(temp, paste("output/tus_hse_", i, ".csv", sep = ","))
checkRes <- data.frame(
MSOA11CD = temp$MSOA11CD, sex = temp$sex, age = temp$age, soc2010 = temp$soc2010,
pwkstat = temp$pwkstat, incomeH = temp$incomeH, incomeY = temp$incomeY
# SPENSER outputs are already reshaped to 2020 LAD codes so loop over these files
lookup <- read.csv(paste(folderInOT, "lookUp-GB.csv", sep = ""))
lad_files <- (
lookup %>% filter(Country == "England")
%>% select(LAD20CD)
%>% unique()
%>% mutate(
file_name = str_replace(
LAD20CD, LAD20CD, paste0(folderOut, LAD20CD, ".csv")
)
)
)

for (i in countyList[2:length(countyList)]) {
temp <- read.csv(paste("output/tus_hse_", i, ".csv", sep = ""), sep = " ")
checkRes2 <- data.frame(
MSOA11CD = temp$MSOA11CD, sex = temp$sex, age = temp$age, soc2010 = temp$soc2010,
pwkstat = temp$pwkstat, incomeH = temp$incomeH, incomeY = temp$incomeY
)
checkRes <- rbind(checkRes, checkRes2)
}
# TODO: For testing, just two LADs, comment out when complete
lad_files <- lad_files %>% filter(LAD20CD %in% c("E09000001", "E06000001"))
check_res <- do.call(rbind, lapply(lad_files$file_name, read.csv))

checkRes$pwkstat <- as.numeric(substr(checkRes$pwkstat, 1, 2))
checkResF <- checkRes[checkRes$pwkstat == 1 | checkRes$pwkstat == 2, ]
# Filter for only Full time (1) and part time (2)
check_res_f <- check_res %>% filter(pwkstat == 1 | pwkstat == 2)


##  L583-L726:
Expand All @@ -49,19 +63,13 @@ checkResF <- checkRes[checkRes$pwkstat == 1 | checkRes$pwkstat == 2, ]
####### (4.) AGE RESCALING ########
###################################


# /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\
# /!\ /!\ /!\ The following is for reference only, it requires legacy data. Use content of SAVE_SPC_required_data.zip /!\ /!\ /!\
# /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\

# print("Producing age rescaling coefficients")
print("Skipping age rescaling")
print("Producing age rescaling coefficients")

# !!! ---> Ages above 67 are treated as 67 due to lack of data

# Read raw data from ONS
download.file("https://www.ons.gov.uk/file?uri=%2femploymentandlabourmarket%2fpeopleinwork%2fearningsandworkinghours%2fdatasets%2fagegroupashetable6%2f2020revised/table62020revised.zip",
destfile = paste(folderIn, "incomeDataAge.zip", sep = "")
destfile = paste(folderIn, "incomeDataAge.zip", sep = ""), headers = c("User-Agent" = "My Custom User Agent")
)
unzip(paste(folderIn, "incomeDataAge.zip", sep = ""), exdir = paste(folderIn, "incomeDataAge", sep = ""))

Expand All @@ -75,10 +83,10 @@ ageFPT <- read_excel(paste(folderIn, "incomeDataAge/", "Age Group Table 6.5a H
ageFPT <- ageFPT[c(1:8), c(1, 3:17)]

# Prepare data to read results of the previous modelling
checkResMFT <- checkResF[checkResF$sex == 1 & checkResF$pwkstat == 1, ]
checkResMPT <- checkResF[checkResF$sex == 1 & checkResF$pwkstat == 2, ]
checkResFFT <- checkResF[checkResF$sex == 0 & checkResF$pwkstat == 1, ]
checkResFPT <- checkResF[checkResF$sex == 0 & checkResF$pwkstat == 2, ]
checkResMFT <- check_res_f[check_res_f$sex == 2 & check_res_f$pwkstat == 1, ]
checkResMPT <- check_res_f[check_res_f$sex == 2 & check_res_f$pwkstat == 2, ]
checkResFFT <- check_res_f[check_res_f$sex == 1 & check_res_f$pwkstat == 1, ]
checkResFPT <- check_res_f[check_res_f$sex == 1 & check_res_f$pwkstat == 2, ]

checkageMFT <- checkResMFT$age
checkageMPT <- checkResMPT$age
Expand All @@ -99,7 +107,7 @@ fitCol <- function(col, row, M, ord = 4) {
outputRefAge <- function(ageData) {
ageData <- as.matrix(ageData[2:8, c(7:11, 3, 12:16)])
ageData <- matrix(as.numeric(as.matrix(ageData)), ncol = ncol(ageData))
coefAgeData <- sapply(1:ncol(ageData), function(x) {
coefAgeData <- sapply(seq_len(ncol(ageData)), function(x) {
fitCol(x, ageRef, ageData, 4)
})
return(coefAgeData)
Expand All @@ -112,17 +120,49 @@ coefAgeFPT <- outputRefAge(ageFPT)

readCoefAgeData <- function(age, coefAgeData) {
refVal <- rep(NA, ncol(coefAgeData))
for (i in 1:ncol(coefAgeData)) {
for (i in seq_len(ncol(coefAgeData))) {
refVal[i] <- coefAgeData[1, i] + coefAgeData[2, i] * age + coefAgeData[3, i] * age^2 + coefAgeData[4, i] * age^3 + coefAgeData[5, i] * age^4
}
return(refVal)
}

# ---------------------
# Plot: testing
library(ggplot2)
ages <- seq(10, 90)
df <- (
data.frame(
do.call(
rbind,
lapply(ages, function(x) readCoefAgeData(x, coefAgeMFT))
)
) %>% mutate(age = ages)
%>% pivot_longer(
cols = !age, names_to = "quantile", values_to = "income"
)
)
ggplot(df, aes(age, income)) +
geom_line(aes(colour = quantile))
# ---------------------

# Returns fitted values from cubic lm fit for vector of values
fitted2 <- function(fit, val) {
i <- val[1]
ret <- fit$coefficients[1] + fit$coefficients[2] * i + fit$coefficients[3] * i^2 + fit$coefficients[4] * i^3
if (length(val) > 1) {
for (i in val[2:length(val)]) {
ret <- c(ret, fit$coefficients[1] + fit$coefficients[2] * i + fit$coefficients[3] * i^2 + fit$coefficients[4] * i^3)
}
}
return(ret)
}


# Build percentile shrinking / expansion reference table depending on age
makeAgeRow <- function(age, sex, fullTime) {
xAx <- c(10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90)
# fetch correct global distribution, distribution for specific age and previoulsy modelled distribution
if (sex == 1 & fullTime == T) {
if (sex == 1 & fullTime == TRUE) {
trueGlob <- as.numeric(ageMFT[1, c(7:11, 3, 12:16)])
if (age > 66) {
temp <- checkincomeHMFT[checkageMFT > 66]
Expand All @@ -132,7 +172,7 @@ makeAgeRow <- function(age, sex, fullTime) {
true <- readCoefAgeData(age, coefAgeMFT)
}
mod <- quantile(temp, c(.10, .20, .25, .30, .40, .50, .60, .70, .75, .80, .90), na.rm = T)
} else if (sex == 1 & fullTime == F) {
} else if (sex == 1 & fullTime == FALSE) {
trueGlob <- as.numeric(ageMPT[1, c(7:11, 3, 12:16)])
if (age > 66) {
temp <- checkincomeHMPT[checkageMPT > 66]
Expand All @@ -142,7 +182,7 @@ makeAgeRow <- function(age, sex, fullTime) {
true <- readCoefAgeData(age, coefAgeMPT)
}
mod <- quantile(temp, c(.10, .20, .25, .30, .40, .50, .60, .70, .75, .80, .90), na.rm = T)
} else if (sex == 0 & fullTime == T) {
} else if (sex == 2 & fullTime == T) {
trueGlob <- as.numeric(ageFFT[1, c(7:11, 3, 12:16)])
if (age > 66) {
temp <- checkincomeHFFT[checkageFFT > 66]
Expand Down Expand Up @@ -184,16 +224,21 @@ makeAgeRow <- function(age, sex, fullTime) {

ageRescaleMFT <- mcmapply(function(x) {
makeAgeRow(x, 1, T)
}, 16:86, mc.cores = detectCores())
}, 16:86, mc.cores = detectCores(), mc.set.seed = FALSE)
ageRescaleMPT <- mcmapply(function(x) {
makeAgeRow(x, 1, F)
}, 16:86, mc.cores = detectCores())
}, 16:86, mc.cores = detectCores(), mc.set.seed = FALSE)
ageRescaleFFT <- mcmapply(function(x) {
makeAgeRow(x, 0, T)
}, 16:86, mc.cores = detectCores())
}, 16:86, mc.cores = detectCores(), mc.set.seed = FALSE)
ageRescaleFPT <- mcmapply(function(x) {
makeAgeRow(x, 0, F)
}, 16:86, mc.cores = detectCores())
}, 16:86, mc.cores = detectCores(), mc.set.seed = FALSE)


## Note: mean of the Income distributions for each feature (groupby) should be the same after rescaling
## "~Exactly" same: pwkstat, sex
## - similar: SOC, Region

print("Writing modelled coefficients")
write.table(ageRescaleMFT, paste(folderOut, "ageRescaleMFT.csv", sep = ""), row.names = F, sep = ",")
Expand Down

0 comments on commit 0a9d597

Please sign in to comment.