Adjust age_rescaling script for LADs

alan-turing-institute · Sep 12, 2023 · 0a9d597 · 0a9d597
1 parent 21a9dfc
commit 0a9d597
Showing 1 changed file with 88 additions and 43 deletions.
diff --git a/scripts/data_prep/age_rescaling/age_rescaling.R b/scripts/data_prep/age_rescaling/age_rescaling.R
@@ -1,5 +1,14 @@
-# Set-up
+# Options and imports
 options(error = traceback)
+library(tidyverse)
+library(parallel)
+library(stringr)
+library(tidyverse)
+library(rgdal)
+library(readxl)
+
+# Set seed
+set.seed(790)
 
 # Positional args
 args <- commandArgs(TRUE)
@@ -8,38 +17,43 @@ args <- commandArgs(TRUE)
 print(args)
 
 # Single run args
-lad <- args[1]
 date <- as.integer(args[2])
-
-# File paths
 folderInOT <- args[3]
 spenserInput <- args[4]
 folderOut <- args[5]
 
+# Alias
+folderIn <- folderInOT
 
+# Age ref
+# Mid-points of Age categories in:
+#   "Age Group Table 6.5a Hourly pay - Gross 2020.xls"
+# 16-17b, 18-21, 22-29, 30-39, 40-49, 50-59, 60+
+ageRef <- c(16.5, 19.5, 25.5, 34.5, 44.5, 54.5, 63)
+
+# Read data
+# -------------------
 ##  L326-L341:
 ##   - https://github.com/alan-turing-institute/uatk-spc/blob/31dd8b05e2a67fb73447d13581f6107d39a56820/scripts/data_prep/raw_to_prepared_Income.R#L326-L341
-
-# Loop over all counties
-i <- countyList[1]
-temp <- addToData(i, lookUp, coefFFT, coefFPT, coefMFT, coefMPT)
-write.table(temp, paste("output/tus_hse_", i, ".csv", sep = ","))
-checkRes <- data.frame(
-    MSOA11CD = temp$MSOA11CD, sex = temp$sex, age = temp$age, soc2010 = temp$soc2010,
-    pwkstat = temp$pwkstat, incomeH = temp$incomeH, incomeY = temp$incomeY
+# SPENSER outputs are already reshaped to 2020 LAD codes so loop over these files
+lookup <- read.csv(paste(folderInOT, "lookUp-GB.csv", sep = ""))
+lad_files <- (
+    lookup %>% filter(Country == "England")
+        %>% select(LAD20CD)
+        %>% unique()
+        %>% mutate(
+            file_name = str_replace(
+                LAD20CD, LAD20CD, paste0(folderOut, LAD20CD, ".csv")
+            )
+        )
 )
 
-for (i in countyList[2:length(countyList)]) {
-    temp <- read.csv(paste("output/tus_hse_", i, ".csv", sep = ""), sep = " ")
-    checkRes2 <- data.frame(
-        MSOA11CD = temp$MSOA11CD, sex = temp$sex, age = temp$age, soc2010 = temp$soc2010,
-        pwkstat = temp$pwkstat, incomeH = temp$incomeH, incomeY = temp$incomeY
-    )
-    checkRes <- rbind(checkRes, checkRes2)
-}
+# TODO: For testing, just two LADs, comment out when complete
+lad_files <- lad_files %>% filter(LAD20CD %in% c("E09000001", "E06000001"))
+check_res <- do.call(rbind, lapply(lad_files$file_name, read.csv))
 
-checkRes$pwkstat <- as.numeric(substr(checkRes$pwkstat, 1, 2))
-checkResF <- checkRes[checkRes$pwkstat == 1 | checkRes$pwkstat == 2, ]
+# Filter for only Full time (1) and part time (2)
+check_res_f <- check_res %>% filter(pwkstat == 1 | pwkstat == 2)
 
 
 ##  L583-L726:
@@ -49,19 +63,13 @@ checkResF <- checkRes[checkRes$pwkstat == 1 | checkRes$pwkstat == 2, ]
 ####### (4.) AGE RESCALING ########
 ###################################
 
-
-# /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\
-# /!\ /!\ /!\ The following is for reference only, it requires legacy data. Use content of SAVE_SPC_required_data.zip /!\ /!\ /!\
-# /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\
-
-# print("Producing age rescaling coefficients")
-print("Skipping age rescaling")
+print("Producing age rescaling coefficients")
 
 # !!! ---> Ages above 67 are treated as 67 due to lack of data
 
 # Read raw data from ONS
 download.file("https://www.ons.gov.uk/file?uri=%2femploymentandlabourmarket%2fpeopleinwork%2fearningsandworkinghours%2fdatasets%2fagegroupashetable6%2f2020revised/table62020revised.zip",
-    destfile = paste(folderIn, "incomeDataAge.zip", sep = "")
+    destfile = paste(folderIn, "incomeDataAge.zip", sep = ""), headers = c("User-Agent" = "My Custom User Agent")
 )
 unzip(paste(folderIn, "incomeDataAge.zip", sep = ""), exdir = paste(folderIn, "incomeDataAge", sep = ""))
 
@@ -75,10 +83,10 @@ ageFPT <- read_excel(paste(folderIn, "incomeDataAge/", "Age Group Table 6.5a   H
 ageFPT <- ageFPT[c(1:8), c(1, 3:17)]
 
 # Prepare data to read results of the previous modelling
-checkResMFT <- checkResF[checkResF$sex == 1 & checkResF$pwkstat == 1, ]
-checkResMPT <- checkResF[checkResF$sex == 1 & checkResF$pwkstat == 2, ]
-checkResFFT <- checkResF[checkResF$sex == 0 & checkResF$pwkstat == 1, ]
-checkResFPT <- checkResF[checkResF$sex == 0 & checkResF$pwkstat == 2, ]
+checkResMFT <- check_res_f[check_res_f$sex == 2 & check_res_f$pwkstat == 1, ]
+checkResMPT <- check_res_f[check_res_f$sex == 2 & check_res_f$pwkstat == 2, ]
+checkResFFT <- check_res_f[check_res_f$sex == 1 & check_res_f$pwkstat == 1, ]
+checkResFPT <- check_res_f[check_res_f$sex == 1 & check_res_f$pwkstat == 2, ]
 
 checkageMFT <- checkResMFT$age
 checkageMPT <- checkResMPT$age
@@ -99,7 +107,7 @@ fitCol <- function(col, row, M, ord = 4) {
 outputRefAge <- function(ageData) {
     ageData <- as.matrix(ageData[2:8, c(7:11, 3, 12:16)])
     ageData <- matrix(as.numeric(as.matrix(ageData)), ncol = ncol(ageData))
-    coefAgeData <- sapply(1:ncol(ageData), function(x) {
+    coefAgeData <- sapply(seq_len(ncol(ageData)), function(x) {
         fitCol(x, ageRef, ageData, 4)
     })
     return(coefAgeData)
@@ -112,17 +120,49 @@ coefAgeFPT <- outputRefAge(ageFPT)
 
 readCoefAgeData <- function(age, coefAgeData) {
     refVal <- rep(NA, ncol(coefAgeData))
-    for (i in 1:ncol(coefAgeData)) {
+    for (i in seq_len(ncol(coefAgeData))) {
         refVal[i] <- coefAgeData[1, i] + coefAgeData[2, i] * age + coefAgeData[3, i] * age^2 + coefAgeData[4, i] * age^3 + coefAgeData[5, i] * age^4
     }
     return(refVal)
 }
 
+# ---------------------
+# Plot: testing
+library(ggplot2)
+ages <- seq(10, 90)
+df <- (
+    data.frame(
+        do.call(
+            rbind,
+            lapply(ages, function(x) readCoefAgeData(x, coefAgeMFT))
+        )
+    ) %>% mutate(age = ages)
+        %>% pivot_longer(
+            cols = !age, names_to = "quantile", values_to = "income"
+        )
+)
+ggplot(df, aes(age, income)) +
+    geom_line(aes(colour = quantile))
+# ---------------------
+
+# Returns fitted values from cubic lm fit for vector of values
+fitted2 <- function(fit, val) {
+    i <- val[1]
+    ret <- fit$coefficients[1] + fit$coefficients[2] * i + fit$coefficients[3] * i^2 + fit$coefficients[4] * i^3
+    if (length(val) > 1) {
+        for (i in val[2:length(val)]) {
+            ret <- c(ret, fit$coefficients[1] + fit$coefficients[2] * i + fit$coefficients[3] * i^2 + fit$coefficients[4] * i^3)
+        }
+    }
+    return(ret)
+}
+
+
 # Build percentile shrinking / expansion reference table depending on age
 makeAgeRow <- function(age, sex, fullTime) {
     xAx <- c(10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90)
     # fetch correct global distribution, distribution for specific age and previoulsy modelled distribution
-    if (sex == 1 & fullTime == T) {
+    if (sex == 1 & fullTime == TRUE) {
         trueGlob <- as.numeric(ageMFT[1, c(7:11, 3, 12:16)])
         if (age > 66) {
             temp <- checkincomeHMFT[checkageMFT > 66]
@@ -132,7 +172,7 @@ makeAgeRow <- function(age, sex, fullTime) {
             true <- readCoefAgeData(age, coefAgeMFT)
         }
         mod <- quantile(temp, c(.10, .20, .25, .30, .40, .50, .60, .70, .75, .80, .90), na.rm = T)
-    } else if (sex == 1 & fullTime == F) {
+    } else if (sex == 1 & fullTime == FALSE) {
         trueGlob <- as.numeric(ageMPT[1, c(7:11, 3, 12:16)])
         if (age > 66) {
             temp <- checkincomeHMPT[checkageMPT > 66]
@@ -142,7 +182,7 @@ makeAgeRow <- function(age, sex, fullTime) {
             true <- readCoefAgeData(age, coefAgeMPT)
         }
         mod <- quantile(temp, c(.10, .20, .25, .30, .40, .50, .60, .70, .75, .80, .90), na.rm = T)
-    } else if (sex == 0 & fullTime == T) {
+    } else if (sex == 2 & fullTime == T) {
         trueGlob <- as.numeric(ageFFT[1, c(7:11, 3, 12:16)])
         if (age > 66) {
             temp <- checkincomeHFFT[checkageFFT > 66]
@@ -184,16 +224,21 @@ makeAgeRow <- function(age, sex, fullTime) {
 
 ageRescaleMFT <- mcmapply(function(x) {
     makeAgeRow(x, 1, T)
-}, 16:86, mc.cores = detectCores())
+}, 16:86, mc.cores = detectCores(), mc.set.seed = FALSE)
 ageRescaleMPT <- mcmapply(function(x) {
     makeAgeRow(x, 1, F)
-}, 16:86, mc.cores = detectCores())
+}, 16:86, mc.cores = detectCores(), mc.set.seed = FALSE)
 ageRescaleFFT <- mcmapply(function(x) {
     makeAgeRow(x, 0, T)
-}, 16:86, mc.cores = detectCores())
+}, 16:86, mc.cores = detectCores(), mc.set.seed = FALSE)
 ageRescaleFPT <- mcmapply(function(x) {
     makeAgeRow(x, 0, F)
-}, 16:86, mc.cores = detectCores())
+}, 16:86, mc.cores = detectCores(), mc.set.seed = FALSE)
+
+
+## Note: mean of the Income distributions for each feature (groupby) should be the same after rescaling
+## "~Exactly" same: pwkstat, sex
+##  - similar: SOC, Region
 
 print("Writing modelled coefficients")
 write.table(ageRescaleMFT, paste(folderOut, "ageRescaleMFT.csv", sep = ""), row.names = F, sep = ",")