From 8b0edc7e0dd8227f6d5080ffa976a19a16cb5252 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 15 Sep 2023 11:12:43 +0100 Subject: [PATCH] Refactoring and adding comments --- .../data_prep/pre_rescaling/age_rescaling.R | 97 ++++++++----------- 1 file changed, 42 insertions(+), 55 deletions(-) diff --git a/scripts/data_prep/pre_rescaling/age_rescaling.R b/scripts/data_prep/pre_rescaling/age_rescaling.R index dd91f8f..b512d93 100644 --- a/scripts/data_prep/pre_rescaling/age_rescaling.R +++ b/scripts/data_prep/pre_rescaling/age_rescaling.R @@ -32,22 +32,21 @@ print("Read modelled data") # Read data outputted at LAD20CD resolution lookup <- read.csv(paste(folder_step_1, "lookUp-GB.csv", sep = "")) lad_files <- ( - lookup %>% filter(Country == "England") - %>% select(LAD20CD) - %>% unique() - %>% mutate( + lookup |> filter(Country == "England") + |> select(LAD20CD) + |> unique() + |> mutate( file_name = str_replace( LAD20CD, LAD20CD, paste0(folder_pre_rescaling, LAD20CD, ".csv") ) ) ) - # TODO: For testing, just two LADs, comment out when complete -# lad_files <- lad_files %>% filter(LAD20CD %in% c("E09000001", "E06000001")) # nolint +# lad_files <- lad_files |> filter(LAD20CD %in% c("E09000001", "E06000001")) # nolint check_res <- do.call(rbind, lapply(lad_files$file_name, read.csv)) # Filter for only Full time (1) and part time (2) -check_res_f <- check_res %>% filter(pwkstat == 1 | pwkstat == 2) +check_res_f <- check_res |> filter(pwkstat == 1 | pwkstat == 2) print("Producing age rescaling coefficients") @@ -61,7 +60,8 @@ unzip( exdir = paste(folder_step_1, "incomeDataAge", sep = "") ) -get_income_data <- function(sheet_name) { +# Reads the income percentile data by age for a given category +read_income_data_by_category <- function(sheet_name) { data <- read_excel( paste( folder_step_1, @@ -76,20 +76,20 @@ get_income_data <- function(sheet_name) { } # Income data by age, sex and pwkstat -age_mft <- get_income_data("Male Full-Time") -age_mpt <- get_income_data("Male Part-Time") -age_fft <- get_income_data("Female Full-Time") -age_fpt <- get_income_data("Female Part-Time") +age_mft <- read_income_data_by_category("Male Full-Time") +age_mpt <- read_income_data_by_category("Male Part-Time") +age_fft <- read_income_data_by_category("Female Full-Time") +age_fpt <- read_income_data_by_category("Female Part-Time") # Prepare data to read results of the previous modelling -check_res_mft <- check_res_f %>% filter(sex == 1 & pwkstat == 1) -check_res_mpt <- check_res_f %>% filter(sex == 1 & pwkstat == 2) -check_res_fft <- check_res_f %>% filter(sex == 2 & pwkstat == 1) -check_res_fpt <- check_res_f %>% filter(sex == 2 & pwkstat == 2) - -# Ready data to be able to model ONS data for any age -fit_col <- function(col, row, modelled, ord = 4) { - fit <- lm(modelled[, col] ~ poly(row, ord, raw = TRUE)) +check_res_mft <- check_res_f |> filter(sex == 1 & pwkstat == 1) +check_res_mpt <- check_res_f |> filter(sex == 1 & pwkstat == 2) +check_res_fft <- check_res_f |> filter(sex == 2 & pwkstat == 1) +check_res_fpt <- check_res_f |> filter(sex == 2 & pwkstat == 2) + +# Fits a column (quantile) of income data by age given ages (age midpoints) +fit_col <- function(col, ages, age_data, ord = 4) { + fit <- lm(age_data[, col] ~ poly(ages, ord, raw = TRUE)) as.numeric(c( fit$coefficients[1], fit$coefficients[2], @@ -99,21 +99,24 @@ fit_col <- function(col, row, modelled, ord = 4) { )) } +# Gets fitted coefficients (rows) for income percentiles (columns) output_ref_age <- function(age_data) { age_data <- as.matrix(age_data[2:8, c(7:11, 3, 12:16)]) age_data <- matrix(as.numeric(as.matrix(age_data)), ncol = ncol(age_data)) - coef_age_data <- sapply(seq_len(ncol(age_data)), function(x) { - fit_col(x, age_midpoints, age_data, 4) + coef_age_data <- sapply(seq_len(ncol(age_data)), function(col) { + fit_col(col, age_midpoints, age_data, 4) }) return(coef_age_data) } +# Get coefs matrix for each category coef_age_mft <- output_ref_age(age_mft) coef_age_mpt <- output_ref_age(age_mpt) coef_age_fft <- output_ref_age(age_fft) coef_age_fpt <- output_ref_age(age_fpt) -read_coef_age_data <- function(age, coef_age_data) { +# Gets the predicted income from fitted quartic model for each percentile +get_coef_age_data <- function(age, coef_age_data) { ref_val <- rep(NA, ncol(coef_age_data)) for (i in seq_len(ncol(coef_age_data))) { ref_val[i] <- ( @@ -128,26 +131,15 @@ read_coef_age_data <- function(age, coef_age_data) { } # Returns fitted values from cubic lm fit for vector of values -fitted2 <- function(fit, val) { - i <- val[1] - ret <- ( - fit$coefficients[1] - + fit$coefficients[2] * i - + fit$coefficients[3] * i^2 - + fit$coefficients[4] * i^3 - ) - if (length(val) > 1) { - for (i in val[2:length(val)]) { - ret <- c( - ret, - fit$coefficients[1] - + fit$coefficients[2] * i - + fit$coefficients[3] * i^2 - + fit$coefficients[4] * i^3 - ) - } - } - ret +fitted_cubic <- function(fit, vals) { + unlist(lapply(vals, function(x) { + ( + fit$coefficients[1] + + fit$coefficients[2] * x + + fit$coefficients[3] * x^2 + + fit$coefficients[4] * x^3 + ) + })) } # Gets global incomes for a given Male/Female, Full-Time/Part-Time combination. @@ -161,10 +153,10 @@ get_true_and_modelled <- function(modelled, age, coef) { # Ages above 67 are treated as 67 due to lack of data if (age > 66) { temp <- modelled$incomeH[modelled$age > 66] - true <- read_coef_age_data(67, coef) + true <- get_coef_age_data(67, coef) } else { temp <- modelled$incomeH[modelled$age == age] - true <- read_coef_age_data(age, coef) + true <- get_coef_age_data(age, coef) } modelled <- quantile( temp, @@ -212,19 +204,19 @@ make_age_row <- function(age, sex, full_time) { fit_ref_perc_true_global <- lm(ref_perc ~ poly(true_global, 3, raw = TRUE)) # deduce new percentile value (see methods) do.call(cbind, lapply(seq_len(100), function(perc) { - a <- fitted2(fit_true_global, perc) - b <- fitted2(fit_ref_perc, a) - c <- fitted2(fit_true, b) + a <- fitted_cubic(fit_true_global, perc) + b <- fitted_cubic(fit_ref_perc, a) + c <- fitted_cubic(fit_true, b) min( max( - 1, as.numeric(round(fitted2(fit_ref_perc_true_global, c))) + 1, as.numeric(round(fitted_cubic(fit_ref_perc_true_global, c))) ), 100 ) })) } -# Get rescaling +# Perform rescaling for each percentile (row) and age (column) age_rescale_mft <- mcmapply(function(x) { make_age_row(x, 1, TRUE) }, 16:86, mc.cores = cores, mc.set.seed = FALSE) @@ -238,11 +230,6 @@ age_rescale_fpt <- mcmapply(function(x) { make_age_row(x, 2, FALSE) }, 16:86, mc.cores = cores, mc.set.seed = FALSE) -# Note: mean of the Income distributions for each feature (groupby) should -# be the same after rescaling -# "~Exactly" same: pwkstat, sex -# - similar: SOC, Region - print("Writing modelled coefficients") write.table( age_rescale_mft,