Commit: Workplaces finished
HSalat committed Apr 27, 2023
1 parent c0a5dd2 commit 108794b
Showing 3 changed files with 40 additions and 29 deletions.
10 changes: 7 additions & 3 deletions scripts/data_prep/README.md
@@ -4,17 +4,21 @@

## Step 1: Curate public data from diverse sources (WIP)

-This step requires an API key that can be obtained by registering with [nomisweb](https://www.nomisweb.co.uk/). Once registered, the API key can be found [here](https://www.nomisweb.co.uk/myaccount/webservice.asp). Replace the content of `raw_to_prepared_nomisAPIKey.txt` with this key.
+1. This step requires a nomis API key that can be obtained by registering with [nomisweb](https://www.nomisweb.co.uk/). Once registered, the API key can be found [here](https://www.nomisweb.co.uk/myaccount/webservice.asp). Replace the content of `raw_to_prepared_nomisAPIKey.txt` with this key.

-Use `raw_to_prepared.R`. Health and time use data are safeguarded, download directly from [10.5255/UKDA-SN-8860-1](http://doi.org/10.5255/UKDA-SN-8860-1), [10.5255/UKDA-SN-8090-1](http://doi.org/10.5255/UKDA-SN-8090-1), [10.5255/UKDA-SN-8737-1](http://doi.org/10.5255/UKDA-SN-8737-1) and [10.5255/UKDA-SN-8128-1](http://doi.org/10.5255/UKDA-SN-8128-1) and place inside the download folder (path defined by `folderIn`) before running the script.
+2. Use `raw_to_prepared_Environment.R` to install the necessary R packages and create directories.
+
+3. Manually download the safeguarded/geoportal data and place it inside the `Data/dl` directory. Required: [LSOA centroids in csv format](https://geoportal.statistics.gov.uk/datasets/ons::lsoa-dec-2011-population-weighted-centroids-in-england-and-wales/explore) (adapt l. 219-220 of `raw_to_prepared_Workplaces.R` if necessary); health and time use data (safeguarded; download directly from [10.5255/UKDA-SN-8860-1](http://doi.org/10.5255/UKDA-SN-8860-1), [10.5255/UKDA-SN-8090-1](http://doi.org/10.5255/UKDA-SN-8090-1), [10.5255/UKDA-SN-8737-1](http://doi.org/10.5255/UKDA-SN-8737-1) and [10.5255/UKDA-SN-8128-1](http://doi.org/10.5255/UKDA-SN-8128-1)).
+
+4. Run `raw_to_prepared.R`.
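Taken together, the four steps amount to something like the following shell session. This is a sketch only: the key value is a placeholder you must replace with your own, the `Rscript` invocations (shown as comments so the sketch stands alone) are assumed to be run from `scripts/data_prep`, and the safeguarded downloads cannot be scripted.

```shell
# Step 1: store your nomis API key (placeholder value shown)
printf '%s' "YOUR-NOMIS-API-KEY" > raw_to_prepared_nomisAPIKey.txt

# Steps 2-4:
#   Rscript raw_to_prepared_Environment.R   # installs packages; creates Data/, Data/dl, Data/prepData/
#   ...manually place the UKDA and geoportal downloads inside Data/dl...
#   Rscript raw_to_prepared.R               # produces the prepared files

# Sanity check: the key file should be non-empty
wc -c raw_to_prepared_nomisAPIKey.txt
```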

This step outputs two types of files:
- `diariesRef.csv`, `businessRegistry.csv` and `timeAtHomeIncreaseCTY.csv` should be gzipped and stored directly inside `nationaldata-v2` on Azure; and `lookUp-GB.csv` inside `referencedata` on Azure. These files are directly used by SPC.
- The other files (30 items) are used by the next step. They have been saved within `SAVE_SPC_required_data.zip` for convenience.

Refer to the [data sources](https://alan-turing-institute.github.io/uatk-spc/data_sources.html) to learn more about the raw data and the content of the files.
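The storage step above can be sketched as follows. This is a hypothetical local staging layout: the two directories merely stand in for the `nationaldata-v2` and `referencedata` Azure containers, and the `touch` line creates empty placeholders purely so the sketch is self-contained (the real files come from `raw_to_prepared.R`).

```shell
# Placeholders standing in for the real outputs of raw_to_prepared.R
touch diariesRef.csv businessRegistry.csv timeAtHomeIncreaseCTY.csv lookUp-GB.csv

# Local stand-ins for the two Azure containers
mkdir -p nationaldata-v2 referencedata

# gzip the three SPC inputs and stage them
for f in diariesRef.csv businessRegistry.csv timeAtHomeIncreaseCTY.csv; do
  gzip -c "$f" > "nationaldata-v2/$f.gz"
done

# lookUp-GB.csv is staged as-is
cp lookUp-GB.csv referencedata/

ls nationaldata-v2 referencedata
```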

-The script calls `raw_to_prepared_Income.R` to produce income data for the next step. Note that only the modelled coefficients for hourly salaries (averaged over all age groups) and number of hours worked are produced by the script. The age rescaling coefficients require running the entire population once without rescaling, which is not practical. The methodology is commented out for reference. Use the content of `SAVE_SPC_required_data.zip` to obtain these coefficients. The script also calls `raw_to_prepared_Workplaces.R` to create `businessRegistry.csv` (note that this script can only be used on its own after some of the content of `raw_to_prepared.R` have been created).
+The script calls `raw_to_prepared_Income.R` to produce income data for the next step. Note that the script produces only the modelled coefficients for hourly salaries (averaged over all age groups) and the number of hours worked. The age rescaling coefficients require running the entire population once without rescaling, which is not practical; the methodology is left commented out for reference. Use the content of `SAVE_SPC_required_data.zip` to obtain these coefficients. The script also calls `raw_to_prepared_Workplaces.R` to create `businessRegistry.csv` (note that this script can only be run on its own after some of the content of `raw_to_prepared.R` has been created).

## Step 2: Add to SPENSER

6 changes: 6 additions & 0 deletions scripts/data_prep/raw_to_prepared_Environment.R
@@ -0,0 +1,6 @@
install.packages(c('readr', 'dplyr', 'tidyr', 'janitor', 'foreign', 'sp', 'rgdal', 'raster', 'fitdistrplus', 'reshape2', 'stringr', 'parallel', 'readxl'))
#install.packages('ggplot2')

dir.create("Data/")
dir.create("Data/dl")
dir.create("Data/prepData/")
53 changes: 27 additions & 26 deletions scripts/data_prep/raw_to_prepared_Workplaces.R
@@ -1,8 +1,6 @@
library(dplyr)
library(tidyr)
library(rgdal)
library(rgeos)
library(raster)
library(sp)
library(foreign)
library(reshape2)
@@ -186,36 +184,40 @@ busPop <- busPop[,c(1,6,2,4:5)]
# 'lsoa' field
print("Assigning LSOAs...")
lsoaData <- merge(lsoaData,oatoOther,by.x="LSOA11CD",by.y="LSOA11CD")
-msoaFilling <- function(busPop,name){
+msoaFilling <- function(name,lsoaData,MSOA11CD,sic2d07){
   lsoa <- lsoaData %>% filter(MSOA11CD == name)
-  for(i in unique(busPop$sic2d07)){
-    ref <- which(busPop$MSOA11CD == name & busPop$sic2d07 == i)
-    weights <- lsoa[,paste("X",str_pad(i, 2, pad = "0"),sep = "")]
-    if(sum(weights > 0)){
-      busPop$LSOA11CD[ref] <- sample(lsoa$LSOA11CD, length(ref), replace = T, prob = weights)
-    }else{
-      busPop$LSOA11CD[ref] <- sample(lsoa$LSOA11CD, length(ref), replace = T)
-    }
-  }
-  return(busPop)
-}
-busPop$LSOA11CD <- NA
-for(i in unique(busPop2$MSOA11CD)){
-  if(i%%80 == 0){print(paste(round(i/length(unique(busPop2$MSOA11CD)),1),"%",sep = ""))}
-  busPop <- msoaFilling(busPop,i)
-}
+  ref <- which(MSOA11CD == name)
+  sic <- sic2d07[ref]
+  res <- rep(NA,length(ref))
+  for(i in 1:length(unique(sic))){
+    ref2 <- which(sic == unique(sic)[i])
+    weights <- lsoa[,paste("X",str_pad(unique(sic)[i], 2, pad = "0"),sep = "")]
+    potlsoa <- lsoa$LSOA11CD
+    ifelse(sum(weights) > 0, res[ref2] <- sample(potlsoa, length(ref2), prob = weights, replace = T),
+           res[ref2] <- sample(potlsoa, length(ref2), replace = T))
+  }
+  return(res)
+}
+
+LSOA11CD <- sapply(unique(busPop$MSOA11CD),function(x){msoaFilling(x,lsoaData,busPop$MSOA11CD,busPop$sic2d07)})
+LSOA11CD <- unname(unlist(LSOA11CD))
+
+#LSOA11CD <- msoaFilling(unique(busPop$MSOA11CD)[1],lsoaData,busPop$MSOA11CD,busPop$sic2d07)
+#for(i in 2:length(unique(busPop$MSOA11CD))){
+#  if(i%%80 == 0){print(paste(round(i/length(unique(busPop$MSOA11CD)),2)*100,"%",sep = ""))}
+#  res <- msoaFilling(unique(busPop$MSOA11CD)[i],lsoaData,busPop$MSOA11CD,busPop$sic2d07)
+#  LSOA11CD <- c(LSOA11CD,res)
+#}
+
+busPop$LSOA11CD <- LSOA11CD

# 'lng' and 'lat' fields
print("Adding coordinates...")

# England and Wales
-download.file("https://stg-arcgisazurecdataprod1.az.arcgis.com/exportfiles-1559-15693/Lower_layer_Super_Output_Areas_Dec_2011_Boundaries_Full_Clipped_BFC_EW_V3_2022_3601855424856006397.csv?sv=2018-03-28&sr=b&sig=tmZTl6Eh6ryGtEsEaHWPbp0GKF2SUcejnO1DeF7csk4%3D&se=2023-04-26T15%3A58%3A01Z&sp=r",destfile = paste(folderIn,"Lower_Layer_Super_Output_Areas__December_2011__Boundaries_Full_Clipped__BFC__EW_V3.csv",sep = ""))
-shp <- read.csv(paste(folderIn,"Lower_Layer_Super_Output_Areas__December_2011__Boundaries_Full_Clipped__BFC__EW_V3.csv",sep = ""))
-coords <- data.frame(LSOA11CD = shp$LSOA11CD, lng = shp$LONG_, lat = shp$LAT)
+#download.file("https://stg-arcgisazurecdataprod1.az.arcgis.com/exportfiles-1559-15693/Lower_layer_Super_Output_Areas_Dec_2011_Boundaries_Full_Clipped_BFC_EW_V3_2022_3601855424856006397.csv?sv=2018-03-28&sr=b&sig=tmZTl6Eh6ryGtEsEaHWPbp0GKF2SUcejnO1DeF7csk4%3D&se=2023-04-26T15%3A58%3A01Z&sp=r",destfile = paste(folderIn,"Lower_Layer_Super_Output_Areas__December_2011__Boundaries_Full_Clipped__BFC__EW_V3.csv",sep = ""))
+shp <- read.csv(paste(folderIn,"LSOA_Dec_2011_PWC_in_England_and_Wales_2022_1923591000694358693.csv",sep = ""))
+coords <- data.frame(LSOA11CD = shp$LSOA11CD, lng = shp$x, lat = shp$y)

# Scotland
download.file("https://maps.gov.scot/ATOM/shapefiles/SG_DataZoneCent_2011.zip",destfile = paste(folderIn,"SG_DataZoneCent_2011.zip",sep = ""))
@@ -231,10 +233,9 @@ coords3 <- coords3@coords
coords3 <- data.frame(LSOA11CD = coords2$DataZone, lng = coords3[,1], lat = coords3[,2])

refLSOA <- rbind(coords,coords3)
-busPop2 <- merge(busPop,refLSOA,by.x = "LSOA11CD",by.y = "LSOA11CD")
+busPop <- merge(busPop,refLSOA,by.x = "LSOA11CD",by.y = "LSOA11CD")

-busPop <- busPop2[,c(3,4,5,1,9,10,6,2)]
-colnames(busPop)[7] <- "sic1d07"
+busPop <- busPop[,c(2:4,1,7,8,5,6)]
busPop <- busPop[order(busPop$id),]
row.names(busPop) <- 1:nrow(busPop)

