
Commit

Merge branch 'gempyor-plugin' into new_inference
jcblemai authored Apr 12, 2024
2 parents 34b2314 + cf6beca commit 9f87b18
Showing 17 changed files with 239 additions and 207 deletions.
4 changes: 2 additions & 2 deletions batch/AWS_inference_runner.sh
@@ -3,7 +3,7 @@
set -x

# Expected environment variables from AWS Batch env
# S3_MODEL_DATA_PATH location in S3 with the code, data, and dvc pipeline to run
# S3_MODEL_PROJECT_PATH location in S3 with the code, data, and dvc pipeline to run
# DVC_OUTPUTS the names of the directories with outputs to save in S3, separated by a space
# SIMS_PER_JOB is the number of sims to run per job
# JOB_NAME the name of the job
@@ -40,7 +40,7 @@ aws configure set default.s3.multipart_chunksize 8MB

# Copy the complete model + data package from S3 and
# install the local R packages
aws s3 cp --quiet $S3_MODEL_DATA_PATH model_data.tar.gz
aws s3 cp --quiet $S3_MODEL_PROJECT_PATH model_data.tar.gz
mkdir model_data
tar -xzf model_data.tar.gz -C model_data # chadi: removed v(erbose) option here as it floods the log with data we have anyway from the s3 bucket
cd model_data
4 changes: 2 additions & 2 deletions batch/AWS_postprocess_runner.sh
@@ -3,7 +3,7 @@
set -x

# Expected environment variables from AWS Batch env
# S3_MODEL_DATA_PATH location in S3 with the code, data, and dvc pipeline to run
# S3_MODEL_PROJECT_PATH location in S3 with the code, data, and dvc pipeline to run
# DVC_OUTPUTS the names of the directories with outputs to save in S3, separated by a space
# SIMS_PER_JOB is the number of sims to run per job
# JOB_NAME the name of the job
@@ -34,7 +34,7 @@ aws configure set default.s3.multipart_chunksize 8MB

# Copy the complete model + data package from S3 and
# install the local R packages
aws s3 cp --quiet $S3_MODEL_DATA_PATH model_data.tar.gz
aws s3 cp --quiet $S3_MODEL_PROJECT_PATH model_data.tar.gz
mkdir model_data
tar -xzf model_data.tar.gz -C model_data # chadi: removed v(erbose) option here as it floods the log with data we have anyway from the s3 bucket
cd model_data
4 changes: 2 additions & 2 deletions batch/AWS_scenario_runner.sh
@@ -3,7 +3,7 @@
set -x

# Expected environment variables from AWS Batch env
# S3_MODEL_DATA_PATH location in S3 with the code, data, and dvc pipeline to run
# S3_MODEL_PROJECT_PATH location in S3 with the code, data, and dvc pipeline to run
# DVC_TARGET the name of the dvc file in the model that should be reproduced locally.
# DVC_OUTPUTS the names of the directories with outputs to save in S3, separated by a space
# S3_RESULTS_PATH location in S3 to store the results
@@ -24,7 +24,7 @@ aws configure set default.s3.multipart_chunksize 8MB

# Copy the complete model + data package from S3 and
# install the local R packages
aws s3 cp --quiet $S3_MODEL_DATA_PATH model_data.tar.gz
aws s3 cp --quiet $S3_MODEL_PROJECT_PATH model_data.tar.gz
mkdir model_data
tar -xvzf model_data.tar.gz -C model_data
cd model_data
2 changes: 1 addition & 1 deletion batch/SLURM_inference_job.run
@@ -7,7 +7,7 @@

set -x

cd $DATA_PATH
cd $PROJECT_PATH

FLEPI_SLOT_INDEX=${SLURM_ARRAY_TASK_ID}

6 changes: 3 additions & 3 deletions batch/inference_job_launcher.py
@@ -55,7 +55,7 @@ def user_confirmation(question="Continue?", default=False):
"--data-path",
"--data-path",
"data_path",
envvar="DATA_PATH",
envvar="PROJECT_PATH",
type=click.Path(exists=True),
default=".",
help="path to the data directory",
@@ -673,12 +673,12 @@ def launch(self, job_name, config_file, seir_modifiers_scenarios, outcome_modifi
## TODO: check how each of these variables are used downstream
base_env_vars = [
{"name": "BATCH_SYSTEM", "value": self.batch_system},
{"name": "S3_MODEL_DATA_PATH", "value": f"s3://{self.s3_bucket}/{job_name}.tar.gz"},
{"name": "S3_MODEL_PROJECT_PATH", "value": f"s3://{self.s3_bucket}/{job_name}.tar.gz"},
{"name": "DVC_OUTPUTS", "value": " ".join(self.outputs)},
{"name": "S3_RESULTS_PATH", "value": s3_results_path},
{"name": "FS_RESULTS_PATH", "value": fs_results_path},
{"name": "S3_UPLOAD", "value": str(self.s3_upload).lower()},
{"name": "DATA_PATH", "value": str(self.data_path)},
{"name": "PROJECT_PATH", "value": str(self.data_path)},
{"name": "FLEPI_PATH", "value": str(self.flepi_path)},
{"name": "CONFIG_PATH", "value": config_file},
{"name": "FLEPI_NUM_SLOTS", "value": str(self.num_jobs)},
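
The environment variables assembled above are what the downstream runner scripts (for example batch/AWS_inference_runner.sh) read at job start. A minimal sketch of how such a name/value list could be handed to AWS Batch with boto3 — the queue, job definition, and paths below are hypothetical, and this is not the launcher's actual submission code:

import boto3

batch = boto3.client("batch")

base_env_vars = [
    {"name": "S3_MODEL_PROJECT_PATH", "value": "s3://my-bucket/my-job.tar.gz"},  # hypothetical bucket and job name
    {"name": "PROJECT_PATH", "value": "/path/to/COVID19_USA"},                   # hypothetical project checkout
    {"name": "FLEPI_PATH", "value": "/path/to/flepiMoP"},                        # hypothetical flepiMoP checkout
]

# Each task in the array job inherits these variables via containerOverrides.
batch.submit_job(
    jobName="example-inference-job",        # hypothetical
    jobQueue="example-job-queue",           # hypothetical
    jobDefinition="example-job-definition", # hypothetical
    arrayProperties={"size": 10},
    containerOverrides={"environment": base_env_vars},
)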
2 changes: 1 addition & 1 deletion batch/scenario_job.py
@@ -213,7 +213,7 @@ def launch_job_inner(
results_path = f"s3://{s3_output_bucket}/{job_name}"
env_vars = [
{"name": "CONFIG_PATH", "value": config_file},
{"name": "S3_MODEL_DATA_PATH", "value": model_data_path},
{"name": "S3_MODEL_PROJECT_PATH", "value": model_data_path},
{"name": "DVC_TARGET", "value": dvc_target},
{"name": "DVC_OUTPUTS", "value": " ".join(dvc_outputs)},
{"name": "S3_RESULTS_PATH", "value": results_path},
1 change: 1 addition & 0 deletions flepimop/R_packages/flepicommon/NAMESPACE
@@ -24,6 +24,7 @@ export(load_config)
export(load_geodata_file)
export(prettyprint_optlist)
export(read_file_of_type)
export(read_parquet_with_check)
export(run_id)
import(covidcast)
import(doParallel)
35 changes: 30 additions & 5 deletions flepimop/R_packages/flepicommon/R/DataUtils.R
@@ -38,7 +38,7 @@ load_geodata_file <- function(filename,

if(state_name) {
utils::data(fips_us_county, package = "flepicommon") # arrow::read_parquet("datasetup/usdata/fips_us_county.parquet")
geodata <- fips_us_county %>%
geodata <- fips_us_county %>%
dplyr::distinct(state, state_name) %>%
dplyr::rename(USPS = state) %>%
dplyr::rename(state = state_name) %>%
@@ -53,6 +53,31 @@



#' Read parquet files with check for existence to understand errors
#'
#' @param file The file to read
#'
#' @return A data frame (tibble) with the contents of the parquet file
#' @export
#'
#' @examples
read_parquet_with_check <- function(file){
if(!file.exists(file)){
stop(paste("File",file,"does not exist"))
}
arrow::read_parquet(file)
}
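
The Python side of the pipeline reads parquet files as well; a rough Python analogue of the same existence-check-then-read guard (a sketch using pyarrow, not part of flepicommon or gempyor) could look like:

from pathlib import Path

import pyarrow.parquet as pq


def read_parquet_with_check(file: str):
    """Read a parquet file, failing with an explicit message if it is missing."""
    if not Path(file).exists():
        raise FileNotFoundError(f"File {file} does not exist")
    return pq.read_table(file).to_pandas()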












#' Deprecated function that returns a function to read files of a specific type (or automatically detected type based on extension)
#' @param extension The file extension to read files of
@@ -113,11 +138,11 @@ read_file_of_type <- function(extension,...){
# ##' @importFrom cdlTools fips census2010FIPS stateNames
# ##'
# download_USAFacts_data <- function(filename, url, value_col_name, incl_unassigned = FALSE){
#
#
# dir.create(dirname(filename), showWarnings = FALSE, recursive = TRUE)
# message(paste("Downloading", url, "to", filename))
# download.file(url, filename, "auto")
#
#
# usafacts_data <- readr::read_csv(filename)
# names(usafacts_data) <- stringr::str_to_lower(names(usafacts_data))
# usafacts_data <- dplyr::select(usafacts_data, -statefips,-`county name`) %>% # drop statefips columns
@@ -141,13 +166,13 @@ read_file_of_type <- function(extension,...){
# date_func <- ifelse(any(grepl("^\\d\\d\\d\\d",col_names)),lubridate::ymd, lubridate::mdy)
# usafacts_data <- tidyr::pivot_longer(usafacts_data, tidyselect::all_of(date_cols), names_to="Update", values_to=value_col_name)
# usafacts_data <- dplyr::mutate(usafacts_data, Update=date_func(Update), FIPS=sprintf("%05d", FIPS))
#
#
# validation_date <- Sys.getenv("VALIDATION_DATE")
# if ( validation_date != '' ) {
# print(paste("(DataUtils.R) Limiting USAFacts data to:", validation_date, sep=" "))
# usafacts_data <- dplyr::filter(usafacts_data, Update < validation_date )
# }
#
#
# return(usafacts_data)
# }

Binary file not shown.
30 changes: 13 additions & 17 deletions flepimop/R_packages/inference/R/inference_slot_runner_funcs.R
@@ -614,8 +614,6 @@ initialize_mcmc_first_block <- function(
}




## initial conditions (init)

if (!is.null(config$initial_conditions)){
@@ -633,19 +631,18 @@
}
if (grepl(".csv", initial_init_file)){
initial_init <- readr::read_csv(initial_init_file)
config$initial_conditions$initial_conditions_file <- gsub(".csv", ".parquet", config$initial_conditions$initial_conditions_file)
arrow::write_parquet(initial_init, config$initial_conditions$initial_conditions_file)
arrow::write_parquet(initial_init, global_files[["init_filename"]])
}else{
err <- !(file.copy(initial_init_file, global_files[["init_filename"]]))
if (err != 0) {
stop("Could not copy initial conditions file")
}
}

err <- !(file.copy(config$initial_conditions$initial_conditions_file, global_files[["init_filename"]]))
if (err != 0) {
stop("Could not copy initial conditions file")
}
} else if (config$initial_conditions$method %in% c("InitialConditionsFolderDraw", "SetInitialConditionsFolderDraw")) {
print("Initial conditions in inference has not been fully implemented yet for the 'folder draw' methods,
and no copying to global or chimeric files is being done.")


if (is.null(config$initial_conditions$initial_file_type)) {
stop("ERROR: Initial conditions file needs to be specified in the config under `initial_conditions:initial_conditions_file`")
}
@@ -654,15 +651,14 @@
if (!file.exists(initial_init_file)) {
stop("ERROR: Initial conditions file specified but does not exist.")
}
if (grepl(".csv", initial_init_file)){
if (grepl(".csv", initial_init_file)){
initial_init <- readr::read_csv(initial_init_file)
initial_init_file <- gsub(".csv", ".parquet", initial_init_file)
arrow::write_parquet(initial_init, initial_init_file)
}

err <- !(file.copy(initial_init_file, global_files[["init_filename"]]))
if (err != 0) {
stop("Could not copy initial conditions file")
arrow::write_parquet(initial_init, global_files[["init_filename"]])
}else{
err <- !(file.copy(initial_init_file, global_files[["init_filename"]]))
if (err != 0) {
stop("Could not copy initial conditions file")
}
}

}
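
The revised branches above convert a CSV initial-conditions file and write the parquet directly to the slot's init filename, rather than rewriting the path stored in the config. A rough Python sketch of that convert-or-copy pattern (illustrative only; the file paths are hypothetical and pandas/pyarrow are assumed available):

import shutil
from pathlib import Path

import pandas as pd


def stage_initial_conditions(initial_init_file: str, init_filename: str) -> None:
    """Convert a CSV initial-conditions file to parquet, or copy it if it is already parquet."""
    src = Path(initial_init_file)
    if not src.exists():
        raise FileNotFoundError(f"Initial conditions file {src} does not exist")
    if src.suffix == ".csv":
        pd.read_csv(src).to_parquet(init_filename)  # convert instead of copying
    else:
        shutil.copy(src, init_filename)

# Hypothetical usage:
# stage_initial_conditions("model_input/init.csv", "model_output/init/global/000000001.init.parquet")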
4 changes: 2 additions & 2 deletions flepimop/gempyor_pkg/docs/integration_benchmark.ipynb
@@ -31,14 +31,14 @@
"Run once\n",
"```python\n",
"export FLEPI_PATH=$(pwd)/flepiMoP\n",
"export DATA_PATH=$(pwd)/COVID19_USA\n",
"export PROJECT_PATH=$(pwd)/COVID19_USA\n",
"conda activate covidSProd6\n",
"cd $FLEPI_PATH\n",
"Rscript build/local_install.R\n",
"python setup.py develop --no-deps\n",
"git lfs install\n",
"git lfs pull\n",
"cd $DATA_PATH\n",
"cd $PROJECT_PATH\n",
"git restore data/\n",
"export CONFIG_PATH=config_smh_r11_optsev_highie_base_deathscases_blk1.yml\n",
"Rscript $FLEPI_PATH/datasetup/build_US_setup.R\n",
2 changes: 1 addition & 1 deletion flepimop/gempyor_pkg/src/gempyor/seeding_ic.py
@@ -482,7 +482,7 @@ def get_from_config(self, sim_id: int, setup) -> np.ndarray:
else:
raise ValueError(
f"Initial Conditions: Could not set compartment {comp_name} (id: {comp_idx}) in subpop {pl} (id: {pl_idx}). The data from the init file is {states_pl}. \n \
Use 'allow_missing_compartments' to default to 0 for compartments without initial conditions"
Use 'allow_missing_compartments' to default to 0 for compartments without initial conditions"
)
if "rest" in str(ic_df_compartment_val).strip().lower():
rests.append([comp_idx, pl_idx])
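
The error above fires when a model compartment has no matching row in the initial-conditions file. A simplified sketch of the default-to-zero behaviour that 'allow_missing_compartments' enables (the column labels and function name are assumptions; this is not gempyor's actual implementation):

import numpy as np
import pandas as pd


def build_y0(compartments, subpops, ic_df: pd.DataFrame, allow_missing_compartments: bool = False) -> np.ndarray:
    """Fill an initial-conditions array, leaving missing compartments at 0 when allowed."""
    y0 = np.zeros((len(compartments), len(subpops)))
    for comp_idx, comp_name in enumerate(compartments):
        for pl_idx, pl in enumerate(subpops):
            rows = ic_df[(ic_df["mc_name"] == comp_name) & (ic_df["subpop"] == pl)]  # column names assumed
            if rows.empty:
                if allow_missing_compartments:
                    continue  # default this compartment to 0
                raise ValueError(f"Could not set compartment {comp_name} in subpop {pl}")
            y0[comp_idx, pl_idx] = rows["amount"].sum()  # column name assumed
    return y0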
(Remaining changed files not shown.)
