
Commit

Merge branch 'gempyor-plugin' into new_inference
jcblemai authored Apr 12, 2024
2 parents 34b2314 + cf6beca commit 9f87b18
Showing 17 changed files with 239 additions and 207 deletions.
4 changes: 2 additions & 2 deletions batch/AWS_inference_runner.sh
@@ -3,7 +3,7 @@
set -x

# Expected environment variables from AWS Batch env
# S3_MODEL_DATA_PATH location in S3 with the code, data, and dvc pipeline to run
# S3_MODEL_PROJECT_PATH location in S3 with the code, data, and dvc pipeline to run
# DVC_OUTPUTS the names of the directories with outputs to save in S3, separated by a space
# SIMS_PER_JOB is the number of sims to run per job
# JOB_NAME the name of the job
@@ -40,7 +40,7 @@ aws configure set default.s3.multipart_chunksize 8MB

# Copy the complete model + data package from S3 and
# install the local R packages
aws s3 cp --quiet $S3_MODEL_DATA_PATH model_data.tar.gz
aws s3 cp --quiet $S3_MODEL_PROJECT_PATH model_data.tar.gz
mkdir model_data
tar -xzf model_data.tar.gz -C model_data # chadi: removed v(erbose) option here as it floods the log with data we have anyway from the s3 bucket
cd model_data
4 changes: 2 additions & 2 deletions batch/AWS_postprocess_runner.sh
@@ -3,7 +3,7 @@
set -x

# Expected environment variables from AWS Batch env
# S3_MODEL_DATA_PATH location in S3 with the code, data, and dvc pipeline to run
# S3_MODEL_PROJECT_PATH location in S3 with the code, data, and dvc pipeline to run
# DVC_OUTPUTS the names of the directories with outputs to save in S3, separated by a space
# SIMS_PER_JOB is the number of sims to run per job
# JOB_NAME the name of the job
@@ -34,7 +34,7 @@ aws configure set default.s3.multipart_chunksize 8MB

# Copy the complete model + data package from S3 and
# install the local R packages
aws s3 cp --quiet $S3_MODEL_DATA_PATH model_data.tar.gz
aws s3 cp --quiet $S3_MODEL_PROJECT_PATH model_data.tar.gz
mkdir model_data
tar -xzf model_data.tar.gz -C model_data # chadi: removed v(erbose) option here as it floods the log with data we have anyway from the s3 bucket
cd model_data
4 changes: 2 additions & 2 deletions batch/AWS_scenario_runner.sh
@@ -3,7 +3,7 @@
set -x

# Expected environment variables from AWS Batch env
# S3_MODEL_DATA_PATH location in S3 with the code, data, and dvc pipeline to run
# S3_MODEL_PROJECT_PATH location in S3 with the code, data, and dvc pipeline to run
# DVC_TARGET the name of the dvc file in the model that should be reproduced locally.
# DVC_OUTPUTS the names of the directories with outputs to save in S3, separated by a space
# S3_RESULTS_PATH location in S3 to store the results
@@ -24,7 +24,7 @@ aws configure set default.s3.multipart_chunksize 8MB

# Copy the complete model + data package from S3 and
# install the local R packages
aws s3 cp --quiet $S3_MODEL_DATA_PATH model_data.tar.gz
aws s3 cp --quiet $S3_MODEL_PROJECT_PATH model_data.tar.gz
mkdir model_data
tar -xvzf model_data.tar.gz -C model_data
cd model_data
2 changes: 1 addition & 1 deletion batch/SLURM_inference_job.run
@@ -7,7 +7,7 @@

set -x

cd $DATA_PATH
cd $PROJECT_PATH

FLEPI_SLOT_INDEX=${SLURM_ARRAY_TASK_ID}

6 changes: 3 additions & 3 deletions batch/inference_job_launcher.py
@@ -55,7 +55,7 @@ def user_confirmation(question="Continue?", default=False):
"--data-path",
"--data-path",
"data_path",
envvar="DATA_PATH",
envvar="PROJECT_PATH",
type=click.Path(exists=True),
default=".",
help="path to the data directory",
@@ -673,12 +673,12 @@ def launch(self, job_name, config_file, seir_modifiers_scenarios, outcome_modifi
## TODO: check how each of these variables are used downstream
base_env_vars = [
{"name": "BATCH_SYSTEM", "value": self.batch_system},
{"name": "S3_MODEL_DATA_PATH", "value": f"s3://{self.s3_bucket}/{job_name}.tar.gz"},
{"name": "S3_MODEL_PROJECT_PATH", "value": f"s3://{self.s3_bucket}/{job_name}.tar.gz"},
{"name": "DVC_OUTPUTS", "value": " ".join(self.outputs)},
{"name": "S3_RESULTS_PATH", "value": s3_results_path},
{"name": "FS_RESULTS_PATH", "value": fs_results_path},
{"name": "S3_UPLOAD", "value": str(self.s3_upload).lower()},
{"name": "DATA_PATH", "value": str(self.data_path)},
{"name": "PROJECT_PATH", "value": str(self.data_path)},
{"name": "FLEPI_PATH", "value": str(self.flepi_path)},
{"name": "CONFIG_PATH", "value": config_file},
{"name": "FLEPI_NUM_SLOTS", "value": str(self.num_jobs)},
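
The environment variables assembled above are what the downstream runner scripts (for example batch/AWS_inference_runner.sh) read at job start. A minimal sketch of how such a name/value list could be handed to AWS Batch with boto3 — the queue, job definition, and paths below are hypothetical, and this is not the launcher's actual submission code:

import boto3

batch = boto3.client("batch")

base_env_vars = [
    {"name": "S3_MODEL_PROJECT_PATH", "value": "s3://my-bucket/my-job.tar.gz"},  # hypothetical bucket and job name
    {"name": "PROJECT_PATH", "value": "/path/to/COVID19_USA"},                   # hypothetical project checkout
    {"name": "FLEPI_PATH", "value": "/path/to/flepiMoP"},                        # hypothetical flepiMoP checkout
]

# Each task in the array job inherits these variables via containerOverrides.
batch.submit_job(
    jobName="example-inference-job",        # hypothetical
    jobQueue="example-job-queue",           # hypothetical
    jobDefinition="example-job-definition", # hypothetical
    arrayProperties={"size": 10},
    containerOverrides={"environment": base_env_vars},
)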
2 changes: 1 addition & 1 deletion batch/scenario_job.py
@@ -213,7 +213,7 @@ def launch_job_inner(
results_path = f"s3://{s3_output_bucket}/{job_name}"
env_vars = [
{"name": "CONFIG_PATH", "value": config_file},
{"name": "S3_MODEL_DATA_PATH", "value": model_data_path},
{"name": "S3_MODEL_PROJECT_PATH", "value": model_data_path},
{"name": "DVC_TARGET", "value": dvc_target},
{"name": "DVC_OUTPUTS", "value": " ".join(dvc_outputs)},
{"name": "S3_RESULTS_PATH", "value": results_path},
1 change: 1 addition & 0 deletions flepimop/R_packages/flepicommon/NAMESPACE
@@ -24,6 +24,7 @@ export(load_config)
export(load_geodata_file)
export(prettyprint_optlist)
export(read_file_of_type)
export(read_parquet_with_check)
export(run_id)
import(covidcast)
import(doParallel)
35 changes: 30 additions & 5 deletions flepimop/R_packages/flepicommon/R/DataUtils.R
@@ -38,7 +38,7 @@ load_geodata_file <- function(filename,

if(state_name) {
utils::data(fips_us_county, package = "flepicommon") # arrow::read_parquet("datasetup/usdata/fips_us_county.parquet")
geodata <- fips_us_county %>%
geodata <- fips_us_county %>%
dplyr::distinct(state, state_name) %>%
dplyr::rename(USPS = state) %>%
dplyr::rename(state = state_name) %>%
@@ -53,6 +53,31 @@



#' Read parquet files with check for existence to understand errors
#'
#' @param file The file to read
#'
#' @return A data frame (tibble) with the contents of the parquet file
#' @export
#'
#' @examples
read_parquet_with_check <- function(file){
if(!file.exists(file)){
stop(paste("File",file,"does not exist"))
}
arrow::read_parquet(file)
}
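
The Python side of the pipeline reads parquet files as well; a rough Python analogue of the same existence-check-then-read guard (a sketch using pyarrow, not part of flepicommon or gempyor) could look like:

from pathlib import Path

import pyarrow.parquet as pq


def read_parquet_with_check(file: str):
    """Read a parquet file, failing with an explicit message if it is missing."""
    if not Path(file).exists():
        raise FileNotFoundError(f"File {file} does not exist")
    return pq.read_table(file).to_pandas()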












#' Deprecated function that returns a function to read files of a specific type (or automatically detected type based on extension)
#' @param extension The file extension to read files of
@@ -113,11 +138,11 @@ read_file_of_type <- function(extension,...){
# ##' @importFrom cdlTools fips census2010FIPS stateNames
# ##'
# download_USAFacts_data <- function(filename, url, value_col_name, incl_unassigned = FALSE){
#
#
# dir.create(dirname(filename), showWarnings = FALSE, recursive = TRUE)
# message(paste("Downloading", url, "to", filename))
# download.file(url, filename, "auto")
#
#
# usafacts_data <- readr::read_csv(filename)
# names(usafacts_data) <- stringr::str_to_lower(names(usafacts_data))
# usafacts_data <- dplyr::select(usafacts_data, -statefips,-`county name`) %>% # drop statefips columns
@@ -141,13 +166,13 @@ read_file_of_type <- function(extension,...){
# date_func <- ifelse(any(grepl("^\\d\\d\\d\\d",col_names)),lubridate::ymd, lubridate::mdy)
# usafacts_data <- tidyr::pivot_longer(usafacts_data, tidyselect::all_of(date_cols), names_to="Update", values_to=value_col_name)
# usafacts_data <- dplyr::mutate(usafacts_data, Update=date_func(Update), FIPS=sprintf("%05d", FIPS))
#
#
# validation_date <- Sys.getenv("VALIDATION_DATE")
# if ( validation_date != '' ) {
# print(paste("(DataUtils.R) Limiting USAFacts data to:", validation_date, sep=" "))
# usafacts_data <- dplyr::filter(usafacts_data, Update < validation_date )
# }
#
#
# return(usafacts_data)
# }

Binary file not shown.
30 changes: 13 additions & 17 deletions flepimop/R_packages/inference/R/inference_slot_runner_funcs.R
@@ -614,8 +614,6 @@ initialize_mcmc_first_block <- function(
}




## initial conditions (init)

if (!is.null(config$initial_conditions)){
@@ -633,19 +631,18 @@
}
if (grepl(".csv", initial_init_file)){
initial_init <- readr::read_csv(initial_init_file)
config$initial_conditions$initial_conditions_file <- gsub(".csv", ".parquet", config$initial_conditions$initial_conditions_file)
arrow::write_parquet(initial_init, config$initial_conditions$initial_conditions_file)
arrow::write_parquet(initial_init, global_files[["init_filename"]])
}else{
err <- !(file.copy(initial_init_file, global_files[["init_filename"]]))
if (err != 0) {
stop("Could not copy initial conditions file")
}
}

err <- !(file.copy(config$initial_conditions$initial_conditions_file, global_files[["init_filename"]]))
if (err != 0) {
stop("Could not copy initial conditions file")
}
} else if (config$initial_conditions$method %in% c("InitialConditionsFolderDraw", "SetInitialConditionsFolderDraw")) {
print("Initial conditions in inference has not been fully implemented yet for the 'folder draw' methods,
and no copying to global or chimeric files is being done.")


if (is.null(config$initial_conditions$initial_file_type)) {
stop("ERROR: Initial conditions file needs to be specified in the config under `initial_conditions:initial_conditions_file`")
}
@@ -654,15 +651,14 @@
if (!file.exists(initial_init_file)) {
stop("ERROR: Initial conditions file specified but does not exist.")
}
if (grepl(".csv", initial_init_file)){
if (grepl(".csv", initial_init_file)){
initial_init <- readr::read_csv(initial_init_file)
initial_init_file <- gsub(".csv", ".parquet", initial_init_file)
arrow::write_parquet(initial_init, initial_init_file)
}

err <- !(file.copy(initial_init_file, global_files[["init_filename"]]))
if (err != 0) {
stop("Could not copy initial conditions file")
arrow::write_parquet(initial_init, global_files[["init_filename"]])
}else{
err <- !(file.copy(initial_init_file, global_files[["init_filename"]]))
if (err != 0) {
stop("Could not copy initial conditions file")
}
}

}
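
The revised branches above convert a CSV initial-conditions file and write the parquet directly to the slot's init filename, rather than rewriting the path stored in the config. A rough Python sketch of that convert-or-copy pattern (illustrative only; the file paths are hypothetical and pandas/pyarrow are assumed available):

import shutil
from pathlib import Path

import pandas as pd


def stage_initial_conditions(initial_init_file: str, init_filename: str) -> None:
    """Convert a CSV initial-conditions file to parquet, or copy it if it is already parquet."""
    src = Path(initial_init_file)
    if not src.exists():
        raise FileNotFoundError(f"Initial conditions file {src} does not exist")
    if src.suffix == ".csv":
        pd.read_csv(src).to_parquet(init_filename)  # convert instead of copying
    else:
        shutil.copy(src, init_filename)

# Hypothetical usage:
# stage_initial_conditions("model_input/init.csv", "model_output/init/global/000000001.init.parquet")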
4 changes: 2 additions & 2 deletions flepimop/gempyor_pkg/docs/integration_benchmark.ipynb
@@ -31,14 +31,14 @@
"Run once\n",
"```python\n",
"export FLEPI_PATH=$(pwd)/flepiMoP\n",
"export DATA_PATH=$(pwd)/COVID19_USA\n",
"export PROJECT_PATH=$(pwd)/COVID19_USA\n",
"conda activate covidSProd6\n",
"cd $FLEPI_PATH\n",
"Rscript build/local_install.R\n",
"python setup.py develop --no-deps\n",
"git lfs install\n",
"git lfs pull\n",
"cd $DATA_PATH\n",
"cd $PROJECT_PATH\n",
"git restore data/\n",
"export CONFIG_PATH=config_smh_r11_optsev_highie_base_deathscases_blk1.yml\n",
"Rscript $FLEPI_PATH/datasetup/build_US_setup.R\n",
2 changes: 1 addition & 1 deletion flepimop/gempyor_pkg/src/gempyor/seeding_ic.py
@@ -482,7 +482,7 @@ def get_from_config(self, sim_id: int, setup) -> np.ndarray:
else:
raise ValueError(
f"Initial Conditions: Could not set compartment {comp_name} (id: {comp_idx}) in subpop {pl} (id: {pl_idx}). The data from the init file is {states_pl}. \n \
Use 'allow_missing_compartments' to default to 0 for compartments without initial conditions"
Use 'allow_missing_compartments' to default to 0 for compartments without initial conditions"
)
if "rest" in str(ic_df_compartment_val).strip().lower():
rests.append([comp_idx, pl_idx])
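
The error above fires when a model compartment has no matching row in the initial-conditions file. A simplified sketch of the default-to-zero behaviour that 'allow_missing_compartments' enables (the column labels and function name are assumptions; this is not gempyor's actual implementation):

import numpy as np
import pandas as pd


def build_y0(compartments, subpops, ic_df: pd.DataFrame, allow_missing_compartments: bool = False) -> np.ndarray:
    """Fill an initial-conditions array, leaving missing compartments at 0 when allowed."""
    y0 = np.zeros((len(compartments), len(subpops)))
    for comp_idx, comp_name in enumerate(compartments):
        for pl_idx, pl in enumerate(subpops):
            rows = ic_df[(ic_df["mc_name"] == comp_name) & (ic_df["subpop"] == pl)]  # column names assumed
            if rows.empty:
                if allow_missing_compartments:
                    continue  # default this compartment to 0
                raise ValueError(f"Could not set compartment {comp_name} in subpop {pl}")
            y0[comp_idx, pl_idx] = rows["amount"].sum()  # column name assumed
    return y0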
(Remaining changed files not shown.)
