From a9fc587793c7657605ef3c6cd807d7ba46d4ec75 Mon Sep 17 00:00:00 2001 From: saraloo <45245630+saraloo@users.noreply.github.com> Date: Fri, 26 Jul 2024 15:27:08 +1000 Subject: [PATCH 1/5] more fixes for aggregation --- postprocessing/postprocess_snapshot.R | 222 +++++++++++++++++++------- 1 file changed, 168 insertions(+), 54 deletions(-) diff --git a/postprocessing/postprocess_snapshot.R b/postprocessing/postprocess_snapshot.R index 93bdba5d1..a505b79b6 100644 --- a/postprocessing/postprocess_snapshot.R +++ b/postprocessing/postprocess_snapshot.R @@ -175,20 +175,64 @@ if("hosp" %in% model_outputs){ statistics <- purrr::flatten(config$inference$statistics[i]) cols_sim <- c("date", statistics$sim_var, "subpop","slot") cols_data <- c("date", "subpop", statistics$data_var) + + # aggregate based on what is in the config + df_sim <- lapply(subpop_names, function(y) { + lapply(unique(hosp_outputs_global$slot), function(x) + purrr::flatten_df(inference::getStats( + hosp_outputs_global_tmp %>% .[subpop == y & slot == x] , + "date", + "sim_var", + stat_list = config$inference$statistics[i], + start_date = config$start_date_groundtruth, + end_date = config$end_date_groundtruth + )) %>% dplyr::mutate(subpop = y, slot = x)) %>% dplyr::bind_rows() + }) %>% dplyr::bind_rows() + + df_data <- lapply(subpop_names, function(x) { + purrr::flatten_df( + inference::getStats( + gt_data %>% .[subpop == x,..cols_data], + "date", + "data_var", + stat_list = config$inference$statistics[i], + start_date = config$start_date_groundtruth, + end_date = config$end_date_groundtruth + )) %>% dplyr::mutate(subpop = x) %>% + mutate(data_var = as.numeric(data_var)) }) %>% dplyr::bind_rows() + + ## summarize slots - print(outputs_global$hosp %>% - .[, ..cols_sim] %>% + # print(outputs_global$hosp %>% + # .[, ..cols_sim] %>% + # .[, date := lubridate::as_date(date)] %>% + # .[, as.list(quantile(get(statistics$sim_var), c(.05, .25, .5, .75, .95), na.rm = TRUE, names = FALSE)), by = c("date", "subpop")] %>% + # ggplot() + + # geom_ribbon(aes(x = date, ymin = V1, ymax = V5), alpha = 0.1) + + # geom_ribbon(aes(x = date, ymin = V2, ymax = V4), alpha = 0.1) + + # geom_line(aes(x = date, y = V3)) + + # geom_point(data = gt_data %>% + # .[, ..cols_data], + # aes(lubridate::as_date(date), get(statistics$data_var)), color = 'firebrick', alpha = 0.1) + + # facet_wrap(~subpop, scales = 'free', ncol = gg_cols) + + # labs(x = 'date', y = fit_stats[i], title = statistics$sim_var) + + # theme_classic() + # ) + print( + df_sim %>% + setDT() %>% .[, date := lubridate::as_date(date)] %>% - .[, as.list(quantile(get(statistics$sim_var), c(.05, .25, .5, .75, .95), na.rm = TRUE, names = FALSE)), by = c("date", "subpop")] %>% + .[, as.list(quantile(sim_var, c(.05, .25, .5, .75, .95), na.rm = TRUE, names = FALSE)), by = c("date", "subpop")] %>% + setnames(., paste0("V", 1:5), paste0("q", c(.05,.25,.5,.75,.95))) %>% ggplot() + - geom_ribbon(aes(x = date, ymin = V1, ymax = V5), alpha = 0.1) + - geom_ribbon(aes(x = date, ymin = V2, ymax = V4), alpha = 0.1) + - geom_line(aes(x = date, y = V3)) + - geom_point(data = gt_data %>% - .[, ..cols_data], - aes(lubridate::as_date(date), get(statistics$data_var)), color = 'firebrick', alpha = 0.1) + - facet_wrap(~subpop, scales = 'free', ncol = gg_cols) + - labs(x = 'date', y = fit_stats[i], title = statistics$sim_var) + + geom_ribbon(aes(x = date, ymin = q0.05, ymax = q0.95), alpha = 0.1) + + geom_ribbon(aes(x = date, ymin = q0.25, ymax = q0.75), alpha = 0.1) + + geom_line(aes(x = date, y = q0.5)) + + # if inference, plot gt along side + geom_point(data = df_data, + aes(lubridate::as_date(date), data_var), color = 'firebrick', alpha = 0.2, size=1) + + facet_wrap(~subpop, scales = 'free') + + labs(x = 'date', y = fit_stats[i]) + theme_classic() ) @@ -206,23 +250,44 @@ if("hosp" %in% model_outputs){ # ) ## plot cumulatives - print(outputs_global$hosp %>% - .[, ..cols_sim] %>% - .[, date := lubridate::as_date(date)] %>% - .[, csum := cumsum(get(statistics$sim_var)), by = .(subpop, slot)] %>% - .[, as.list(quantile(csum, c(.05, .25, .5, .75, .95), na.rm = TRUE, names = FALSE)), by = c("date", "subpop")] %>% - ggplot() + - geom_ribbon(aes(x = date, ymin = V1, ymax = V5), alpha = 0.1) + - geom_ribbon(aes(x = date, ymin = V2, ymax = V4), alpha = 0.1) + - geom_line(aes(x = date, y = V3)) + - geom_point(data = gt_data %>% - .[, ..cols_data] %>% - .[, csum := cumsum(replace_na(get(statistics$data_var), 0)) , by = .(subpop)] - , - aes(lubridate::as_date(date), csum), color = 'firebrick', alpha = 0.1) + - facet_wrap(~subpop, scales = 'free', ncol = gg_cols) + - labs(x = 'date', y = fit_stats[i], title = paste0("cumulative ", statistics$sim_var)) + - theme_classic() + print( + df_sim %>% + setDT() %>% + .[, date := lubridate::as_date(date)] %>% + .[, .(date, subpop, sim_var, slot)] %>% + data.table::melt(., id.vars = c("date", "slot", "subpop")) %>% + # dplyr::arrange(subpop, slot, date) %>% + .[, csum := cumsum(value), by = .(slot, subpop, variable)] %>% + .[, as.list(quantile(csum, c(.05, .25, .5, .75, .95), na.rm = TRUE, names = FALSE)), by = c("date", config$subpop_setup$subpop)] %>% + setnames(., paste0("V", 1:5), paste0("q", c(.05,.25,.5,.75,.95))) %>% + ggplot() + + geom_ribbon(aes(x = date, ymin = q0.05, ymax = q0.95), alpha = 0.1) + + geom_ribbon(aes(x = date, ymin = q0.25, ymax = q0.75), alpha = 0.1) + + geom_line(aes(x = date, y = q0.5)) + + geom_point(data = df_data %>% setDT() %>% + .[, csum := cumsum(data_var) , by = .(subpop)], + aes(lubridate::as_date(date), csum), color = 'firebrick', alpha = 0.2, size=1) + + facet_wrap(~subpop, scales = 'free') + + # facet_wrap(~get(subpop), scales = 'free') + + labs(x = 'date', y = paste0("cumulative ", fit_stats[i])) + + theme_classic() + # outputs_global$hosp %>% + # .[, ..cols_sim] %>% + # .[, date := lubridate::as_date(date)] %>% + # .[, csum := cumsum(get(statistics$sim_var)), by = .(subpop, slot)] %>% + # .[, as.list(quantile(csum, c(.05, .25, .5, .75, .95), na.rm = TRUE, names = FALSE)), by = c("date", "subpop")] %>% + # ggplot() + + # geom_ribbon(aes(x = date, ymin = V1, ymax = V5), alpha = 0.1) + + # geom_ribbon(aes(x = date, ymin = V2, ymax = V4), alpha = 0.1) + + # geom_line(aes(x = date, y = V3)) + + # geom_point(data = gt_data %>% + # .[, ..cols_data] %>% + # .[, csum := cumsum(replace_na(get(statistics$data_var), 0)) , by = .(subpop)] + # , + # aes(lubridate::as_date(date), csum), color = 'firebrick', alpha = 0.1) + + # facet_wrap(~subpop, scales = 'free', ncol = gg_cols) + + # labs(x = 'date', y = fit_stats[i], title = paste0("cumulative ", statistics$sim_var)) + + # theme_classic() ) } @@ -231,6 +296,18 @@ if("hosp" %in% model_outputs){ ## hosp by highest and lowest llik + if("llik" %in% model_outputs){ + llik_rank <- copy(outputs_global$llik) %>% + .[, .SD[order(ll)], subpop] + high_low_llik <- rbindlist(list(data.table(llik_rank, key = "subpop") %>% + .[, head(.SD,5), by = subpop] %>% + .[, llik_bin := "top"], + data.table(llik_rank, key = "subpop") %>% + .[, tail(.SD,5), by = subpop]%>% + .[, llik_bin := "bottom"]) + ) + } + fname <- paste0("pplot/hosp_by_llik_mod_outputs_", opt$run_id,".pdf") # pdf_dims <- data.frame(width = gg_cols*2, length = num_nodes/gg_cols * 2) # pdf(fname, width = pdf_dims$width, height = pdf_dims$length) @@ -240,38 +317,75 @@ if("hosp" %in% model_outputs){ statistics <- purrr::flatten(config$inference$statistics[i]) cols_sim <- c("date", statistics$sim_var, "subpop","slot") cols_data <- c("date", "subpop", statistics$data_var) + hosp_outputs_global_tmp <- hosp_outputs_global[,..cols_sim] + if("llik" %in% model_outputs){ - llik_rank <- copy(outputs_global$llik) %>% - .[, .SD[order(ll)], subpop] - high_low_llik <- rbindlist(list(data.table(llik_rank, key = "subpop") %>% - .[, head(.SD,5), by = subpop] %>% - .[, llik_bin := "top"], - data.table(llik_rank, key = "subpop") %>% - .[, tail(.SD,5), by = subpop]%>% - .[, llik_bin := "bottom"]) - ) + # high_low_hosp_llik <- copy(outputs_global$hosp) %>% + # .[high_low_llik, on = c("slot", "subpop"), allow.cartesian = TRUE] - high_low_hosp_llik <- copy(outputs_global$hosp) %>% - .[high_low_llik, on = c("slot", "subpop"), allow.cartesian = TRUE] + # aggregate simulation output and data by time based on what is in the config + df_sim <- lapply(subpop_names, function(y) { + lapply(unique(hosp_outputs_global$slot), function(x) + purrr::flatten_df(inference::getStats( + hosp_outputs_global_tmp %>% .[subpop == y & slot == x] , + "date", + "sim_var", + stat_list = config$inference$statistics[i], + start_date = config$start_date_groundtruth, + end_date = config$end_date_groundtruth + )) %>% dplyr::mutate(subpop = y, slot = x)) %>% dplyr::bind_rows() + }) %>% dplyr::bind_rows() %>% setDT() + + df_data <- lapply(subpop_names, function(x) { + purrr::flatten_df( + inference::getStats( + gt_data %>% .[subpop == x,..cols_data], + "date", + "data_var", + stat_list = config$inference$statistics[i], + start_date = config$start_date_groundtruth, + end_date = config$end_date_groundtruth + )) %>% dplyr::mutate(subpop = x) %>% + dplyr::mutate(data_var = as.numeric(data_var)) %>% + dplyr::mutate(date = lubridate::as_date(date)) }) %>% + dplyr::bind_rows() %>% setDT() + + # add likelihood ranking to simulation output + high_low_hosp_llik <- df_sim %>% + .[high_low_llik, on = c("slot", "subpop"), allow.cartesian=TRUE] %>% # right join by "on" variables + .[subpop != "Total"] hosp_llik_plots <- lapply(unique(high_low_hosp_llik %>% .[, subpop]), function(e){ high_low_hosp_llik %>% - .[, date := lubridate::as_date(date)] %>% .[subpop == e] %>% - ggplot() + - geom_line(aes(lubridate::as_date(date), get(statistics$data_var), - group = slot, color = ll))+#, linetype = llik_bin)) + - # scale_linetype_manual(values = c(1, 2), name = "likelihood\nbin") + + .[, date := lubridate::as_date(date)] %>% + ggplot() + + geom_line(aes(x = date, y = sim_var, group = slot, color = ll)) + + scale_linetype_manual(values = c(1, 2), name = "likelihood\nbin") + scale_color_viridis_c(option = "D", name = "log\nlikelihood") + - geom_point(data = gt_data %>% - .[, ..cols_data] %>% - .[subpop == e] , - aes(lubridate::as_date(date), get(statistics$data_var)), color = 'firebrick', alpha = 0.1) + - facet_wrap(~subpop, scales = 'free', ncol = gg_cols) + - labs(x = 'date', y = fit_stats[i]) + #, title = paste0("top 5, bottom 5 lliks, ", statistics$sim_var)) + - theme_classic() + - guides(linetype = 'none') + geom_point(data = df_data, + aes(lubridate::as_date(date), data_var), color = 'firebrick', alpha = 0.2, size=1) + + facet_wrap(~subpop, scales = 'free') + + labs(x = 'date', y = fit_stats[i]) + + theme_classic() + + theme(legend.key.size = unit(0.2, "cm")) + # high_low_hosp_llik %>% + # .[, date := lubridate::as_date(date)] %>% + # .[subpop == e] %>% + # ggplot() + + # geom_line(aes(lubridate::as_date(date), get(statistics$data_var), + # group = slot, color = ll))+#, linetype = llik_bin)) + + # # scale_linetype_manual(values = c(1, 2), name = "likelihood\nbin") + + # scale_color_viridis_c(option = "D", name = "log\nlikelihood") + + # geom_point(data = gt_data %>% + # .[, ..cols_data] %>% + # .[subpop == e] , + # aes(lubridate::as_date(date), get(statistics$data_var)), color = 'firebrick', alpha = 0.1) + + # facet_wrap(~subpop, scales = 'free', ncol = gg_cols) + + # labs(x = 'date', y = fit_stats[i]) + #, title = paste0("top 5, bottom 5 lliks, ", statistics$sim_var)) + + # theme_classic() + + # guides(linetype = 'none') } ) @@ -290,8 +404,8 @@ if("hnpi" %in% model_outputs){ gg_cols <- 4 num_nodes <- length(unique(outputs_global$hnpi %>% .[,subpop])) - pdf_dims <- data.frame(width = gg_cols*3, length = num_nodes/gg_cols * 2) - #pdf_dims <- data.frame(width = 20, length = 5) + # pdf_dims <- data.frame(width = gg_cols*3, length = num_nodes/gg_cols * 2) + pdf_dims <- data.frame(width = 20, length = 10) fname <- paste0("pplot/hnpi_mod_outputs_", opt$run_id,".pdf") pdf(fname, width = pdf_dims$width, height = pdf_dims$length) From 4b21739f3895f120d07a056079f036ccd30f9f37 Mon Sep 17 00:00:00 2001 From: saraloo <45245630+saraloo@users.noreply.github.com> Date: Fri, 26 Jul 2024 15:48:55 +1000 Subject: [PATCH 2/5] fix --- postprocessing/postprocess_snapshot.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/postprocessing/postprocess_snapshot.R b/postprocessing/postprocess_snapshot.R index a505b79b6..7ccac71d3 100644 --- a/postprocessing/postprocess_snapshot.R +++ b/postprocessing/postprocess_snapshot.R @@ -170,15 +170,17 @@ if("hosp" %in% model_outputs){ # pdf(fname, width = 20, height = 18) # pdf(fname) fit_stats <- names(config$inference$statistics) + subpop_names <- unique(outputs_global$hosp %>% .[,subpop]) for(i in 1:length(fit_stats)){ statistics <- purrr::flatten(config$inference$statistics[i]) cols_sim <- c("date", statistics$sim_var, "subpop","slot") cols_data <- c("date", "subpop", statistics$data_var) + hosp_outputs_global_tmp <- copy(outputs_global$hosp)[,..cols_sim] # aggregate based on what is in the config df_sim <- lapply(subpop_names, function(y) { - lapply(unique(hosp_outputs_global$slot), function(x) + lapply(unique(outputs_global$hosp$slot), function(x) purrr::flatten_df(inference::getStats( hosp_outputs_global_tmp %>% .[subpop == y & slot == x] , "date", @@ -317,7 +319,6 @@ if("hosp" %in% model_outputs){ statistics <- purrr::flatten(config$inference$statistics[i]) cols_sim <- c("date", statistics$sim_var, "subpop","slot") cols_data <- c("date", "subpop", statistics$data_var) - hosp_outputs_global_tmp <- hosp_outputs_global[,..cols_sim] if("llik" %in% model_outputs){ # high_low_hosp_llik <- copy(outputs_global$hosp) %>% @@ -325,7 +326,7 @@ if("hosp" %in% model_outputs){ # aggregate simulation output and data by time based on what is in the config df_sim <- lapply(subpop_names, function(y) { - lapply(unique(hosp_outputs_global$slot), function(x) + lapply(unique(outputs_global$hosp$slot), function(x) purrr::flatten_df(inference::getStats( hosp_outputs_global_tmp %>% .[subpop == y & slot == x] , "date", From 93fa3eb645a4d63354808e39f3fff306769a9303 Mon Sep 17 00:00:00 2001 From: saraloo <45245630+saraloo@users.noreply.github.com> Date: Fri, 26 Jul 2024 16:00:49 +1000 Subject: [PATCH 3/5] more fixes --- postprocessing/postprocess_snapshot.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/postprocessing/postprocess_snapshot.R b/postprocessing/postprocess_snapshot.R index 7ccac71d3..7d353ba6b 100644 --- a/postprocessing/postprocess_snapshot.R +++ b/postprocessing/postprocess_snapshot.R @@ -313,12 +313,13 @@ if("hosp" %in% model_outputs){ fname <- paste0("pplot/hosp_by_llik_mod_outputs_", opt$run_id,".pdf") # pdf_dims <- data.frame(width = gg_cols*2, length = num_nodes/gg_cols * 2) # pdf(fname, width = pdf_dims$width, height = pdf_dims$length) - pdf(fname, width = 20, height = 20) + pdf(fname, width = 20, height = 10) for(i in 1:length(fit_stats)){ statistics <- purrr::flatten(config$inference$statistics[i]) cols_sim <- c("date", statistics$sim_var, "subpop","slot") cols_data <- c("date", "subpop", statistics$data_var) + hosp_outputs_global_tmp <- copy(outputs_global$hosp)[,..cols_sim] if("llik" %in% model_outputs){ # high_low_hosp_llik <- copy(outputs_global$hosp) %>% @@ -365,7 +366,7 @@ if("hosp" %in% model_outputs){ geom_line(aes(x = date, y = sim_var, group = slot, color = ll)) + scale_linetype_manual(values = c(1, 2), name = "likelihood\nbin") + scale_color_viridis_c(option = "D", name = "log\nlikelihood") + - geom_point(data = df_data, + geom_point(data = df_data %>% .[subpop == e], aes(lubridate::as_date(date), data_var), color = 'firebrick', alpha = 0.2, size=1) + facet_wrap(~subpop, scales = 'free') + labs(x = 'date', y = fit_stats[i]) + From 1a97b18bc083787e8505ce720dffb9b28da99e7c Mon Sep 17 00:00:00 2001 From: saraloo <45245630+saraloo@users.noreply.github.com> Date: Tue, 30 Jul 2024 15:00:08 +1000 Subject: [PATCH 4/5] fix aggregation filterig --- flepimop/R_packages/inference/R/inference_slot_runner_funcs.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flepimop/R_packages/inference/R/inference_slot_runner_funcs.R b/flepimop/R_packages/inference/R/inference_slot_runner_funcs.R index 9b0afb068..117eb15c5 100644 --- a/flepimop/R_packages/inference/R/inference_slot_runner_funcs.R +++ b/flepimop/R_packages/inference/R/inference_slot_runner_funcs.R @@ -51,8 +51,8 @@ aggregate_and_calc_loc_likelihoods <- function( ## Filter to this location dplyr::filter( modeled_outcome, - !!rlang::sym(obs_subpop) == location, - date %in% unique(obs$date[obs$subpop == location]) + !!rlang::sym(obs_subpop) == location #, + #date %in% unique(obs$date[obs$subpop == location]) ) %>% ## Reformat into form the algorithm is looking for inference::getStats( From 0dda4a38665ae992b4f3bf42f2657638450a5d89 Mon Sep 17 00:00:00 2001 From: saraloo <45245630+saraloo@users.noreply.github.com> Date: Wed, 31 Jul 2024 08:56:26 +1000 Subject: [PATCH 5/5] remove date filtering --- flepimop/R_packages/inference/R/inference_slot_runner_funcs.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/flepimop/R_packages/inference/R/inference_slot_runner_funcs.R b/flepimop/R_packages/inference/R/inference_slot_runner_funcs.R index 117eb15c5..a708b2460 100644 --- a/flepimop/R_packages/inference/R/inference_slot_runner_funcs.R +++ b/flepimop/R_packages/inference/R/inference_slot_runner_funcs.R @@ -51,8 +51,7 @@ aggregate_and_calc_loc_likelihoods <- function( ## Filter to this location dplyr::filter( modeled_outcome, - !!rlang::sym(obs_subpop) == location #, - #date %in% unique(obs$date[obs$subpop == location]) + !!rlang::sym(obs_subpop) == location ) %>% ## Reformat into form the algorithm is looking for inference::getStats(