Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lasso sims and func edits #31

Open
wants to merge 13 commits into
base: lasso
Choose a base branch
from
Next Next commit
update gitignore
clean up and document lasso functions, add new predict and brier functions, add code to run and evaluate simulations

run simulations script
  • Loading branch information
karissawhiting committed Feb 28, 2024
commit 272fb289c5ac5a66362a7416e6dbdbeeead5a15e
6 changes: 5 additions & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,8 @@
^.*[.]Rproj.*$
^\.Rproj\.user$
^.Rhistory$
[~]
[~]
^R/cv.cureitlasso.R
^R/simulasso.R
^R/coxsplit.R
^R/run-simulations.R
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
inst/doc
inst/model-fits
inst/cure-model-fits
inst/cureit-simulation-results/*
inst/archive
docs/
revdep/
Expand Down
158 changes: 158 additions & 0 deletions R/brier2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# we should use this: https://www.tidymodels.org/learn/statistics/survival-metrics/

library(survival)
library(tidyverse)

# Prep Data ---------------------------------------------------------------

# NOTE(review): `dat` (simulated data with x, t, d) and `fit` (output of
# cv.cureitlasso) are assumed to already exist in the environment, e.g.
# created by the simulation scripts - confirm upstream.
xi <- dat$x

# Best model from cross-validation, indexed by the stored `min` indices
final_fit <- fit$fit[[fit$index$`min`[1]]][[fit$index$`min`[2]]]

# Cure-probability prediction from the glmnet logistic fit
fitcure <- final_fit$fitcure
predcure <- predict(fitcure, newx = xi,
                    s = min(fitcure$lambda), type = "response")

# Cox relative risk exp(beta'x) from the glmnet cox fit
# (fixes garbled identifier in original: `fi nal_fit` -> `final_fit`)
fitcox <- final_fit$fitcox
predsurvexp <- predict(fitcox, newx = xi, s = min(fitcox$lambda), type = "response")

# Baseline hazard and cumulative hazard stored on the final fit
haz <- final_fit$haz
cumhaz <- final_fit$cumhaz

# Predicted survival: S(t | x) = exp(-H0(t) * exp(beta'x))
predsurv <- exp(-cumhaz * predsurvexp)

# Create Data frame to pass ----------------------------------------------

# Covariates plus observed times, event indicator, and a Surv() truth column
data <- data.frame(dat$x) %>%
  mutate(times = dat$t,
         event = dat$d,
         "truth_survival" = Surv(dat$t, dat$d))

# Predict Function (glmnet cure) ---------------------------------------------

# Trying to get predictions in similar format as tidymodels yardstick
# Eventually will integrate this with existing predict method function

#' Predict survival probabilities from a fitted lasso cure model
#'
#' @param final_fit A fitted cure model element (one cell of the
#'   cureitlasso fit grid) with components `fitcure` (glmnet logistic fit),
#'   `fitcox` (glmnet cox fit), and `haz`/`cumhaz` (baseline hazard and
#'   cumulative hazard at the sorted unique event times).
#' @param data A data.frame of covariate columns plus `times`, `event`, and
#'   `truth_survival` columns (as built in the prep section of this file).
#' @param eval_timepoints Numeric vector of timepoints at which to evaluate
#'   predictions; defaults to the sorted observed event times.
#'
#' @return A tibble with one row per subject per evaluation timepoint and
#'   columns `eval_timepoint`, `preds`, `predcure`, `predsurv`, `weights`,
#'   `id_num`. (The original returned `all_pred_df`, which was never
#'   assigned; this fixes that and records `eval_timepoint` so downstream
#'   code such as brier_cure() can split by timepoint.)
predict_cure <- function(final_fit,
                         data = NULL,
                         eval_timepoints = NULL) {

  # Observed follow-up times and the sorted observed event times
  survival_times <- data$times
  sorted_survival_times <- sort(survival_times)
  sorted_event_times <- sort(data$times[data$event == 1])

  # Kaplan-Meier estimate of the censoring distribution (for IPCW weights)
  censoring_weights <- survfit(Surv(data$times, 1 - data$event) ~ 1)$surv

  # Covariate matrix for glmnet prediction
  data_x <- data %>%
    select(-c(times, event, truth_survival)) %>%
    as.matrix()

  # Extract cumulative hazard from final fit (for event times)
  haz <- final_fit$haz
  cumhaz <- final_fit$cumhaz

  # Default eval timepoints are observed event times if not supplied by user
  if (is.null(eval_timepoints)) {
    eval_timepoints <- sorted_event_times
  }

  # Get Relative Cure and Cox Risk ------------------------------------------

  # Cure probability risk prediction (independent of timepoint)
  fitcure <- final_fit$fitcure
  predcure <- predict(fitcure, newx = data_x,
                      s = min(fitcure$lambda), type = "response")

  # Cox survival probability risk prediction (independent of timepoint)
  fitcox <- final_fit$fitcox
  predsurvexp <- predict(fitcox,
                         newx = data_x,
                         s = min(fitcox$lambda),
                         type = "response")

  # Get Predicted probabilities at select timepoints -------------------

  # One tibble of per-subject predictions per evaluation timepoint
  all_preds_all_tps <- vector("list", length(eval_timepoints))

  for (l in seq_along(eval_timepoints)) {

    if (eval_timepoints[l] >= min(sorted_event_times)) {

      # Index of the latest event time at or before this eval timepoint
      ids <- which(
        sorted_event_times ==
          max(sorted_event_times[sorted_event_times <= eval_timepoints[l]])
      )

      # Conditional survival probability for each subject at that time
      predsurv <- exp(-cumhaz[ids] * predsurvexp)

      # Inverse probability of censoring weights.
      # NOTE(review): the second term multiplies by the whole
      # `censoring_weights` vector rather than a per-subject value; this
      # reproduces the original code but looks suspect - confirm the
      # intended IPCW formula (G(t) for at-risk subjects, G(T_i-) for
      # subjects with an observed event).
      ipw <- as.numeric(survival_times > eval_timepoints[l]) * censoring_weights[l] +
        as.numeric(survival_times <= eval_timepoints[l]) * censoring_weights

    } else {
      # No events occurred at or before this eval time: everyone "survives"
      predsurv <- rep(1, length(survival_times))
      ipw <- 1
    }

    # Mixture-cure survival: P(cured) + P(uncured) * S(t | uncured)
    # (here `predcure` is the modeled probability of being uncured)
    preds <- 1 - predcure + predcure * predsurv

    all_preds_all_tps[[l]] <- tibble(
      "eval_timepoint" = eval_timepoints[l],
      "preds" = as.numeric(preds),
      "predcure" = as.numeric(predcure),
      "predsurv" = as.numeric(predsurv),
      "weights" = as.numeric(ipw)) %>%
      mutate(id_num = seq_len(nrow(.)))
  }

  # Bind per-timepoint predictions into one long tibble
  all_pred_df <- bind_rows(all_preds_all_tps)
  return(all_pred_df)
}

# Example usage: predictions at the default (event-time) evaluation timepoints
predict_df <- predict_cure(final_fit = final_fit, data = data)


# Brier Calculation -------------------------------------------------------

# NOTE(review): incomplete / work-in-progress as shown here. `brier` is
# indexed before it is initialized, and `di`, `preds`, and `ipw` are not
# defined in this scope (presumably they are meant to come from
# `predict_df`); no value is returned. Needs finishing before use.
brier_cure <- function(predict_df, times, event) {

eval_timepoints = unique(predict_df$eval_timepoint)

for (l in 1:length(eval_timepoints)) {
# IPCW Brier score at timepoint l; +0.001 guards against division by zero
brier[l] <- mean(I(di==0)*(1 - preds)^2/(ipw+0.001) + I(di==1)*(0 - preds)^2/(ipw+0.001))


}


65 changes: 63 additions & 2 deletions R/cureitlasso.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,47 @@

# mu is the penalty for the cure (logistic) model
# lambda is the penalty for the cox model
# tau is the posterior probability of being cured for uncured subjects (maybe don't need) (Entropy)
# NOTE maybe change name of alpha
# calculated linear predictor of that iteration


#' Lasso for Cure Models
#'
#' @param t Survival times
#' @param d Event indicator
#' @param x data n x p matrix or data.frame
#' @param minmu.ratio minimum penalty value in the logistic model
#' @param minlambda.ratio minimum penalty value in the cox model
#' @param adaptive
#' @param length.grid
#' @param mus penalty for cure
#' @param lambdas is penalty for cox
#' @param tol posterior probability of being cured for uncured (maybe don't need) (Entropy)
#' @param maxiter
#' @param progress verbose iteration counter
#'
#' @return
#' - `all_fits`
#' - `t`,
#' - `tj` - unique event times sorted,
#' - `d`- event indicator,
#' - `x`- data n x p matrix or data.frame,
#' - `alpha0` fitcure$a0[length(fitcure$lambda)],
#' - `alpha` =alpha,
#' - `beta` =beta,
#' - `haz`=haz,
#' - `cumhaz`=cumhaz,
#' - `tau`=as.vector(tau),
#' - `predsurv`=as.vector(predsurv_iter),
#' - `predcure`=as.vector(predcure_iter),
#' - `fitcure`=fitcure,
#' - `fitcox` =fitcox
#' - `mus` - mus that were used as penalties for cure models in cross validation
#' - `lambda` - lambdas that were used as penalties in cox model cross validation
#' @export
#'
#' @examples
cureitlasso <- function(t,
d,
x,
Expand All @@ -15,6 +59,7 @@ cureitlasso <- function(t,
require(glmnet)
require(survival)

# initialize initial weights
tau <- matrix(0.5,nrow=length(t),ncol=2) # First column: cured; second column: uncured
tau[d==1,1] = 0
tau[d==1,2] = 1
Expand All @@ -24,6 +69,7 @@ cureitlasso <- function(t,

# CV: Be careful for fold split! Two replicates from the same subject should always be assigned to the same fold

# this is for later integration with tuning
penalty.factor.cure=rep(1,ncol(x))
penalty.factor.cox=rep(1,ncol(x))

Expand Down Expand Up @@ -51,6 +97,8 @@ cureitlasso <- function(t,
#
# }

# So this section is finding the set of optimal hyperparameters to test ----
# tau is uncure probability
fitcure0 <- glmnet(x=do.call("rbind", rep(list(x), 2)),
y=lab_class,
family="binomial",
Expand All @@ -72,8 +120,10 @@ cureitlasso <- function(t,
lambdas <- fitcox0$lambda[idx_lambdas]
}


musmax <- fitcure0$lambda[1]
lambdasmax <- fitcox0$lambda[1]
# -----------------

predcure <- predict(fitcure0,newx=x,s=mus,type="response") #Prob of uncured
predsurvexp <- predict(fitcox0,newx=x,s=lambdas,type="response") # exp(betaX)
Expand Down Expand Up @@ -104,6 +154,7 @@ cureitlasso <- function(t,

fit[[i]] <- list()


for (j in 1:length(lambdas)){

tau <- d + (1-d) * (predcure[,i] * predsurv[,j])/( (1 - predcure[,i]) + predcure[,i] * predsurv[,j] )
Expand Down Expand Up @@ -309,14 +360,24 @@ cureitlasso <- function(t,
# num_alpha[i,j] <- sum(alpha!=0)
# num_beta[i,j] <- sum(beta!=0)

# names(fit[i][j]) <- paste0("mu_fit_", i, "_lambda_fit_", "j")

}


names(fit[[i]]) <- paste0("lambda_fit_", 1:length(lambdas))
}

return(list(fit=fit,
names(fit) <- paste0("mu_fit_", 1:length(mus))

return(list(fit = fit,
# num_alpha=num_alpha,
# num_beta=num_beta,
mus=mus,
lambdas=lambdas))

}
}

# tj - is unique event times sorted
# haz

19 changes: 15 additions & 4 deletions R/cv.cureitlasso.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
# Returns:
# fit -
# arg: length.grid -

cv.cureitlasso <- function(t,
d,
x,
minmu.ratio=0.05, # minimum penalty value in the logistic model
minlambda.ratio=0.05, # minimum penalty value in the cox model
adaptive=FALSE,
length.grid=10,
length.grid=10, # tells how many hyperparameters to test. default is 10 for each
nfolds=5,
tol=1e-2,
maxiter=100,
Expand All @@ -22,13 +26,15 @@ cv.cureitlasso <- function(t,
require(survcomp)
require(pracma)

# order data by times
order_idx <- order(t)
t <- t[order_idx]
d <- d[order_idx]
x <- x[order_idx,]

if (progress) print("Fitting by EM algorithm ...")

# main model with no cross validation
fit <- cureitlasso(t,d,x,
minmu.ratio,
minlambda.ratio,
Expand All @@ -38,14 +44,16 @@ cv.cureitlasso <- function(t,
lambdas=NULL,
tol,
maxiter,
progress)
progress = TRUE)

if (progress) print("Running cross validations ...")

if (is.null(seed)) seed <- as.numeric(Sys.Date())

# Fold split
foldid <- coxsplit(as.matrix(Surv(t,d)), nfolds)

# grid/array of values
cv_brier <- array(NA,dim=c(length.grid,length.grid,nfolds))

# Run CVs
Expand All @@ -57,6 +65,7 @@ cv.cureitlasso <- function(t,

if (progress) print(i)


cv.fit[[i]] <- cureitlasso(t[foldid != i],
d[foldid != i],
x[foldid != i,],
Expand Down Expand Up @@ -114,6 +123,7 @@ cv.cureitlasso <- function(t,

}

# calculate trapezoidal area under the Brier-score curve
cv_brier[j,k,i] <- trapz(tbrier,brier)


Expand All @@ -130,7 +140,7 @@ cv.cureitlasso <- function(t,

cv.fit <- foreach(i = 1:nfolds) %dopar% {

source("~/Projects/Whiting-Qin-cureit/cureit/R/cureitlasso.R")
source(here::here("R/cureitlasso.R"))

cureitlasso(t[foldid != i],
d[foldid != i],
Expand Down Expand Up @@ -206,6 +216,7 @@ cv.cureitlasso <- function(t,

}

# summarize - take average across all folds
cv_brier_mean <- apply(cv_brier,c(1,2),function(x) mean(x))
cv_brier_se <- apply(cv_brier,c(1,2),function(x) sd(x)/sqrt(nfolds))
# pheatmap::pheatmap(cv_brier_mean,cluster_cols = F,cluster_rows = F)
Expand Down Expand Up @@ -235,4 +246,4 @@ cv.cureitlasso <- function(t,
)
)

}
}
Loading