Skip to content

Commit

Permalink
Release 2.0.1 (#94)
Browse files Browse the repository at this point in the history
 *  Connection parameter fixed to be in line with newest polars
 *  Fixed a bug where LRFinder used a hardcoded batch size
 *  Seed is now used in LRFinder so it's reproducible
 *  Fixed a bug in NumericalEmbedding
 *  Fixed a bug for Transformer and numerical features
 *  Fixed a bug when resuming from a full TrainingCache (thanks Zoey Jiang and Linying Zhang)
 *  Updated installation documentation after feedback from HADES hackathon
 *  Fixed a bug where order of numeric features wasn't conserved between training and test set
 *  TrainingCache now only saves prediction dataframe for the best performing model

Co-authored-by: Henrik John <[email protected]>
Co-authored-by: Xinzhuo Jiang <[email protected]>
  • Loading branch information
3 people authored Nov 3, 2023
1 parent 2aba758 commit 8942e11
Show file tree
Hide file tree
Showing 16 changed files with 174 additions and 157 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ config.yml
docs
.idea/
renv.lock
extras/
extras/
.Renviron
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: DeepPatientLevelPrediction
Type: Package
Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model
Version: 2.0.0
Version: 2.0.1
Date: 18-04-2023
Authors@R: c(
person("Egill", "Fridgeirsson", email = "[email protected]", role = c("aut", "cre")),
Expand Down
12 changes: 12 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
DeepPatientLevelPrediction 2.0.1
======================
- Connection parameter fixed to be in line with newest polars
- Fixed a bug where LRFinder used a hardcoded batch size
- Seed is now used in LRFinder so it's reproducible
- Fixed a bug in NumericalEmbedding
- Fixed a bug for Transformer and numerical features
- Fixed a bug when resuming from a full TrainingCache (thanks Zoey Jiang and Linying Zhang)
- Updated installation documentation after feedback from HADES hackathon
- Fixed a bug where order of numeric features wasn't conserved between training and test set
- TrainingCache now only saves prediction dataframe for the best performing model

DeepPatientLevelPrediction 2.0.0
======================
- New backend which uses pytorch through reticulate instead of torch in R
Expand Down
40 changes: 27 additions & 13 deletions R/Estimator.R
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,8 @@ gridCvDeep <- function(mappedData,

fitParams <- names(paramSearch[[1]])[grepl("^estimator", names(paramSearch[[1]]))]
findLR <- modelSettings$estimatorSettings$findLR
for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) {
if (!trainCache$isFull()) {
for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) {
ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", gridId))
ParallelLogger::logInfo(paste0("HyperParameters: "))
ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), paramSearch[[gridId]], collapse = " | "))
Expand Down Expand Up @@ -363,25 +364,38 @@ gridCvDeep <- function(mappedData,
)
}
maxIndex <- which.max(unlist(sapply(learnRates, `[`, 2)))
paramSearch[[gridId]]$learnSchedule <- learnRates[[maxIndex]]

gridSearchPredictons[[gridId]] <- list(
prediction = prediction,
param = paramSearch[[gridId]]
param = paramSearch[[gridId]],
gridPerformance = PatientLevelPrediction::computeGridPerformance(prediction, paramSearch[[gridId]])
)
gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- rep(list(unlist(learnRates[[maxIndex]]$LRs)),
nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary))
gridSearchPredictons[[gridId]]$param$learnSchedule <- learnRates[[maxIndex]]


# remove all predictions that are not the max performance
indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance)))
for (i in seq_along(gridSearchPredictons)) {
if (!is.null(gridSearchPredictons[[i]])) {
if (i != indexOfMax) {
gridSearchPredictons[[i]]$prediction <- list(NULL)
}
}
}
ParallelLogger::logInfo(paste0("Caching all grid search results and prediction for best combination ", indexOfMax))
trainCache$saveGridSearchPredictions(gridSearchPredictons)
}
}
paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance)
# get best params
indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance)))
finalParam <- gridSearchPredictons[[indexOfMax]]$param

paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance)

# get best para (this could be modified to enable any metric instead of AUC, just need metric input in function)
paramGridSearch <- lapply(gridSearchPredictons, function(x) {
do.call(PatientLevelPrediction::computeGridPerformance, x)
}) # cvAUCmean, cvAUC, param

optimalParamInd <- which.max(unlist(lapply(paramGridSearch, function(x) x$cvPerformance)))
finalParam <- paramGridSearch[[optimalParamInd]]$param

cvPrediction <- gridSearchPredictons[[optimalParamInd]]$prediction
# get best CV prediction
cvPrediction <- gridSearchPredictons[[indexOfMax]]$prediction
cvPrediction$evaluationType <- "CV"

ParallelLogger::logInfo("Training final model using optimal parameters")
Expand Down
7 changes: 7 additions & 0 deletions R/TrainingCache-class.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,13 @@ TrainingCache <- R6::R6Class(
return(private$.paramPersistence$gridSearchPredictions)
},

#' @description
#' Check whether every cached grid search entry already holds a computed
#' grid performance, i.e. whether the cache is full
#' @returns Boolean, `TRUE` when no cached entry is missing `gridPerformance`
isFull = function() {
  cachedPredictions <- private$.paramPersistence$gridSearchPredictions
  # An entry counts as finished once its `gridPerformance` element is set.
  # Note: `all()` of an empty vector is TRUE, so an empty list is reported as full.
  hasPerformance <- vapply(
    cachedPredictions,
    function(entry) !is.null(entry$gridPerformance),
    logical(1)
  )
  return(all(hasPerformance))
},

#' @description
#' Gets the last index from the cached grid search
#' @returns Last grid search index
Expand Down
73 changes: 0 additions & 73 deletions extras/example.R

This file was deleted.

4 changes: 2 additions & 2 deletions inst/python/Dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def __init__(self,
if pathlib.Path(data).suffix == '.sqlite':
data = urllib.parse.quote(data)
data = pl.read_database("SELECT * from covariates",
connection_uri=f"sqlite://{data}").lazy()
connection=f"sqlite://{data}").lazy()
else:
data = pl.scan_ipc(pathlib.Path(data).joinpath('covariates/*.arrow'))
observations = data.select(pl.col('rowId').max()).collect()[0, 0]
Expand Down Expand Up @@ -67,7 +67,7 @@ def __init__(self,
if pl.count(self.numerical_features) == 0:
self.num = None
else:
numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)). \
numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)).sort(by='columnId'). \
with_row_count('newColumnId').with_columns(pl.col('newColumnId').first().over('columnId').
rank(method="dense") - 1, pl.col('rowId') - 1) \
.select(pl.col('rowId'), pl.col('newColumnId').alias('columnId'), pl.col('covariateValue')).collect()
Expand Down
7 changes: 4 additions & 3 deletions inst/python/LrFinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def __init__(self,
smooth = lr_settings.get("smooth", 0.05)
divergence_threshold = lr_settings.get("divergence_threshold", 4)
torch.manual_seed(seed=estimator_settings["seed"])
self.seed = estimator_settings["seed"]
self.model = model(**model_parameters)
if callable(estimator_settings["device"]):
self.device = estimator_settings["device"]()
Expand All @@ -55,18 +56,18 @@ def __init__(self,
self.scheduler = ExponentialSchedulerPerBatch(self.optimizer, self.max_lr, self.num_lr)

self.criterion = estimator_settings["criterion"]()
self.batch_size = estimator_settings['batch_size']
self.batch_size = int(estimator_settings['batch_size'])
self.losses = None
self.loss_index = None

def get_lr(self, dataset):
batch_index = torch.arange(0, len(dataset), 1).tolist()

random.seed(self.seed)
losses = torch.empty(size=(self.num_lr,), dtype=torch.float)
lrs = torch.empty(size=(self.num_lr,), dtype=torch.float)
for i in tqdm(range(self.num_lr)):
self.optimizer.zero_grad()
random_batch = random.sample(batch_index, 32)
random_batch = random.sample(batch_index, self.batch_size)
batch = dataset[random_batch]
batch = batch_to_device(batch, self.device)

Expand Down
4 changes: 2 additions & 2 deletions inst/python/ResNet.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,9 @@ def __init__(self,
nn.init.kaiming_uniform_(parameter, a=math.sqrt(5))

def forward(self, input):
x = self.weight.unsqueeze(0) * input.unsqueeze(-1)
x = self.weight[None] * input[..., None]
if self.bias is not None:
x = x + self.bias.unsqueeze(-1)
x = x + self.bias[None]
return x


Expand Down
5 changes: 4 additions & 1 deletion inst/python/Transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ def __init__(self,

if num_features != 0 and num_features is not None:
self.numerical_embedding = NumericalEmbedding(num_features, dim_token)
self.use_numerical = True
else:
self.use_numerical = False
self.class_token = ClassToken(dim_token)

self.layers = nn.ModuleList([])
Expand Down Expand Up @@ -78,7 +81,7 @@ def __init__(self,
def forward(self, x):
mask = torch.where(x["cat"] == 0, True, False)
cat = self.categorical_embedding(x["cat"])
if "num" in x.keys() and self.numerical_embedding is not None:
if self.use_numerical:
num = self.numerical_embedding(x["num"])
x = torch.cat([cat, num], dim=1)
mask = torch.cat([mask, torch.zeros([x.shape[0],
Expand Down
13 changes: 13 additions & 0 deletions man/TrainingCache.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions tests/testthat/setup.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,21 @@ dataset <- Dataset$Data(
)
small_dataset <- torch$utils$data$Subset(dataset, (1:round(length(dataset)/3)))

# Shared test fixture: fit a small ResNet once during test setup so that test
# files can reuse the fitted model through `fitEstimatorResults` instead of
# calling fitEstimator() themselves.
# Two candidate residualDropout values together with randomSample = 2;
# presumably so the random search covers more than one hyperparameter
# combination -- TODO confirm against setResNet().
modelSettings <- setResNet(
numLayers = 1, sizeHidden = 16, hiddenFactor = 1,
residualDropout = c(0, 0.2), hiddenDropout = 0,
sizeEmbedding = 16, hyperParamSearch = "random",
randomSample = 2,
# Estimator settings are passed positionally; a single epoch presumably keeps
# the setup fit short.
setEstimator(epochs=1,
learningRate = 3e-4)
)
# Dedicated output directory for this fit, located under `testLoc`; it is only
# created when it does not exist yet.
fitEstimatorPath <- file.path(testLoc, 'fitEstimator')
if (!dir.exists(fitEstimatorPath)) {
dir.create(fitEstimatorPath)
}
# Fit on the training split; the result is kept as a global for the tests.
fitEstimatorResults <- fitEstimator(trainData$Train,
modelSettings = modelSettings,
analysisId = 1,
analysisPath = fitEstimatorPath)


25 changes: 6 additions & 19 deletions tests/testthat/test-Estimator.R
Original file line number Diff line number Diff line change
Expand Up @@ -146,25 +146,12 @@ test_that("early stopping works", {
testthat::expect_true(earlyStop$early_stop)
})

modelSettings <- setResNet(
numLayers = 1, sizeHidden = 16, hiddenFactor = 1,
residualDropout = 0, hiddenDropout = 0,
sizeEmbedding = 16, hyperParamSearch = "random",
randomSample = 1,
setEstimator(epochs=1,
learningRate = 3e-4)
)

sink(nullfile())
results <- fitEstimator(trainData$Train, modelSettings = modelSettings, analysisId = 1, analysisPath = testLoc)
sink()

test_that("Estimator fit function works", {
expect_true(!is.null(results$trainDetails$trainingTime))
expect_true(!is.null(fitEstimatorResults$trainDetails$trainingTime))

expect_equal(class(results), "plpModel")
expect_equal(attr(results, "modelType"), "binary")
expect_equal(attr(results, "saveType"), "file")
expect_equal(class(fitEstimatorResults), "plpModel")
expect_equal(attr(fitEstimatorResults, "modelType"), "binary")
expect_equal(attr(fitEstimatorResults, "saveType"), "file")
fakeTrainData <- trainData
fakeTrainData$train$covariateData <- list(fakeCovData <- c("Fake"))
expect_error(fitEstimator(fakeTrainData$train, modelSettings, analysisId = 1, analysisPath = testLoc))
Expand All @@ -184,7 +171,7 @@ test_that("predictDeepEstimator works", {
# input is a plpModel and data
sink(nullfile())
predictions <- predictDeepEstimator(
plpModel = results, data = trainData$Test,
plpModel = fitEstimatorResults, data = trainData$Test,
trainData$Test$labels
)
sink()
Expand Down Expand Up @@ -369,4 +356,4 @@ test_that("estimatorSettings can be saved and loaded with correct python objects
testthat::expect_false(reticulate::py_is_null_xptr(optimizer))
testthat::expect_false(reticulate::py_is_null_xptr(scheduler$fun))
testthat::expect_false(reticulate::py_is_null_xptr(criterion))
})
})
Loading

0 comments on commit 8942e11

Please sign in to comment.