From ec4c15663b7ffbee6b48a53e4236e49909db8fea Mon Sep 17 00:00:00 2001
From: Bagrat Ter-Akopyan
Date: Tue, 3 Sep 2019 14:56:27 +0200
Subject: [PATCH 1/5] add convert oml task to mlr3

---
 R/convertOMLDataSetToMlr3.R                   | 123 ++++++++++++++++++
 inst/examples/convertOMLDataSetToMlr3.R       |   5 +
 .../test_local_convertOMLDataSetToMlr3.R      |  45 +++++++
 3 files changed, 173 insertions(+)
 create mode 100644 R/convertOMLDataSetToMlr3.R
 create mode 100644 inst/examples/convertOMLDataSetToMlr3.R
 create mode 100644 tests/testthat/test_local_convertOMLDataSetToMlr3.R

diff --git a/R/convertOMLDataSetToMlr3.R b/R/convertOMLDataSetToMlr3.R
new file mode 100644
index 0000000..6e2d154
--- /dev/null
+++ b/R/convertOMLDataSetToMlr3.R
@@ -0,0 +1,123 @@
+#' @title Convert an OpenML data set to mlr3 task.
+#'
+#' @description
+#' Converts an \code{\link{OMLDataSet}} to a \code{\link[mlr3]{Task}}.
+#'
+#' @param obj [\code{\link{OMLDataSet}}]\cr
+#'   The object that should be converted.
+#' @param mlr.task.id [\code{character(1)}]\cr
+#'   Id string for \code{\link[mlr3]{Task}} object.
+#'   The strings \code{<oml.data.name>}, \code{<oml.data.id>} and \code{<oml.data.version>}
+#'   will be replaced by their respective values contained in the \code{\link{OMLDataSet}} object.
+#'   Default is \code{<oml.data.name>}.
+#' @param task.type [\code{character(1)}]\cr
+#'   As we only pass the data set, we need to define the task type manually.
+#'   Possible are: \dQuote{Supervised Classification}, \dQuote{Supervised Regression},
+#'   \dQuote{Survival Analysis}.
+#'   Default is \code{NULL} which means to guess it from the target column in the
+#'   data set. If that is a factor or a logical, we choose classification.
+#'   If it is numeric we choose regression. In all other cases an error is thrown.
+#' @param target [\code{character}]\cr
+#'   The target for the classification/regression task.
+#'   Default is the \code{default.target.attribute} of the \code{\link{OMLDataSetDescription}}.
+#' @param ignore.flagged.attributes [\code{logical(1)}]\cr
+#'   Should those features that are listed in the data set description slot \dQuote{ignore.attribute}
+#'   be removed?
+#'   Default is \code{TRUE}.
+#' @param drop.levels [\code{logical(1)}]\cr
+#'   Should empty factor levels be dropped in the data?
+#'   Default is \code{TRUE}.
+#' @param fix.colnames [\code{logical(1)}]\cr
+#'   Should colnames of the data be fixed using \code{\link[base]{make.names}}?
+#'   Default is \code{TRUE}.
+#' @template arg_verbosity
+#' @return [\code{\link[mlr3]{Task}}].
+#' @family data set-related functions
+#' @example /inst/examples/convertOMLDataSetToMlr3.R
+#' @export
+convertOMLDataSetToMlr3 = function(
+  obj,
+  mlr.task.id = "<oml.data.name>",
+  task.type = NULL,
+  target = obj$desc$default.target.attribute,
+  ignore.flagged.attributes = TRUE,
+  drop.levels = TRUE,
+  fix.colnames = TRUE,
+  verbosity = NULL) {
+
+  assertClass(obj, "OMLDataSet")
+  assertSubset(target, obj$colnames.new)
+  assertFlag(ignore.flagged.attributes)
+  assertFlag(drop.levels)
+  assertFlag(fix.colnames)
+
+  data = obj$data
+  desc = obj$desc
+
+  # no task type? guess it by looking at target
+  if (is.null(task.type))
+    task.type = guessTaskType(data[, target])
+  assertChoice(task.type, getValidTaskTypes())
+
+  # remove ignored attributes from data (scalar condition, hence && and not &)
+  if (ignore.flagged.attributes && any(!is.na(desc$ignore.attribute))) {
+    keep.cols = obj$colnames.old %nin% desc$ignore.attribute
+    data = data[, keep.cols, drop = FALSE]
+  }
+
+  # drop levels
+  if (drop.levels)
+    data = droplevels(data)
+
+  # fix colnames using make.names; the target name must be fixed the same way
+  if (fix.colnames) {
+    colnames(data) = make.names(colnames(data), unique = TRUE)
+    target = make.names(target, unique = TRUE)
+  }
+
+  # resolve verbosity from the OML config (not passed on to the mlr3 constructors)
+  if (is.null(verbosity))
+    verbosity = getOMLConfig()$verbosity
+
+  mlr.task = switch(task.type,
+    "Supervised Classification" = mlr3::TaskClassif$new(id = desc$name, backend = data, target = target),
+    "Supervised Regression" = mlr3::TaskRegr$new(id = desc$name, backend = data, target = target),
+    "Survival Analysis" = mlr3survival::TaskSurv$new(id = desc$name, backend = data, target = target),
+    stopf("Encountered currently unsupported task type: %s", task.type)
+  )
+
+  if (!is.null(mlr.task.id))
+    mlr.task$id = replaceOMLDataSetString(mlr.task.id, obj)
+
+  return(mlr.task)
+}
+
+replaceOMLDataSetString = function(string, data.set) {
+  string = stri_replace_all_fixed(string, "<oml.data.id>", data.set$desc$id)
+  string = stri_replace_all_fixed(string, "<oml.data.name>", data.set$desc$name)
+  stri_replace_all_fixed(string, "<oml.data.version>", data.set$desc$version)
+}
+
+# @title Helper to guess task type from target column format.
+#
+# @param target [vector]
+#   Vector of target values.
+# @return [character(1)]
+guessTaskType = function(target) {
+  if (inherits(target, "data.frame")) {
+    assertDataFrame(target, types = "logical")
+    return("Multilabel")
+  } else {
+    if (is.factor(target) || is.logical(target))
+      return("Supervised Classification")
+    if (is.numeric(target))
+      return("Supervised Regression")
+  }
+
+  stopf("Cannot guess task.type from data!")
+}
+
+getValidTaskTypes = function() {
+  c("Supervised Classification", "Supervised Regression", "Survival Analysis", "Multilabel")
+}
+
diff --git a/inst/examples/convertOMLDataSetToMlr3.R b/inst/examples/convertOMLDataSetToMlr3.R
new file mode 100644
index 0000000..b81efe1
--- /dev/null
+++ b/inst/examples/convertOMLDataSetToMlr3.R
@@ -0,0 +1,5 @@
+# \dontrun{
+# library("mlr3")
+# autosOML = getOMLDataSet(data.id = 9)
+# autosMlr3 = convertOMLDataSetToMlr3(autosOML)
+# }
diff --git a/tests/testthat/test_local_convertOMLDataSetToMlr3.R b/tests/testthat/test_local_convertOMLDataSetToMlr3.R
new file mode 100644
index 0000000..c591efa
--- /dev/null
+++ b/tests/testthat/test_local_convertOMLDataSetToMlr3.R
@@ -0,0 +1,45 @@
+context("convertOMLDataSetToMlr3")
+
+test_that("convertOMLDataSetToMlr3", {
+  with_test_cache({
+    ds = getOMLDataSet(10)
+
+    expect_is_mlr_task = function(mlr.task, ds) {
+      expect_equal(mlr.task$task_type, "classif")
+      expect_equal(mlr.task$nrow, nrow(ds$data))
+      expect_equal(ds$desc$default.target.attribute, mlr.task$target_names)
+    }
+
+    # now create the task
+    mlr.task = convertOMLDataSetToMlr3(ds)
+    expect_equal(mlr.task$task_type, "classif")
+
+    # now modify dataset by hand (no more server calls) to check
+    # ignore attributes stuff:
+    # Define the first two attributes as ignored attributes
+    ds$desc$ignore.attribute = colnames(ds$data[, 1:2])
+
+    mlr.task = convertOMLDataSetToMlr3(ds, ignore.flagged.attributes = TRUE)
+    expect_is_mlr_task(mlr.task, ds)
+    # we removed two attributes (and the target column is not considered here)
+    #expect_equal(sum(mlr.task$task.desc$n.feat), ncol(ds$data) - 3L)
+    expect_equal(mlr.task$ncol, ncol(ds$data) - 2L)
+
+    # pass faulty parameters
+    expect_error(convertOMLDataSetToMlr3(ds, task.type = "Nonexistent task type"), "element of")
+
+    # check setting mlr task id
+    expect_equal(convertOMLDataSetToMlr3(ds)$id, ds$desc$name)
+    expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "<oml.data.name>.<oml.data.id>")$id,
+      sprintf("%s.%s", ds$desc$name, ds$desc$id))
+    expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "test")$id, "test")
+    expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "<oml.data.id>")$id, as.character(ds$desc$id))
+    expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "<oml.data.name>")$id, as.character(ds$desc$name))
+    expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "<oml.data.version>")$id, as.character(ds$desc$version))
+    expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "")$id, "")
+
+    # check if conversion to regression task works
+    ds$desc$target.features = ds$desc$default.target.attribute = "no_of_nodes_in"
+    expect_equal(convertOMLDataSetToMlr3(ds)$task_type, "regr")
+  })
+})

From a63f22d54b5081ca1c9cf29493fed1e4e306ae50 Mon Sep 17 00:00:00 2001
From: Bagrat Ter-Akopyan
Date: Mon, 9 Sep 2019 17:03:52 +0200
Subject: [PATCH 2/5] add convert oml splits to mlr3

---
 R/convertOMLSplitsToMlr3.R                    | 24 ++++++++++++++++++
 .../test_local_convertOMLSplitsToMlr3.R       | 25 +++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 R/convertOMLSplitsToMlr3.R
 create mode 100644 tests/testthat/test_local_convertOMLSplitsToMlr3.R

diff --git a/R/convertOMLSplitsToMlr3.R b/R/convertOMLSplitsToMlr3.R
new file mode 100644
index 0000000..d64823d
--- /dev/null
+++ b/R/convertOMLSplitsToMlr3.R
@@ -0,0 +1,24 @@
+# Builds an instantiated mlr3 Resampling from an OpenML estimation procedure.
+# NOTE: 'predict' and estim.proc$data.splits are not used; splits are re-drawn.
+convertOMLSplitsToMlr3 = function(estim.proc, mlr.task, predict = "both") {
+  type = estim.proc$type
+  n.repeats = estim.proc$parameters[["number_repeats"]]
+  n.folds = estim.proc$parameters[["number_folds"]]
+  percentage = as.numeric(estim.proc$parameters[["percentage"]])
+  stratified = isTRUE(estim.proc$parameters[["stratified_sampling"]] == "true")
+  if (type == "crossvalidation") {
+    if (n.repeats == 1L)
+      mlr.rdesc = mlr3::rsmp("cv", folds = n.folds, stratify = stratified)
+    else
+      mlr.rdesc = mlr3::rsmp("repeated_cv", reps = n.repeats, folds = n.folds, stratify = stratified)
+    mlr.rin = mlr.rdesc$instantiate(mlr.task)
+  } else if (type == "holdout") {
+    # NOTE(review): assumes 'percentage' is the held-out (test) share - confirm
+    ratio = if (length(percentage) == 1L && !is.na(percentage)) 1 - percentage / 100 else 2 / 3
+    mlr.rdesc = mlr3::rsmp("holdout", ratio = ratio)
+    mlr.rin = mlr.rdesc$instantiate(task = mlr.task)
+  } else {
+    stopf("Unsupported estimation procedure type: %s", type)
+  }
+  return(mlr.rin)
+}
diff --git a/tests/testthat/test_local_convertOMLSplitsToMlr3.R b/tests/testthat/test_local_convertOMLSplitsToMlr3.R
new file mode 100644
index 0000000..09b2462
--- /dev/null
+++ b/tests/testthat/test_local_convertOMLSplitsToMlr3.R
@@ -0,0 +1,25 @@
+context("convertOMLSplitsToMlr3")
+
+test_that("convertOMLSplitsToMlr3", {
+  with_test_cache({
+    task = getOMLTask(59)
+    mlr.task = convertOMLTaskToMlr3(task)$mlr.task
+
+    oml.types = c("crossvalidation", "holdout")
+    mlr.types = c("cv", "holdout")
+
+    for (i in seq_along(oml.types)) {
+      task$input$estimation.procedure[['type']]= oml.types[i]
+      if (oml.types[i] == "holdout") {
+        task$input$estimation.procedure$parameters$percentage = "50"
+      }
+      splits = convertOMLSplitsToMlr3(task$input$estimation.procedure, mlr.task)
+      expect_is(splits, "Resampling")
+      expect_equal(splits$id, mlr.types[i])
+    }
+
+    # pass invalid estim.proc
+    task$input$estimation.procedure$type = "blabla"
+    expect_error(convertOMLSplitsToMlr3(task$input$estimation.procedure, mlr.task), "Unsupported estimation procedure type: blabla")
+  })
+})

From a239b3accdaace8f45fc77dcd1cf68b27f66ed3c Mon Sep 17 00:00:00 2001
From: Bagrat Ter-Akopyan
Date: Mon, 9 Sep 2019 17:53:23 +0200
Subject: [PATCH 3/5] fixed lintr

---
 tests/testthat/test_local_convertOMLSplitsToMlr3.R | 2 +-
 1 file changed, 1 insertion(+),
1 deletion(-) diff --git a/tests/testthat/test_local_convertOMLSplitsToMlr3.R b/tests/testthat/test_local_convertOMLSplitsToMlr3.R index 09b2462..98e713a 100644 --- a/tests/testthat/test_local_convertOMLSplitsToMlr3.R +++ b/tests/testthat/test_local_convertOMLSplitsToMlr3.R @@ -9,7 +9,7 @@ test_that("convertOMLSplitsToMlr3", { mlr.types = c("cv", "holdout") for (i in seq_along(oml.types)) { - task$input$estimation.procedure[['type']]= oml.types[i] + task$input$estimation.procedure$type = oml.types[i] if (oml.types[i] == "holdout") { task$input$estimation.procedure$parameters$percentage = "50" } From af4fa54deae37e5d6f1b6d2888d1c6edf3ae3ae0 Mon Sep 17 00:00:00 2001 From: Bagrat Ter-Akopyan Date: Tue, 5 Nov 2019 10:21:13 +0100 Subject: [PATCH 4/5] modifed DESCRIPTION and NAMESPACE --- DESCRIPTION | 4 +++- NAMESPACE | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 085242a..5302a08 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,7 +29,9 @@ Suggests: rmarkdown, R.rsp, lintr (>= 1.0.1), - rex + rex, + mlr3, + mlr3survival Imports: backports (>= 1.1.0), BBmisc (>= 1.11), diff --git a/NAMESPACE b/NAMESPACE index 6e399bf..0f3fcf3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -31,10 +31,12 @@ export(clearOMLCache) export(convertMlrLearnerToOMLFlow) export(convertMlrTaskToOMLDataSet) export(convertOMLDataSetToMlr) +export(convertOMLDataSetToMlr3) export(convertOMLFlowToMlr) export(convertOMLMlrRunToBMR) export(convertOMLRunToBMR) export(convertOMLTaskToMlr) +export(convertOMLTaskToMlr3) export(deleteOMLObject) export(extractOMLStudyIds) export(getCachedOMLDataSetStatus) From d9f29afd29bd4711ac159dd58f539dcff1d21075 Mon Sep 17 00:00:00 2001 From: Bagrat Ter-Akopyan Date: Tue, 5 Nov 2019 10:48:20 +0100 Subject: [PATCH 5/5] modified .travis.yml --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 294c9de..fd36c96 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,6 +18,7 @@ r_packages: 
r_github_packages: - mlr-org/farff - mlr-org/mlr + - mlr-org/mlr3survival - berndbischl/BBmisc - berndbischl/ParamHelpers - r-lib/httr