diff --git a/.gitignore b/.gitignore
index 5b6a065..7b732e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 .Rhistory
 .RData
 .Ruserdata
+.DS_Store
diff --git a/DESCRIPTION b/DESCRIPTION
index 99cb4ed..c590b09 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,11 +1,36 @@
-Package: reinbo
+Package: ReinBo
 Type: Package
-Title: What the Package Does (Title Case)
+Title: Machine Learning Pipeline Search and Configuration with Bayesian Optimization Embedded Reinforcement Learning
 Version: 0.1.0
-Author: Who wrote it
-Maintainer: The package maintainer
-Description: More about what it does (maybe more than one line)
-    Use four spaces when indenting paragraphs within the Description.
-License: What license is it under?
+Authors@R: c(
+    person("Jiali", "Lin", email = "gruber_sebastian@t-online.de", role = c("aut", "ctb")),
+    person("Xudong", "Sun", email = "smilesun.east@gmail.com", role = c("aut", "cre"))
+    )
+Maintainer: Xudong Sun <smilesun.east@gmail.com>
+Description: Automatic machine learning: pipeline search and configuration
+    with Bayesian Optimization embedded Reinforcement Learning.
+License: BSD_2_clause + file LICENSE
 Encoding: UTF-8
-LazyData: true
\ No newline at end of file
+Depends:
+    R (>= 3.4.0)
+Imports:
+    R6,
+    mlr,
+    mlrCPO,
+    ParamHelpers,
+    BBmisc,
+    rlR,
+    hash,
+    mlrMBO,
+    smoof
+LazyData: true
+RoxygenNote: 6.1.1
+BugReports: https://github.com/smilesun/rlR/issues
+URL: https://github.com/smilesun/rlR
+SystemRequirements: The following python packages are needed to use the gym openAI environment: gym >= 0.10.5. At least one deep learning backend that keras requires (tensorflow, cntk, theano) should be installed on your computer, for example tensorflow >= 1.1.0 (tested on Ubuntu 14.04); the backend that keras requires can be installed by keras::install_keras(). Both dependencies can also be installed by the rlR::installDep() function. Note that the user should run 'reticulate::use_python("/usr/local/bin/python")' to specify the python path and 'reticulate::use_virtualenv("myenv")' to specify which virtual environment to use. By default, the package uses "~/anaconda3/bin/python" as its python version. For details, please refer to https://rstudio.github.io/reticulate/articles/versions.html
+Suggests:
+    devtools,
+    testthat,
+    knitr,
+    covr,
+    rmarkdown
+VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
index d75f824..5eb36cf 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1 +1,12 @@
-exportPattern("^[[:alpha:]]+")
+# Generated by roxygen2: do not edit by hand
+
+export(reinbo)
+import(BBmisc)
+import(ParamHelpers)
+import(R6)
+import(hash)
+import(mlr)
+import(mlrCPO)
+import(mlrMBO)
+import(rlR)
+import(smoof)
diff --git a/R/hello.R b/R/hello.R
index 3c562c4..4bdd285 100644
--- a/R/hello.R
+++ b/R/hello.R
@@ -16,3 +16,29 @@
 hello <- function() {
   print("Hello, world!")
 }
+
+getGconf = function() {
+  flag_debug = TRUE
+  conf_common = list(
+    NCVOuterIter = 5L,
+    NCVInnerIter = 5L,
+    measures = list(mlr::mmce),
+    repl = 10L,
+    prob_seed = 1L,
+    RLMaxEpisode = 2000L  # this number plays no role in itself; it only ensures RL can run long enough
+  )
+
+  conf_debug = list(
+    budget = 40L,
+    conf_tpot = list(generations = 1L, population_size = 3L, offspring_size = 3L, config_dict = 'TPOT light')
+  )
+
+  conf_full = list(
+    budget = 1000L,
+    # TPOT will evaluate population_size + generations × offspring_size pipelines in total.
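+    # e.g. with the full settings below: 10 + 20 * 50 = 1010 pipelines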
+    conf_tpot = list(generations = 20L, population_size = 10L, offspring_size = 50L)
+  )
+  if (flag_debug) return(c(conf_debug, conf_common))
+  return(c(conf_full, conf_common))
+}
diff --git a/R/reinbo_table_env.R b/R/reinbo_table_env.R
new file mode 100644
index 0000000..b7df4b0
--- /dev/null
+++ b/R/reinbo_table_env.R
@@ -0,0 +1,136 @@
+Q_table_Env = R6::R6Class(
+  "Q_table_Env",
+  inherit = rlR::Environment,
+  public = list(
+    step_cnt = NULL,
+    s_r_d_info = NULL,
+    task = NULL,
+    mbo_cache = NULL,  # store pipeline, hyperparameter set and corresponding performance for MBO
+    model_best_perf = NULL,  # best performance of the sampled model until now
+    model_trained = NULL,  # store all trained models (limited to budget)
+    budget = NULL,  # maximum number of models to be evaluated
+    measure = NULL,
+    cv_instance = NULL,
+    ctrl = NULL,
+    initialize = function(task, budget, measure, cv_instance, ctrl){
+      self$flag_continous = FALSE  # discrete action space
+      self$flag_tensor = FALSE  # no use of CNN
+      self$ctrl = ctrl
+      self$act_cnt = self$ctrl$g_act_cnt  # available operators/actions at each stage
+      self$state_dim = self$ctrl$g_state_dim
+      self$step_cnt = 0L
+      self$s_r_d_info = list(
+        state = "s",
+        reward = 0,
+        done = FALSE,
+        info = list())
+      self$task = task
+      self$mbo_cache = hash()
+      self$model_trained = NULL
+      self$budget = budget
+      self$measure = measure
+      self$cv_instance = cv_instance
+    },
+
+    evaluateArm = function(vec_arm) {
+      return(vec_arm)
+    },
+
+    # This function will be called at each step of the learning
+    step = function(action) {
+      operators = self$ctrl$g_operators[[names(self$ctrl$g_operators)[self$step_cnt + 1]]]
+      mod = action %% length(operators)
+      if (mod == 0){
+        operator = operators[length(operators)]
+      } else {
+        operator = operators[mod]
+      }
+      self$s_r_d_info[["state"]] = paste0(self$s_r_d_info[["state"]], "-[", operator, "]")
+      #print(self$s_r_d_info[["state"]])
+      self$s_r_d_info[["reward"]] = 0
+      self$step_cnt = self$step_cnt + 1L
+      if (self$step_cnt >= self$ctrl$g_max_depth) {
+        model = g_getRLPipeline(self$s_r_d_info[["state"]])
+        print(paste(model, collapse = " --> "))
+        # stop the RL agent if there is not enough budget left for this episode:
+        model_id = paste(model, collapse = "\t")
+        if (has.key(model_id, self$mbo_cache)){
+          require_budget = self$ctrl$g_mbo_iter*sum(getParamLengths(g_getParamSetFun(model)))
+        } else {
+          require_budget = (self$ctrl$g_init_design + self$ctrl$g_mbo_iter)*sum(getParamLengths(g_getParamSetFun(model)))
+        }
+        if(self$budget < require_budget) stop("total budget too small for reinbo table!")
+        if (self$budget - length(self$model_trained) < require_budget) {
+          self$agent$interact$idx_episode = self$agent$interact$maxiter
+          self$s_r_d_info[["done"]] = TRUE
+        } else {
+          # train the model with hyperparameter tuning:
+          self$tuning(model)
+          self$s_r_d_info[["reward"]] = self$model_best_perf  # best performance of the model until now
+          self$s_r_d_info[["done"]] = TRUE
+          #print(paste("Best Performance:", self$model_best_perf))
+        }
+      }
+      return(self$s_r_d_info)
+    },
+
+
+    # This function will be called at the beginning of the learning and at the end of each episode
+    reset = function() {
+      self$step_cnt = 0
+      self$s_r_d_info[["state"]] = "s"
+      self$s_r_d_info[["done"]] = FALSE
+      self$s_r_d_info
+    },
+
+
+    # Hyperparameter tuning for the generated model: return the best performance as reward and update mbo_cache
+    tuning = function(model) {
+      model_id = paste(model, collapse = "\t")  # model_id used as key in mbo_cache
+      ps = g_getParamSetFun(model)  # generate parameter set
+
+      # check if we have already evaluated this model
+
+      # if already in mbo_cache:
+      if (has.key(model_id, self$mbo_cache)){
+        previous_perf = max(self$mbo_cache[[model_id]][ , "y"])  # best performance until now
+        epis_unimproved = self$mbo_cache[[model_id]][1, "epis_unimproved"]  # number of episodes in which the performance has not improved
+        # if the performance of this model has not improved for more than 2 episodes,
+        # stop further hyperparameter tuning:
+        if (epis_unimproved > 2) {
+          self$model_best_perf = previous_perf
+        } else {
+          # else: use the parameter sets and performances in memory as initial design
+          design = self$mbo_cache[[model_id]][ , -length(self$mbo_cache[[model_id]])]
+          # run several iterations of MBO:
+          run = mbo_fun(self$task, model, design, self$measure, self$cv_instance, self$ctrl)
+          # best accuracy:
+          self$model_best_perf = run$y
+          # update mbo_cache:
+          self$mbo_cache[[model_id]] = run$opt.path$env$path
+          # add the result to self$model_trained:
+          new = run$opt.path$env$path$y[run$opt.path$env$dob != 0]
+          self$model_trained = c(self$model_trained, new)
+          # check if the performance of this model has improved in this episode:
+          if (run$y <= previous_perf) {
+            self$mbo_cache[[model_id]]["epis_unimproved"] = epis_unimproved + 1
+          } else {
+            self$mbo_cache[[model_id]]["epis_unimproved"] = 0
+          }
+        }
+      } else {
+
+        # if not in mbo_cache:
+        design = generateDesign(n = self$ctrl$g_init_design*sum(getParamLengths(ps)), par.set = ps)
+        run = mbo_fun(self$task, model, design, self$measure, self$cv_instance, self$ctrl)  # potential warning: generateDesign could only produce 3 points instead of 1000, see issue 442 of mlrMBO
+        self$model_best_perf = run$y
+        self$mbo_cache[[model_id]] = run$opt.path$env$path
+        self$mbo_cache[[model_id]]["epis_unimproved"] = 0
+        new = run$opt.path$env$path$y
+        self$model_trained = c(self$model_trained, new)
+      }
+    }
+  )
+)
+
+
diff --git a/R/reinbo_table_func.R b/R/reinbo_table_func.R
new file mode 100644
index 0000000..775461d
--- /dev/null
+++ b/R/reinbo_table_func.R
@@ -0,0 +1,109 @@
+# ML_ReinBo algorithm:
+opt.reinbo.table = function(task, budget, measure, init_val, train_set = NULL, conf, ctrl) {
+  subTask = task
+  if (!is.null(train_set)) subTask = subsetTask(task, train_set)
+  inner_loop = makeResampleInstance("CV", iters = getGconf()$NCVInnerIter, stratify = TRUE, subTask)
+  env = runQTable(subTask, budget, measure, inner_loop, init_val, conf, ctrl)
+  mmodel = getBestModel(env$mbo_cache)
+  return(list(mmodel = mmodel, env = env))
+}
+
+# Predict function: evaluate the best model on the test dataset
+lock_eval.reinbo.table = function(task, measure, train_set, test_set, best_model){
+  best_model = best_model$mmodel
+  lrn = genLearnerForBestModel(task, best_model, measure)
+  mod = train(lrn, task, subset = train_set)
+  pred = predict(mod, task, subset = test_set)
+  perf = performance(pred, measures = measure)
+  return(perf)
+}
+
+
+# Reinforcement learning part:
+#' @param ctrl pipeline configuration
+runQTable <- function(task, budget, measure, instance, init_val, conf, ctrl) {
+  env = Q_table_Env$new(task, budget, measure, instance, ctrl)
+  agent = initAgent(name = "AgentTable", env = env, conf = conf, q_init = init_val,
+                    state_names = ctrl$g_state_names,
+                    act_names_per_state = get_act_names_perf_state(ctrl$g_operators),
+                    vis_after_episode = FALSE)
+  agent$learn(getGconf()$RLMaxEpisode)
+  return(env)
+}
+
+# MBO function: hyperparameter tuning
+#' @param model character vector
+mbo_fun = function(task, model, design, measure, cv_instance, ctrl) {
+  ps = g_getParamSetFun(model)  # get the parameter set from the string representation of a model
+  object = makeSingleObjectiveFunction(
+    fn = function(x) {
+      -reinbo_mlr_fun(task, model, x, measure, cv_instance) + runif(1)/100000
+    },
+    par.set = ps,
+    has.simple.signature = FALSE,
+    minimize = FALSE
+  )
+  ctrlmbo = setMBOControlTermination(makeMBOControl(), iters = ctrl$g_mbo_iter * sum(getParamLengths(ps)))  # 2 times the parameter set size
+  run = mbo(object, design = design, control = ctrlmbo, show.info = FALSE)
+  ## in (function (fn, nvars, max = FALSE, pop.size = 1000, max.generations = 100, : Stopped because hard maximum generation limit was hit.
+  ## Genoud is a function that combines evolutionary search algorithms with derivative-based (Newton or quasi-Newton) methods to solve difficult optimization problems.
+  ## does not always occur: Warning in generateDesign(control$infill.opt.focussearch.points, ps.local,: generateDesign could only produce 20 points instead of 1000!
+  ## see https://github.com/mlr-org/mlrMBO/issues/442, being worked on in https://github.com/mlr-org/mlrMBO/pull/444
+  return(run)
+}
+
+
+# Mlr function: calculate the performance of a generated model given a specific param_set
+reinbo_mlr_fun = function(task, model, param_set, measure, cv_instance){
+  lrn = genLearner.reinbo(task, model, param_set, measure)
+  perf = resample(lrn, task, resampling = cv_instance, measures = measure, show.info = FALSE)$aggr
+  return(perf)
+}
+
+
+
+# To get the best model from the mbo_cache of the environment:
+getBestModel = function(cache){
+  models = keys(cache)
+  results = data.frame(model = 0, y = 0)
+  for (i in 1:length(models)) {
+    results[i, 1] = models[i]
+    results[i, 2] = max(cache[[models[i]]][, "y"])
+  }
+  key = results[results$y == max(results$y), "model"][1]
+  ps = cache[[key]]
+  ps = ps[(ps$y == max(ps$y)), (colnames(ps) != "epis_unimproved")][1, ]
+  return(data.frame(Model = key, ps))
+}
+
+genLearnerForBestModel = function(task, best_model, measure){
+  model = strsplit(as.character(best_model$Model), "\t")[[1]]
+  param_set = as.list(best_model)
+  param_set$Model = NULL
+  param_set$y = NULL
+  if (!is.null(param_set$C)) { param_set$C = 2^param_set$C }
+  if (!is.null(param_set$sigma)) { param_set$sigma = 2^param_set$sigma }
+  lrn = genLearner.reinbo(task, model, param_set, measure)
+  return(lrn)
+}
+
+
+genLearner.reinbo = function(task, model, param_set, measure){
+  p = getTaskNFeats(task)
+  lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('%s', par.vals = ps.learner)",
+                model[1], model[2], model[3])
+  lrn = gsub(pattern = "perc", x = lrn, replacement = "perc = param_set$perc", fixed = TRUE)
+  lrn = gsub(pattern = "rank", x = lrn, replacement = "rank = as.integer(max(1, round(p*param_set$rank)))", fixed = TRUE)
+  lrn = gsub(pattern = "NA %>>%", x = lrn, replacement = "", fixed = TRUE)
+  ps.learner = param_set
+  ps.learner$perc = NULL
+  ps.learner$rank = NULL
+  if (model[3] == "classif.ranger") {
+    p1 = p
+    if (!is.null(param_set$perc)) {p1 = max(1, round(p*param_set$perc))}
+    if (!is.null(param_set$rank)) {p1 = max(1, round(p*param_set$rank))}
+    ps.learner$mtry = max(1, as.integer(p1*param_set$mtry))
+  }
+  lrn = eval(parse(text = lrn))
+  return(lrn)
+}
diff --git a/R/reinbo_table_hyperpara_space.R b/R/reinbo_table_hyperpara_space.R
new file mode 100644
index 0000000..a2573b2
--- /dev/null
+++ b/R/reinbo_table_hyperpara_space.R
@@ -0,0 +1,42 @@
+##### Parameter set of operators for hyperparameter tuning:
+ps.ksvm = ParamHelpers::makeParamSet(
+  ParamHelpers::makeNumericParam("C", lower = -15, upper = 15, trafo = function(x) 2^x),
+  ParamHelpers::makeNumericParam("sigma", lower = -15, upper = 15, trafo = function(x) 2^x))
+
+ps.ranger = ParamHelpers::makeParamSet(
+  ParamHelpers::makeNumericParam("mtry", lower = 1/10, upper = 1/1.5),  ## range (p/10, p/1.5), where p is the number of features
+  ParamHelpers::makeNumericParam("sample.fraction", lower = .1, upper = 1))
+
+ps.xgboost = ParamHelpers::makeParamSet(
+  ParamHelpers::makeNumericParam("eta", lower = .001, upper = .3),
+  ParamHelpers::makeIntegerParam("max_depth", lower = 1L, upper = 15L),
+  ParamHelpers::makeNumericParam("subsample", lower = 0.5, upper = 1),
+  ParamHelpers::makeNumericParam("colsample_bytree", lower = 0.5, upper = 1),
+  ParamHelpers::makeNumericParam("min_child_weight", lower = 0, upper = 50)
+  )
+
+ps.kknn = ParamHelpers::makeParamSet(ParamHelpers::makeIntegerParam("k", lower = 1L, upper = 20L))
+
+ps.naiveBayes = ParamHelpers::makeParamSet(ParamHelpers::makeNumericParam("laplace", lower = 0.01, upper = 100))
+
+ps.filter = ParamHelpers::makeParamSet(ParamHelpers::makeNumericParam("perc", lower = .1, upper = 1))
+
+ps.pca = ParamHelpers::makeParamSet(ParamHelpers::makeNumericParam("rank", lower = .1, upper = 1))  ## range (p/10, p), where p is the number of features
+
+
+
+##### Get the parameter set for a generated model:
+g_getParamSetFun = function(model) {
+  ps.classif = sub(pattern = "classif", model[3], replacement = "ps")
+  ps.classif = eval(parse(text = ps.classif))  # hyperparameter set for the classifier
+  if (model[2] == "NA") {
+    return(ps.classif)
+  } else if (length(grep(pattern = "perc", x = model)) > 0) {
+    return(c(ps.classif, ps.filter))
+  } else {
+    return(c(ps.classif, ps.pca))
+  }
+}
+
+
+
diff --git a/R/reinbo_table_reinbo.R b/R/reinbo_table_reinbo.R
new file mode 100644
index 0000000..dd476e3
--- /dev/null
+++ b/R/reinbo_table_reinbo.R
@@ -0,0 +1,26 @@
+#' @title reinbo
+#' @description
+#' Automatic machine learning: pipeline search with Bayesian Optimization embedded Reinforcement Learning.
+#' @param task an mlr task
+#' @return a list with the best model found (mmodel) and the search environment (env).
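+#' @param custom_operators list with optional elements preprocess, filter and classifier to restrict the operators per stage; NULL uses all built-in operators
+#' @param budget maximum number of pipelines to evaluate
+#' @param train_set row indices of the task to be used for pipeline optimization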
+#' @export
+reinbo = function(task, custom_operators, budget, train_set) {
+  ## Parameters for the RL environment:
+  ctrl = list()
+  ctrl$g_operators = g_getOperatorList(custom_operators)
+  ctrl$g_max_depth = length(ctrl$g_operators)  # stages: Scaling --> Feature filtering --> Classification
+  ctrl$g_act_cnt = max(sapply(ctrl$g_operators, length))  # max number of available operators at each stage
+  ctrl$g_state_names = g_genStateList(ctrl$g_operators)
+  ctrl$g_state_dim = length(ctrl$g_state_names)
+  ## Parameters for BO_PROBE:
+  ctrl$g_init_design = 4  # initial design size for MBO: g_init_design*sum(getParamLengths(par.set))
+  ctrl$g_mbo_iter = 2  # iterations of MBO in each episode: g_mbo_iter*sum(getParamLengths(ps))
+
+  conf = rlR::getDefaultConf("AgentTable")
+  conf$set(policy.maxEpsilon = 1, policy.minEpsilon = 0.01, policy.aneal.steps = 60)
+  best_model = opt.reinbo.table(task, budget = budget, measure = list(mmce), train_set = train_set, init_val = -1, conf = conf, ctrl = ctrl)
+  best_model
+}
diff --git a/R/reinbo_table_utils.R b/R/reinbo_table_utils.R
new file mode 100644
index 0000000..459f671
--- /dev/null
+++ b/R/reinbo_table_utils.R
@@ -0,0 +1,57 @@
+#source("reinbo_table_reinbo.R")
+# Get the list of operators:
+#' @examples g_getOperatorList(NULL)
+g_getOperatorList = function(custom_operators) {
+  default_operators = list(
+    preprocess = c("cpoScale()", "cpoScale(scale = FALSE)", "cpoScale(center = FALSE)", "cpoSpatialSign()", "NA"),
+    filter = c("cpoFilterAnova(perc)", "cpoFilterKruskal(perc)", "cpoPca(center = FALSE, rank)", "cpoFilterUnivariate(perc)", "NA"),
+    classifier = c("classif.ksvm", "classif.ranger", "classif.kknn", "classif.xgboost", "classif.naiveBayes"))
+  for (stage in names(default_operators)){
+    if (!is.null(custom_operators[[stage]])) {
+      default_operators[stage] = custom_operators[stage]
+    }
+  }
+  return(default_operators)
+}
+
+# Generate the list of all potential states in the Q table:
+g_genStateList = function(operators) {
+  state_list = c("s")
+  last_stage = state_list
+  for (stage in c("preprocess", "filter")){
+    current_stage = c()
+    for (i in last_stage){
+      for (j in operators[[stage]]){
+        current_stage = c(current_stage, paste0(i, "-[", j, "]"))
+      }
+    }
+    state_list = c(state_list, current_stage)
+    last_stage = current_stage
+  }
+  return(state_list)
+}
+
+
+# Get the list of all potential actions at each state:
+get_act_names_perf_state = function(g_operators){
+  list = list("s" = g_operators$preprocess)
+  step1_states = sprintf("s-[%s]", g_operators$preprocess)
+  for (i in step1_states) {
+    text = sprintf("list$'%s' = g_operators$filter", i)
+    eval(parse(text = text))
+    for (j in sprintf("%s-[%s]", i, g_operators$filter)) {
+      text = sprintf("list$'%s' = g_operators$classifier", j)
+      eval(parse(text = text))
+    }
+  }
+  return(list)
+}
+
+# Get the model at the end of each episode:
+g_getRLPipeline = function(last_state) {
+  model = unlist(lapply(strsplit(last_state, "-")[[1]][-1],
+                        function(x) {
+                          x = gsub("[", x, replacement = "", fixed = TRUE)
+                          gsub("]", x, replacement = "", fixed = TRUE)}))
+  return(model)
+}
diff --git a/R/reinbo_zzz.R b/R/reinbo_zzz.R
new file mode 100644
index 0000000..c8b981a
--- /dev/null
+++ b/R/reinbo_zzz.R
@@ -0,0 +1,10 @@
+#' @import R6
+#' @import mlr
+#' @import mlrCPO
+#' @import ParamHelpers
+#' @import BBmisc
+#' @import rlR
+#' @import hash
+#' @import mlrMBO
+#' @import smoof
+NULL # nocov
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8e00a7b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+# Paper_2019_ReinBo
+This repository reproduces the experimental results of the following paper, which will be published in the "LNCS workshop proceedings" volume:
+
+@article{xudong2019reinbo, title={ReinBo: Machine Learning pipeline search and configuration with Bayesian Optimization embedded Reinforcement Learning}, author={Xudong Sun and Jiali Lin and Bernd Bischl}, journal={arxiv preprint, https://arxiv.org/abs/1904.05381}, number={1904.05381}, year={2019} }
+
+Currently, the code that reproduces the experiments in the paper lies in the directory "benchmark".
+
+# To run the experiments
+
+- install the required packages
+- learn how to use the CRAN package batchtools for large-scale benchmark studies
+- in the folder benchmark, execute main.R, then submit the jobs via the "batchtools" API
+- there are 600 jobs in total
\ No newline at end of file
diff --git a/benchmark/algo_mlrmbo.R b/benchmark/algo_mlrmbo.R
new file mode 100644
index 0000000..d982d6d
--- /dev/null
+++ b/benchmark/algo_mlrmbo.R
@@ -0,0 +1,9 @@
+algo_fun_mlrmbo = function(job, data, instance, measure = list(mmce)) {
+  resample_opt_lock(
+    instance$mlr_task_full,
+    outer_loop_rins = instance$rins,
+    func_opt = opt.mlrmbo,
+    args_opt = list(budget = getGconf()$budget),
+    func_eval = lock_eval.mlrmbo,
+    args_eval = list())
+}
\ No newline at end of file
diff --git a/benchmark/bt_conf.R b/benchmark/bt_conf.R
index ee4bc66..541c5bd 100644
--- a/benchmark/bt_conf.R
+++ b/benchmark/bt_conf.R
@@ -1,6 +1,6 @@
 flag_debug = F
 task_ids = 37
-if (!flag_debug) task_ids = c(14, 23, 37, 53, 3917, 9946, 9952, 9978, 125921, 146817, 146820)
+if (!flag_debug) task_ids = c(14, 23, 37, 53, 3917, 9946, 9952, 9978, 146817, 146820)
 getGconf = function() {
   conf_common = list(
     NCVOuterIter = 5L,
diff --git a/benchmark/func_smac.R b/benchmark/func_smac.R
new file mode 100644
index 0000000..17fcbf7
--- /dev/null
+++ b/benchmark/func_smac.R
@@ -0,0 +1,117 @@
+run = function(cs, budget = 1000) {
+  hh = reticulate::import("python_smac_space")
+  #scenario = Scenario({"run_obj": "quality", # we optimize quality (alternatively runtime)
+  #                     "runcount-limit": budget, # maximum function evaluations
+  #                     "cs": cs, # configuration space
+  #                     "deterministic": "true"
+  #                     })
+  budget = 100  # NOTE: this hard-coded value overrides the budget argument
+  scenario = hh$Scenario(list("run_obj" = "quality",  # we optimize quality (alternatively runtime)
+                              "runcount-limit" = budget,  # maximum function evaluations
+                              "cs" = cs,  # configuration space
+                              "deterministic" = "true",
+                              "shared_model" = TRUE  # deletable
+                              ))
+
+  # scenario$abort_on_first_run_crash = F
+
+  print("Optimizing! Depending on your machine, this might take a few minutes.")
+  np = reticulate::import("numpy")
+  #fd = hh$ExecuteTAFuncDict(toy_smac_obj)
+  #smac = hh$SMAC(scenario = scenario, rng = np$random$RandomState(as.integer(4)), tae_runner = toy_smac_obj)
+  #smac = hh$SMAC(scenario = scenario, rng = np$random$RandomState(as.integer(4)), tae_runner = fd)
+  reticulate::source_python('smac_obj.py')
+  source("smac_obj.R")
+  #smac = hh$SMAC(scenario = scenario, rng = np$random$RandomState(as.integer(4)), tae_runner = smac_obj_from_cfg)
+  py_fun = reticulate::r_to_py(toy_smac_obj, convert = FALSE)
+  #py_fun = reticulate::r_to_py(function(x) 1, convert = TRUE)
+  smac = hh$SMAC(scenario = scenario, rng = np$random$RandomState(as.integer(4)), tae_runner = py_fun)
+  smac$get_tae_runner()
+  incumbent = smac$optimize()  # problem
+  #inc_value = svm_from_cfg(incumbent)
+  incumbent
+  #print("Optimized Value: %.2f" % (inc_value))
+}
+
+
+test_run = function() {
+  cfg = reticulate::import("python_smac_space")
+  cs = cfg$cs
+  run(cs)
+}
+
+# Predict function: evaluate the best model on the test dataset
+lock_eval.smac = function(task, measure, train_set, test_set, best_model){
+  cfg = best_model
+  lrn = gen_mlrCPOPipe_from_smac_cfg(cfg)
+  mod = train(lrn, task, subset = train_set)
+  pred = predict(mod, task, subset = test_set)
+  mpred = performance(pred, measures = measure)
+  return(mpred)
+}
+
+
+gen_mlrCPOPipe_from_smac_cfg = function(cfg) {
+  #cfg = cfg.sample_configuration()
+  # convert ConfigSpace.configuration_space.ConfigurationSpace to ConfigSpace.configuration_space.Configuration
+  # For deactivated parameters, the configuration stores None-values, so we remove them.
+  #cfg = list(Model = "xgboost", Preprocess = "cpoScale(center = FALSE)", FeatureFilter = "cpoPca(center = FALSE, rank = rank_val)", lrn_xgboost_max_depth = 3, lrn_xgboost_eta = 0.03, fe_pca_rank = 0.5)  # for testing and debugging
+  model = cfg$Model
+  preprocess = cfg$Preprocess
+  pfilter = cfg$FeatureFilter
+  perc_val = NULL
+  rank_val = NULL
+
+  ##
+  extract_hyper_prefix = function(prefix = "lrn", cfg) {
+    names4lrn_hyp = grep(pattern = prefix, x = names(cfg), value = T)
+    ps.learner = cfg[names4lrn_hyp]  # evaluated later by the R function eval
+    pattern = paste0("(", prefix, "_[:alpha:]+_)*")
+    #ns4hyper = gsub(pattern = pattern, x = names4lrn_hyp, replacement="", ignore.case = T)
+    ns4hyper = stringr::str_replace(string = names4lrn_hyp, pattern = pattern, replacement="")
+    names(ps.learner) = ns4hyper
+    ps.learner
+  }
+  ##
+  ps.learner = extract_hyper_prefix("lrn", cfg)  # hyper-parameters for the learner must exist
+
+  names4Fe = grep(pattern = "fe", x = names(cfg), value = T)
+
+  p = mlr::getTaskNFeats(subTask)  # this subTask relies on a global variable
+
+  if(length(names4Fe) > 0) {
+    ps.Fe = extract_hyper_prefix("fe", cfg)
+    if(any(grepl(pattern = "perc", x = names(ps.Fe)))) {
+      name4featureEng_perc = grep(pattern = "perc", x = names(ps.Fe), value = T)
+      perc_val = ps.Fe[[name4featureEng_perc]]
+    }
+    if(any(grepl(pattern = "rank", x = names(ps.Fe)))) {
+      name4featureEng_rank = grep(pattern = "rank", x = names(ps.Fe), value = T)
+      rank_val = ceiling(ps.Fe[[name4featureEng_rank]] * p)
+    }
+  }
+
+  lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('classif.%s', par.vals = ps.learner)",
+                preprocess, pfilter, model)
+  lrn = gsub(pattern = "NA %>>%", x = lrn, replacement = "", fixed = TRUE)
+
+
+  # set mtry after reducing the number of dimensions
+  if (model == "ranger") {
+    p1 = p
+    if (!is.null(perc_val)) {p1 = max(1, round(p*perc_val))}
+    if (!is.null(rank_val)) {p1 = 
rank_val} + ps.learner$mtry = max(1, as.integer(p1*ps.learner$mtry)) + } + lrn = paste0("library(mlrCPO);library(magrittr);", lrn) + obj_lrn = eval(parse(text = lrn)) + return(obj_lrn) +} + +test_gen_mlrCPOPipe_from_smac_cfg = function() { + subTask = mlr::iris.task + cfg = reticulate::import("python_smac_space") + cfg = cfg$stub + lrn = gen_mlrCPOPipe_from_smac_cfg(cfg) + lrn +} diff --git a/benchmark/irace_space.txt b/benchmark/irace_space.txt new file mode 100644 index 0000000..44c8d2a --- /dev/null +++ b/benchmark/irace_space.txt @@ -0,0 +1,45 @@ +## Template for parameter description file for Iterated Race. +## +## The format is one parameter per line. Each line contains: +## +## 1: Name of the parameter. An unquoted alphanumeric string, +## example: ants + +## 2: Switch to pass the parameter. A quoted (possibly empty) string, +## if the value and the switch must be separated, add a space at +## the end of the string. Example : "--version1 --ants " + +## 3: Type. An unquoted single letter, among +## i: Integer, c: categorical, o: ordinal, r: real. + +## 4: For c and o: All possible values, that is, a variable number of +## quoted or unquoted strings separated by commas within +## parenthesis. Empty strings and strings containing commas or +## spaces must be quoted. +## For i,r: a pair of unquoted numbers representing minimum and +## maximum values. + +## 5: A conditional parameter can be defined according to the values of +## one or several other parameters. This is done by adding a +## character '|' followed by an R expression involving the names of +## other parameters. This expression must return TRUE if the +## condition is satisfied, FALSE otherwise. + +# 1: 2: 3: 4: 5: +Preprocess "--Preprocess" c ("cpoScale", "cpoScale.scale", "cpoScale.center", "cpoSpatialSign", "NA") +Filter "--Filter" c ("cpoFilterAnova.perc", "cpoFilterKruskal.perc", "cpoFilterUnivariate.perc", "cpoPca.rank", "NA") +Classify "--Classify" c ("kknn", "ksvm", "xgboost", "ranger", "naiveBayes") +perc "--perc" r (0.1,1) | Filter %in% c("cpoFilterAnova.perc", "cpoFilterKruskal.perc", "cpoFilterUnivariate.perc") +rank "--rank" r (0.1,1) | Filter == "cpoPca.rank" +k "--k" i (1,20) | Classify == "kknn" +C "--C" r (-15,15) | Classify == "ksvm" +sigma "--sigma" r (-15,15) | Classify == "ksvm" +mtry "--mtry" r (0.1,0.66666) | Classify == "ranger" +sample.fraction "--sample.fraction" r (0.1,1) | Classify == "ranger" +eta "--eta" r (0.001,0.3) | Classify == "xgboost" +max_depth "--max_depth" i (1,15) | Classify == "xgboost" +subsample "--subsample" r (0.5,1) | Classify == "xgboost" +colsample_bytree "--colsample_bytree" r (0.5,1) | Classify == "xgboost" +min_child_weight "--min_child_weight" r (0,50) | Classify == "xgboost" +laplace "--laplace" r (0.01,100) | Classify == "naiveBayes" + diff --git a/benchmark/mlrmbo_func.R b/benchmark/mlrmbo_func.R new file mode 100644 index 0000000..824268d --- /dev/null +++ b/benchmark/mlrmbo_func.R @@ -0,0 +1,102 @@ +# mlrMBO algorithm: +opt.mlrmbo = function(task, budget, measure, train_set = NULL) { + subTask = task + if (!is.null(train_set)) subTask = subsetTask(task, train_set) + inner_loop = makeResampleInstance("CV", iters = getGconf()$NCVInnerIter, stratify = TRUE, subTask) + run = mlrMBO_func(subTask, instance = inner_loop, measure, budget) + mmodel = run$x + return(mmodel) +} + + +# Predict function: evaluate best model on test dataset +lock_eval.mlrmbo = function(task, measure, train_set, test_set, best_model){ + best_model$sigma = 2^(best_model$sigma) + best_model$C = 
2^(best_model$C) + lrn = genLearner.mbo(task, best_model) + mod = train(lrn, task, subset = train_set) + pred = predict(mod, task, subset = test_set) + perf = performance(pred, measures = measure) + return(perf) +} + + +# hyper-parameter space +par.set = makeParamSet( + makeDiscreteParam('Pre', values = c("cpoScale()", "cpoScale(scale = FALSE)", "cpoScale(center = FALSE)", "cpoSpatialSign()", "no_operator")), + makeDiscreteParam('Filter', values = c("cpoFilterAnova(perc)", "cpoFilterKruskal(perc)", "cpoPca(center = FALSE, rank)", "cpoFilterUnivariate(perc)", "no_operator")), + makeNumericParam('perc', lower = .1, upper = 1, requires = quote(Filter %in% c("cpoFilterAnova(perc)", "cpoFilterKruskal(perc)", "cpoFilterUnivariate(perc)"))), + makeNumericParam('rank', lower = .1, upper = 1, requires = quote(Filter == "cpoPca(center = FALSE, rank)")), + makeDiscreteParam('Learner', values = c("kknn", "ksvm", "xgboost", "ranger", "naiveBayes")), + makeIntegerParam('k', lower = 1L, upper = 20L, requires = quote(Learner == "kknn")), + makeNumericParam("C", lower = -15, upper = 15, trafo = function(x) 2^x, requires = quote(Learner == 'ksvm')), + makeNumericParam("sigma", lower = -15, upper = 15, trafo = function(x) 2^x, requires = quote(Learner == 'ksvm')), + makeNumericParam("mtry", lower = 1/10, upper = 1/1.5, requires = quote(Learner == 'ranger')), + makeNumericParam("sample.fraction", lower = .1, upper = 1, requires = quote(Learner == 'ranger')), + makeNumericParam("eta", lower = .001, upper = .3, requires = quote(Learner == 'xgboost')), + makeIntegerParam("max_depth", lower = 1L, upper = 15L, requires = quote(Learner == 'xgboost')), + makeNumericParam("subsample", lower = .5, upper = 1, requires = quote(Learner == 'xgboost')), + makeNumericParam("colsample_bytree", lower = .5, upper = 1, requires = quote(Learner == 'xgboost')), + makeNumericParam("min_child_weight", lower = 0, upper = 50, requires = quote(Learner == 'xgboost')), + makeNumericParam("laplace", lower = .01, upper = 100, requires = quote(Learner == 'naiveBayes')) +) + + +# generate learner for task and specific parameter set +genLearner.mbo <- function(task, param_set){ + p = getTaskNFeats(task) + lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('classif.%s', par.vals = ps.learner)", + param_set$Pre, param_set$Filter, param_set$Learner) + lrn = gsub(pattern = "perc", x = lrn, replacement = "perc = param_set$perc", fixed = TRUE) + lrn = gsub(pattern = "rank", x = lrn, replacement = "rank = as.integer(max(1, round(p*param_set$rank)))", fixed = TRUE) + lrn = gsub(pattern = "no_operator %>>%", x = lrn, replacement = "", fixed = TRUE) + ps.learner = param_set + ps.learner$perc = NULL + ps.learner$rank = NULL + ps.learner$Pre = NULL + ps.learner$Filter = NULL + ps.learner$Learner = NULL + ps.learner[is.na(ps.learner)] = NULL + if (param_set$Learner == "ranger") { + p1 = p + if (!is.na(param_set$perc)) {p1 = max(1, round(p*param_set$perc))} + if (!is.na(param_set$rank)) {p1 = max(1, round(p*param_set$rank))} + ps.learner$mtry = max(1, as.integer(p1*param_set$mtry)) + } + lrn = eval(parse(text = lrn)) + return(lrn) +} + + +# using mlrMBO to optimize pipeline +mlrMBO_func <- function(task, instance, measure, budget){ + objfun = makeSingleObjectiveFunction( + fn = function(x) { + lrn = genLearner.mbo(task, x) + perf = resample(lrn, task, resampling = instance, measures = measure, show.info = FALSE)$aggr + return(perf) + }, + par.set = par.set, + has.simple.signature = FALSE, + minimize = TRUE + ) + ctrl = 
setMBOControlTermination(makeMBOControl(), iters = budget - 4*length(par.set$pars))  # mbo() generates an initial design of roughly 4 points per parameter when none is supplied, so the remaining budget goes to MBO iterations
+  run = mbo(objfun, control = ctrl, show.info = FALSE)
+  return(run)
+}
+
+
+
+
+# Test:
+# measure = list(mmce)
+# task = sonar.task
+# inner_loop = makeResampleInstance("CV", iters = 3, stratify = TRUE, task)
+# outer_loop_rins = makeResampleInstance("CV", iters = 5, stratify = TRUE, task)
+# opt_set = outer_loop_rins$train.inds[[1]]
+# lock_set = outer_loop_rins$test.inds[[1]]
+# mmodel = opt.mlrmbo(task, 66, measure)
+# perf = lock_eval.mlrmbo(task, measure, opt_set, lock_set, mmodel)
+
+
+
diff --git a/benchmark/python_smac_space.py b/benchmark/python_smac_space.py
new file mode 100644
index 0000000..a92522f
--- /dev/null
+++ b/benchmark/python_smac_space.py
@@ -0,0 +1,74 @@
+# Import ConfigSpace and different types of parameters
+from smac.configspace import ConfigurationSpace
+from ConfigSpace.hyperparameters import CategoricalHyperparameter, \
+    UniformFloatHyperparameter, UniformIntegerHyperparameter
+from ConfigSpace.conditions import InCondition
+
+# Import SMAC-utilities
+from smac.tae.execute_func import ExecuteTAFuncDict
+from smac.scenario.scenario import Scenario
+from smac.facade.smac_facade import SMAC
+
+cs = ConfigurationSpace()
+
+# Define the preprocessing operators and add them as "Preprocess" to our cs
+step1 = CategoricalHyperparameter("Preprocess", ['cpoScale()', 'cpoScale(scale = FALSE)', 'cpoScale(center = FALSE)', 'cpoSpatialSign()', 'NA'], default_value="NA")
+cs.add_hyperparameter(step1)
+
+step2 = CategoricalHyperparameter("FeatureFilter", ['cpoFilterAnova(perc=perc_val)', 'cpoFilterKruskal(perc=perc_val)', 'cpoFilterUnivariate(perc=perc_val)', 'cpoPca(center = FALSE, rank = rank_val)', 'NA'], default_value = "NA")
+cs.add_hyperparameter(step2)
+anova_perc = UniformFloatHyperparameter("fe_anova_perc", 0.1, 1, default_value = 0.1)
+kruskal_perc = UniformFloatHyperparameter("fe_kruskal_perc", 0.1, 1, default_value = 0.1)
+univar_perc = UniformFloatHyperparameter("fe_univar_perc", 0.1, 1, default_value = 0.1)
+pca_perc = UniformFloatHyperparameter("fe_pca_rank", 0, 0.9, default_value = 0.1)
+cs.add_hyperparameters([anova_perc, kruskal_perc, univar_perc, pca_perc])
+
+step2_child_anova = InCondition(child=anova_perc, parent=step2, values=["cpoFilterAnova(perc=perc_val)"])
+step2_child_kruskal = InCondition(child=kruskal_perc, parent=step2, values=["cpoFilterKruskal(perc=perc_val)"])
+step2_child_univar = InCondition(child=univar_perc, parent=step2, values=["cpoFilterUnivariate(perc=perc_val)"])
+step2_child_pca = InCondition(child=pca_perc, parent=step2, values=["cpoPca(center = FALSE, rank = rank_val)"])
+cs.add_conditions([step2_child_anova, step2_child_kruskal, step2_child_univar, step2_child_pca])
+
+step3 = CategoricalHyperparameter("Model", ['kknn', 'ksvm', 'ranger', 'xgboost', 'naiveBayes'])
+cs.add_hyperparameter(step3)
+
+hyper_kknn = UniformIntegerHyperparameter("lrn_kknn_k", 1, 19, default_value = 1)
+# note: C and sigma are searched on the raw scale here, while the R spaces search their exponents with a 2^x trafo
+hyper_ksvm_C = UniformFloatHyperparameter("lrn_svm_C", 2**(-15), 2**(15), default_value = 1)
+
+hyper_ksvm_sigma = UniformFloatHyperparameter("lrn_svm_sigma", 2**(-15), 2**(15), default_value = 1)
+
+hyper_ranger_mtry = UniformFloatHyperparameter("lrn_ranger_mtry", 0.1, 0.66666, default_value = 0.1)
+hyper_ranger_sample_fraction = UniformFloatHyperparameter("lrn_ranger_sample.fraction", 0.1, 1, default_value = 0.1)
+hyper_xgboost_eta = UniformFloatHyperparameter('lrn_xgboost_eta', 0.001, 0.3, default_value = 0.1)
+hyper_xgboost_max_depth = 
UniformIntegerHyperparameter('lrn_xgboost_max_depth', 1, 14, default_value = 5) +hyper_xgboost_subsample = UniformFloatHyperparameter('lrn_xgboost_subsample', 0.5, 1, default_value = 0.5) +hyper_xgboost_colsample_bytree = UniformFloatHyperparameter('lrn_xgboost_colsample_bytree', 0.5, 1, default_value = 0.5) +hyper_xgboost_min_child_weight = UniformFloatHyperparameter('lrn_xgboost_min_child_weight', 0, 50, default_value = 0.5) +hyper_naiveBayes = UniformFloatHyperparameter('lrn_naiveBayes_laplace', 0.01, 100, default_value = 0.01) + +cs.add_hyperparameters([hyper_kknn, hyper_ksvm_C, hyper_ksvm_sigma, hyper_ranger_mtry, hyper_ranger_sample_fraction, hyper_xgboost_eta, hyper_xgboost_max_depth, hyper_xgboost_subsample, hyper_xgboost_colsample_bytree, hyper_xgboost_min_child_weight, hyper_naiveBayes]) + +step3_child_kknn = InCondition(child = hyper_kknn, parent = step3, values = ["kknn"]) +#cs.add_conditions([step3_child_kknn]) +step3_child_ksvm_c = InCondition(child = hyper_ksvm_C, parent = step3, values = ["ksvm"]) +#cs.add_conditions([step3_child_ksvm_c]) +step3_child_ksvm_sigma = InCondition(child = hyper_ksvm_sigma, parent = step3, values = ["ksvm"]) +#cs.add_conditions([step3_child_ksvm_sigma]) +## +step3_child_ranger_mtry = InCondition(child = hyper_ranger_mtry, parent = step3, values = ["ranger"]) +step3_child_ranger_sample_fraction = InCondition(child = hyper_ranger_sample_fraction, parent = step3, values = ["ranger"]) +## + +step3_child__xgboost_eta = InCondition(child = hyper_xgboost_eta, parent = step3, values = ["xgboost"]) +step3_child__xgboost_max_depth = InCondition(child = hyper_xgboost_max_depth, parent = step3, values = ["xgboost"]) +step3_child__xgboost_subsample = InCondition(child = hyper_xgboost_subsample, parent = step3, values = ["xgboost"]) +step3_child__xgboost_colsample_bytree = InCondition(child = hyper_xgboost_colsample_bytree, parent = step3, values = ["xgboost"]) +step3_child__xgboost_min_child_weight = InCondition(child = hyper_xgboost_min_child_weight, parent = step3, values = ["xgboost"]) +## +step3_child__naiveBayes_laplace = InCondition(child = hyper_naiveBayes, parent = step3, values = ["naiveBayes"]) + +cs.add_conditions([step3_child_kknn, step3_child_ksvm_c, step3_child_ksvm_sigma, step3_child_ranger_mtry, step3_child_ranger_sample_fraction, step3_child__xgboost_eta, step3_child__xgboost_subsample, step3_child__xgboost_max_depth, step3_child__xgboost_colsample_bytree, step3_child__xgboost_min_child_weight, step3_child__naiveBayes_laplace]) + +cfg = cs.sample_configuration() +stub = {k : cfg[k] for k in cfg if cfg[k]} diff --git a/benchmark/reinbo_table_hyperpara_space.R b/benchmark/reinbo_table_hyperpara_space.R index bcb0d15..404f31d 100644 --- a/benchmark/reinbo_table_hyperpara_space.R +++ b/benchmark/reinbo_table_hyperpara_space.R @@ -21,7 +21,7 @@ ps.naiveBayes = makeParamSet(makeNumericParam("laplace", lower = 0.01, upper = 1 ps.filter = makeParamSet(makeNumericParam("perc", lower = .1, upper = 1)) -ps.pca = makeParamSet(makeNumericParam("rank", lower = .1, upper = 1)) ## range(p/10, p/1.5), p is the number of features +ps.pca = makeParamSet(makeNumericParam("rank", lower = .1, upper = 1)) ## range(p/10, p), p is the number of features diff --git a/benchmark/reinbo_table_test.R b/benchmark/reinbo_table_test.R index cdc00d6..81f0d7f 100644 --- a/benchmark/reinbo_table_test.R +++ b/benchmark/reinbo_table_test.R @@ -25,5 +25,5 @@ test_set = outer_loop$test.inds[[1]] conf = rlR::getDefaultConf("AgentTable") conf$set(policy.maxEpsilon = 1, 
policy.minEpsilon = 0.01, policy.aneal.steps = 60)
-best_model = opt.reinbo.table(task, budget = 1000L, measure = list(mmce), train_set = train_set, init_val = -1, conf = conf)
+best_model = opt.reinbo.table(task, budget = 100L, measure = list(mmce), train_set = train_set, init_val = -1, conf = conf)
 pred = lock_eval.reinbo.table(task, measure = list(mmce), train_set, test_set, best_model)
diff --git a/benchmark/smac_obj.R b/benchmark/smac_obj.R
new file mode 100644
index 0000000..b62ff2f
--- /dev/null
+++ b/benchmark/smac_obj.R
@@ -0,0 +1,17 @@
+# Objective to optimize:
+toy_smac_obj = function(cfg) {
+  print(cfg)
+  runif(1)
+}
+smac_objective = function(cfg) {
+  # model_index, model_list, perf_list, subTask, inner_loop and measure are defined in the scope where this function is called
+  model_index <<- model_index + 1
+  model_list[[model_index]] <<- cfg
+  lrn = gen_mlrCPOPipe_from_smac_cfg(cfg)
+  perf = resample(lrn, subTask, resampling = inner_loop, measures = measure, show.info = FALSE)$aggr
+  perf_list <<- c(perf_list, as.numeric(perf))
+  return(perf)
+}
+
+
+
diff --git a/benchmark/smac_obj.py b/benchmark/smac_obj.py
new file mode 100644
index 0000000..d6ec23a
--- /dev/null
+++ b/benchmark/smac_obj.py
@@ -0,0 +1,34 @@
+import rpy2
+import rpy2.robjects as robjects
+import rpy2.robjects.numpy2ri
+rpy2.robjects.numpy2ri.activate()
+robjects.conversion.py2ri = rpy2.robjects.numpy2ri
+from rpy2.robjects.packages import STAP
+# if rpy2 < 2.6.1 do:
+# from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
+# STAP = SignatureTranslatedAnonymousPackage
+with open('smac_obj.R', 'r') as f:
+    string = f.read()
+myfunc = STAP(string, "toy_smac_obj")
+def smac_obj_from_cfg(cfg):
+    """ Wraps the R objective function toy_smac_obj so that
+    SMAC can call it through rpy2.
+
+    Parameters:
+    -----------
+    cfg: Configuration (ConfigSpace.ConfigurationSpace.Configuration)
+        Configuration containing the parameters.
+        Configurations are indexable!
+
+    Returns:
+    --------
+    The value returned by the R objective for this configuration.
+    """
+    # For deactivated parameters, the configuration stores None-values.
+    # These cannot be passed to the R side, so we remove them.
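+    # note: the truthiness test below drops not only None entries but also any
+    # hyperparameter whose value is 0 or False (e.g. min_child_weight could
+    # legitimately be 0.0), so treat this filter as a rough heuristic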
+    cfg = {k : cfg[k] for k in cfg if cfg[k]}
+    return myfunc.toy_smac_obj(cfg)
diff --git a/benchmark/tpe_func.R b/benchmark/tpe_func.R
index e0a7525..75e8fb9 100644
--- a/benchmark/tpe_func.R
+++ b/benchmark/tpe_func.R
@@ -42,7 +42,7 @@ objective = function(args) {
 genLearner.tpe = function(args){
   model = args$Classifier$model
   args$Classifier$model = NULL
-  ps.learner = args$Classifier
+  ps.learner = args$Classifier  # a plain R list, which can be evaluated by eval
   filter = args$FeatureFilter$filter
   lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('classif.%s', par.vals = ps.learner)",
                 args$Preprocess, filter, model)
diff --git a/benchmark/tpe_space.py b/benchmark/tpe_space.py
new file mode 100644
index 0000000..2a93357
--- /dev/null
+++ b/benchmark/tpe_space.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on 15.2.2019
+
+@author: Jiali Lin
+"""
+
+from hyperopt import hp
+import hyperopt.pyll.stochastic
+
+# Define the search space
+space = {
+    # Step 1:
+    'Preprocess': hp.choice('pre',
+                            ['cpoScale()',
+                             'cpoScale(scale = FALSE)',
+                             'cpoScale(center = FALSE)',
+                             'cpoSpatialSign()',
+                             'NA']),
+
+
+    # Step 2:
+    'FeatureFilter': hp.choice('feature', [
+        {'filter': 'cpoFilterAnova(perc)',
+         'perc': hp.uniform('ano_per', 0.1, 1)},
+
+        {'filter': 'cpoFilterKruskal(perc)',
+         'perc': hp.uniform('kru_per', 0.1, 1)},
+
+        {'filter': 'cpoFilterUnivariate(perc)',
+         'perc': hp.uniform('uni_per', 0.1, 1)},
+
+        {'filter': 'cpoPca(center = FALSE, rank)',
+         'rank': hp.uniform('pca_rank', 0.1, 1)},
+
+        {'filter': 'NA'}]),
+
+
+    # Step 3:
+    'Classifier': hp.choice('classify_model', [
+        {'model': 'kknn',
+         'k': 1 + hp.randint('kknn_k', 19)},
+
+        {'model': 'ksvm',
+         'C': hp.uniform('ksvm_C', 2**(-15), 2**(15)),
+         'sigma': hp.uniform('ksvm_sigma', 2**(-15), 2**(15))},
+
+        {'model': 'ranger',
+         'mtry': hp.uniform('ranger_mtry', 0.1, 0.66666),
+         'sample.fraction': hp.uniform('ranger_fra', 0.1, 1)},
+
+        {'model': 'xgboost',
+         'eta': hp.uniform('xgboost_eta', 0.001, 0.3),
+         'max_depth': 1 + hp.randint('xgboost_depth', 14),
+         'subsample': hp.uniform('xgboost_sub', 0.5, 1),
+         'colsample_bytree': hp.uniform('xgboost_col', 0.5, 1),
+         'min_child_weight': hp.uniform('xgboost_min', 0, 50)},
+
+        {'model': 'naiveBayes',
+         'laplace': hp.uniform('bay_laplace', 0.01, 100)}
+
+        ])}
+
+
+
+# Sample one configuration:
+# print(hyperopt.pyll.stochastic.sample(space))
+# e.g. {'Classifier': {'model': 'ranger', 'mtry': 0.574453305013119, 'sample.fraction': 0.8656502995483121},
+#       'FeatureFilter': {'filter': 'cpoFilterAnova(perc)', 'perc': 0.3726989872044636}, 'Preprocess': 'NA'}
diff --git a/demo.Rmd b/demo.Rmd
new file mode 100644
index 0000000..3ba7e2b
--- /dev/null
+++ b/demo.Rmd
@@ -0,0 +1,80 @@
+---
+title: "Poster_demo"
+author: "Jiali Lin"
+date: "9/12/2019"
+output:
+  pdf_document: default
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Introduction
+**ReinBo** is an AutoML package in R that optimizes machine learning pipelines with Bayesian Optimization embedded Reinforcement Learning.
+
+## Installation
+* Clone or download this repository
+
+* Open **reinbo.Rproj** with RStudio
+
+* Click **Build** -> **Install and Restart** to install the ReinBo package
+
+## Using ReinBo
+Load the package, and ReinBo is ready to optimize a pipeline.
+```{r eval = FALSE}
+library(ReinBo)
+best_model = reinbo(task = mlrTask, budget = 1000L, train_set = train_set, custom_operators = NULL)
+```
+- **task**: the task must be an **mlr task** (currently only classification tasks are accepted)
+- **budget**: maximum number of pipelines to evaluate
+- **train_set**: row indices of the data set used for pipeline optimization
+- **custom_operators**: set to **NULL** to use all default operators for the pipeline
+
+A typical ML pipeline consists of 3 stages: preprocessing, filtering and classification. Below is a list of the current built-in operators at each stage that come with ReinBo:
+
+- **preprocess**: "cpoScale()", "cpoScale(scale = FALSE)", "cpoScale(center = FALSE)", "cpoSpatialSign()", "NA";
+
+- **filter**: "cpoFilterAnova(perc)", "cpoFilterKruskal(perc)", "cpoPca(center = FALSE, rank)", "cpoFilterUnivariate(perc)", "NA";
+
+- **classifier**: "classif.ksvm", "classif.ranger", "classif.kknn", "classif.xgboost", "classif.naiveBayes";
+
+where "NA" indicates that no operator is applied at that stage.
+
+Users can also select a subset of operators by setting e.g.:
+```{r}
+custom_operators = list(preprocess = c("cpoScale()", "cpoSpatialSign()", "NA"),
+                        filter = NULL, # using all filtering operators
+                        classifier = c("classif.kknn", "classif.naiveBayes"))
+```
+
+## Example
+```{r message=FALSE, warning=FALSE, results='hide'}
+library(ReinBo)
+library(mlrCPO)
+library(OpenML)
+task = convertOMLTaskToMlr(getOMLTask(37))$mlr.task %>>% cpoDummyEncode(reference.cat = FALSE)
+split = makeResampleInstance("Holdout", task)
+train_set = split$train.inds[[1]]
+test_set = split$test.inds[[1]]
+best_model = reinbo(task = task, budget = 100L, train_set = train_set, custom_operators = NULL)
+print(best_model$mmodel)
+```
+```{r echo=FALSE}
+print(best_model$mmodel)
+```
+
+**y** in the result is the mmce (mean misclassification error) of the best model.
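+
+To estimate how the selected pipeline generalizes, evaluate it on the held-out split. A minimal sketch: it reuses the internal helper `lock_eval.reinbo.table()` from `R/reinbo_table_func.R`, which is not exported (hence the `:::`).
+```{r eval = FALSE}
+# retrain the best pipeline on train_set and report mmce on test_set
+perf = ReinBo:::lock_eval.reinbo.table(task, measure = list(mmce),
+                                       train_set, test_set, best_model)
+print(perf)
+```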
+ + + + + diff --git a/demo/agg.R b/demo/agg.R new file mode 100644 index 0000000..588735c --- /dev/null +++ b/demo/agg.R @@ -0,0 +1,19 @@ +tb = getJobPars(findDone()) +getJobPars(findQueued()) +getJobPars(findSubmitted()) +unique(tb[, "problem"]) +ids = tb[(algorithm == "reinbo_table") & (problem == "LED-display-domain-7digit"), job.id, with = T] +ids = tb[(algorithm == "reinbo_table") & (problem == "wdbc"), job.id, with = T] +ids = findDone() +reslist = reduceResultsList(ids = ids, fun = function(job, res) { + res2 = list() + res2$prob.name = job$prob.name + res2$algo.name = job$algo.name + res2$job.id = job$job.id + res2$repl = job$repl + res2$mmce = mean(res$vec_mpred) + #res2$model = res$list_mmodel + res2 +}) +dt_res = rbindlist(reslist) +saveRDS(dt_res, file = "reinbo_new_cut_episode.rds") diff --git a/demo/algo_auto_sklearn.R b/demo/algo_auto_sklearn.R new file mode 100644 index 0000000..cdc0fe9 --- /dev/null +++ b/demo/algo_auto_sklearn.R @@ -0,0 +1,8 @@ +algo_fun_auto_sklearn = function(job, data, instance, measure = list(mmce), flag_light) { + resample_opt_lock(instance$mlr_task_full, + outer_loop_rins = instance$rins, + func_opt = opt.auto.sklearn, + args_opt = list(budget = getGconf()$budget, job_id = job$job.id, flag_light = flag_light), + func_eval = lock_eval.auto.sklearn, + args_eval = list()) +} diff --git a/demo/algo_irace.R b/demo/algo_irace.R new file mode 100644 index 0000000..296da1b --- /dev/null +++ b/demo/algo_irace.R @@ -0,0 +1,10 @@ +library(irace) +algo_fun_irace = function(job, data, instance, measure = list(mmce)) { + resample_opt_lock( + instance$mlr_task_full, + outer_loop_rins = instance$rins, + func_opt = opt.irace, + args_opt = list(budget = getGconf()$budget), + func_eval = lock_eval.irace, + args_eval = list()) +} \ No newline at end of file diff --git a/demo/algo_mlrmbo.R b/demo/algo_mlrmbo.R new file mode 100644 index 0000000..d982d6d --- /dev/null +++ b/demo/algo_mlrmbo.R @@ -0,0 +1,9 @@ +algo_fun_mlrmbo = function(job, data, instance, measure = list(mmce)) { + resample_opt_lock( + instance$mlr_task_full, + outer_loop_rins = instance$rins, + func_opt = opt.mlrmbo, + args_opt = list(budget = getGconf()$budget), + func_eval = lock_eval.mlrmbo, + args_eval = list()) +} \ No newline at end of file diff --git a/demo/algo_random_search.R b/demo/algo_random_search.R new file mode 100644 index 0000000..36e3080 --- /dev/null +++ b/demo/algo_random_search.R @@ -0,0 +1,9 @@ +algo_fun_random_search = function(job, data, instance, measure = list(mmce)) { + resample_opt_lock( + instance$mlr_task_full, + outer_loop_rins = instance$rins, + func_opt = opt.random.search, + args_opt = list(budget = getGconf()$budget), + func_eval = lock_eval.random.search, + args_eval = list()) +} \ No newline at end of file diff --git a/demo/algo_reinbo_table.R b/demo/algo_reinbo_table.R new file mode 100644 index 0000000..dd3fa36 --- /dev/null +++ b/demo/algo_reinbo_table.R @@ -0,0 +1,17 @@ +algo_fun_reinbo_table = function(job, data, instance, measure, init_val = -1, conf4agent = NULL) { + if (is.null(conf4agent)) { + conf = rlR::getDefaultConf("AgentTable") + if (init_val == -1) { + conf$set(policy.maxEpsilon = 1, policy.minEpsilon = 0.01, policy.aneal.steps = 60) + } + conf4agent = conf + } + + resample_opt_lock( + instance$mlr_task_full, + outer_loop_rins = instance$rins, + func_opt = opt.reinbo.table, + args_opt = list(budget = getGconf()$budget, init_val = init_val, conf = conf4agent), + func_eval = lock_eval.reinbo.table, + args_eval = list()) +} diff --git a/demo/algo_tpe.R 
b/demo/algo_tpe.R new file mode 100644 index 0000000..69d5895 --- /dev/null +++ b/demo/algo_tpe.R @@ -0,0 +1,9 @@ +algo_fun_tpe = function(job, data, instance, measure = list(mmce)) { + resample_opt_lock( + instance$mlr_task_full, + outer_loop_rins = instance$rins, + func_opt = opt.tpe, + args_opt = list(budget = getGconf()$budget), + func_eval = lock_eval.tpe, + args_eval = list()) +} diff --git a/demo/algo_tpot.R b/demo/algo_tpot.R new file mode 100644 index 0000000..be450a2 --- /dev/null +++ b/demo/algo_tpot.R @@ -0,0 +1,8 @@ +algo_fun_tpot = function(job, data, instance, measure = list(mmce)) { + resample_opt_lock(instance$mlr_task_full, + outer_loop_rins = instance$rins, + func_opt = opt.tpot, + args_opt = list(budget = getGconf()$budget), + func_eval = lock_eval.tpot, + args_eval = list()) +} diff --git a/demo/auto_sklearn_func.R b/demo/auto_sklearn_func.R new file mode 100644 index 0000000..fa25811 --- /dev/null +++ b/demo/auto_sklearn_func.R @@ -0,0 +1,99 @@ +makeRLearner.classif.autosklearn = function() { + makeRLearnerClassif( + cl = "classif.autosklearn", + package = "reticulate", + # For full paramset see https://automl.github.io/auto-sklearn/master/api.html + # Attention: Defaults are not exactly as in autosklearn + par.set = makeParamSet( + makeIntegerLearnerParam("time_left_for_this_task", lower = 1L, upper = Inf, default = 3600L), + makeIntegerLearnerParam("per_run_time_limit", lower = 1L, upper = Inf, default = 360L), + makeIntegerLearnerParam("initial_configurations_via_metalearning", lower = 0L, upper = Inf, default = 25L), + makeUntypedLearnerParam("include_estimators", default = NULL, special.vals = list(NULL)), + makeUntypedLearnerParam("include_preprocessors", default = NULL, special.vals = list(NULL)), + makeUntypedLearnerParam("exclude_estimators", default = NULL, special.vals = list(NULL)), + makeUntypedLearnerParam("exclude_preprocessors", default = NULL, special.vals = list(NULL)), + makeIntegerLearnerParam("ensemble_size", lower = 0L, upper = Inf, default = 0L), + makeIntegerLearnerParam("ensemble_nbest", lower = 0L, upper = Inf, default = 50L), + makeLogicalLearnerParam("delete_tmp_folder_after_terminate", default = FALSE), + makeLogicalLearnerParam("delete_output_folder_after_terminate", default = FALSE), + makeLogicalLearnerParam("shared_mode", default = FALSE), + makeUntypedLearnerParam("tmp_folder", default = NULL, special.vals = list(NULL)), + makeUntypedLearnerParam("output_folder", default = NULL, special.vals = list(NULL)), + makeIntegerLearnerParam("runcount_limit", lower = 1L, upper = 10L, default = 5L), + makeUntypedLearnerParam("smac_scenario_args", default = NULL, special.vals = list(NULL)), + makeDiscreteLearnerParam("resampling_strategy", default = "cv", values = c("cv", "partial-cv", "holdout-iterative-fit", "holdout")), + makeUntypedLearnerParam("resampling_strategy_arguments", default = NULL, special.vals = list(NULL)) + ), + properties = c("twoclass", "multiclass", "numerics", "prob", "missings", "factors"), + name = "Autosklearn", + short.name = "autosklearn", + note = "Defaults deviate from autosklearn defaults" + ) +} + + +trainLearner.classif.autosklearn = function(.learner, .task, .subset, .weights = NULL, ...) { + + autosklearn = import("autosklearn") + classifier = autosklearn$classification$AutoSklearnClassifier(...) 
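+  # `...` forwards the par.vals set on the mlr learner (time limits, folders,
+  # SMAC scenario arguments, etc.) to the Python AutoSklearnClassifier via reticulate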
+
+  train = getTaskData(.task, .subset, target.extra = TRUE)
+  feat.type = ifelse(vlapply(train$data, is.factor), "Categorical", "Numerical")
+
+  classifier$fit(as.matrix(train$data), train$target, feat_type = feat.type)
+  classifier$fit_ensemble(train$target, ensemble_size = 1)
+  classifier$refit(as.matrix(train$data), train$target)  ## Refit for the cv method
+  return(classifier)
+}
+
+predictLearner.classif.autosklearn = function(.learner, .model, .newdata, ...) {
+  as.factor(.model$learner.model$predict(as.matrix(.newdata)))
+}
+
+
+
+# Auto-sklearn algorithm:
+opt.auto.sklearn = function(task, budget, measure, job_id, train_set, flag_light) {
+  # job_id is used for the folder name
+  #randstr = stringi::stri_replace(toString(rnorm(1)), replacement = "", regex ="\\.")
+  if(flag_light) {
+    g_classifiers = list("random_forest", "k_nearest_neighbors", "libsvm_svc", "xgradient_boosting", "multinomial_nb")
+    #g_preprocess = list("pca", "no_preprocessing", "select_percentile_classification", "normalize", "standardize", "none", "minmax", "variance_threshold")  # the data preprocessing methods do not work
+    g_preprocess = list("pca", "no_preprocessing", "select_percentile_classification")
+  }
+  else {
+    g_classifiers = NULL
+    g_preprocess = NULL
+  }
+
+  automl = makeLearner("classif.autosklearn",
+                       time_left_for_this_task = 100000L,
+                       per_run_time_limit = 25L,
+                       ensemble_size = 0,
+                       initial_configurations_via_metalearning = 0L,
+                       resampling_strategy = "cv",
+                       include_preprocessors = g_preprocess,
+                       include_estimators = g_classifiers,
+                       # the default tmp_folder name would cause a "no space left" error
+                       tmp_folder = paste0("../autosklearn_tmp/autosklearn_tmp", job_id),  # it makes more sense to use separate folders since different job_ids are different problems
+                       output_folder = paste0("../autosklearn_tmp/autosklearn_out", job_id),
+                       delete_tmp_folder_after_terminate = FALSE,  # to use together with shared_mode = T
+                       #task 2 failed - "FileExistsError: [Errno 17] File exists: '../autosklearn_tmp/autosklearn_tmp1'
+                       #delete_tmp_folder_after_terminate=T,  ## would cause a "file exists" error
+                       delete_output_folder_after_terminate = FALSE,
+                       #delete_output_folder_after_terminate=T,
+                       shared_mode = TRUE,
+                       resampling_strategy_arguments = list(folds = getGconf()$NCVInnerIter),
+                       smac_scenario_args = list(runcount_limit = budget)
+  )
+  mmodel = train(automl, task, subset = train_set)
+  return(mmodel)
+}
+
+# Predict the performance of the best model on the test/lock dataset:
+lock_eval.auto.sklearn = function(task, measure, train_set = NULL, test_set, best_model) {
+  prediction = predict(best_model, task, subset = test_set)
+  mpred = performance(prediction, measures = measure)
+  return(mpred)
+}
+
diff --git a/demo/auto_sklearn_test.R b/demo/auto_sklearn_test.R
new file mode 100644
index 0000000..bc808b3
--- /dev/null
+++ b/demo/auto_sklearn_test.R
@@ -0,0 +1,66 @@
+rm(list = ls())
+library(mlr)
+library(mlrCPO)
+library(reticulate)
+library(BBmisc)
+library(OpenML)
+source("system.R")
+source("auto_sklearn_func.R")
+
+set.seed(1)
+g_classifiers = list("random_forest", "k_nearest_neighbors", "libsvm_svc", "xgradient_boosting", "multinomial_nb")
+g_preprocess = list("pca", "no_preprocessing", "select_percentile_classification")
+#g_preprocess = list("pca", "no_preprocessing", "select_percentile_classification", "standardize", "none", "minmax", "variance_threshold")  # the data preprocessing methods do not work
+task = convertOMLTaskToMlr(getOMLTask(37))$mlr.task %>>% cpoDummyEncode(reference.cat = FALSE)
+outer_loop_CV5 = makeResampleInstance("CV", iters = 5, task = task)
iters = 5, task = task) + +train_set = outer_loop_CV5$train.inds[[1]] +test_set = outer_loop_CV5$test.inds[[1]] +train_data = getTaskData(task, train_set, target.extra = TRUE) +test_data = getTaskData(task, test_set, target.extra = TRUE) + +automl = makeLearner("classif.autosklearn", + time_left_for_this_task = 1000000L, + per_run_time_limit = 25L, + ensemble_size = 0, + include_preprocessors = g_preprocess, + include_estimators = g_classifiers, + initial_configurations_via_metalearning = 0L, + resampling_strategy = "cv", + resampling_strategy_arguments = list(folds = 5L), + smac_scenario_args = list(runcount_limit = 5L) +) + +a = Sys.time() +mod = train(automl, task, subset = train_set) +prediction = predict(mod, task, subset = test_set) +pred = performance(prediction) +print(Sys.time() - a) + + + + + +# autosklearn = import("autosklearn") +# sklearn = import("sklearn") +# automl = autosklearn$classification$AutoSklearnClassifier( +# time_left_for_this_task = 1000L, +# per_run_time_limit = 200L, +# ensemble_size = 0, +# # include_preprocessors = g_preprocess, +# # include_estimators = g_classifiers, +# initial_configurations_via_metalearning = 0L, +# resampling_strategy = "cv", +# resampling_strategy_arguments = list(folds = 5L), +# smac_scenario_args = dict(runcount_limit = 5L)) + + +# automl$fit(train_data$data, train_data$target, metric=autosklearn$metrics$accuracy) +# automl$fit_ensemble(train_data$target, ensemble_size = 1) +# automl$refit(train_data$data, train_data$target) +# predictions = automl$predict(test_data$data) +# pred = sklearn$metrics$accuracy_score(test_data$target, predictions) + + + + diff --git a/demo/bt_conf.R b/demo/bt_conf.R new file mode 100644 index 0000000..541c5bd --- /dev/null +++ b/demo/bt_conf.R @@ -0,0 +1,52 @@ +flag_debug = F +task_ids = 37 +if (!flag_debug) task_ids = c(14, 23, 37, 53, 3917, 9946, 9952, 9978, 146817, 146820) +getGconf = function() { + conf_common = list( + NCVOuterIter = 5L, + NCVInnerIter = 5L, + measures = list(mlr::mmce), + repl = 10L, + prob_seed = 1L, + RLMaxEpisode = 2000L # this number does not play a role, it only ensures RL could run for sufficient time + ) + + conf_debug = list( + budget = 40L, + conf_tpot = list(generations = 1L, population_size = 3L, offspring_size = 3L, config_dict = 'TPOT light') + ) + + conf_full = list( + budget = 1000L, + # TPOT will evaluate population_size + generations × offspring_size pipelines in total. 
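+    # with the settings below that is 10 + 20 * 50 = 1010 pipelines, i.e. the
+    # evolutionary search slightly overshoots the nominal budget of 1000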
+    conf_tpot = list(generations = 20L, population_size = 10L, offspring_size = 50L)
+  )
+  if (flag_debug) return(c(conf_debug, conf_common))
+  return(c(conf_full, conf_common))
+}
+
+resources_light = list(
+  walltime = 60L*60*8,
+  memory = 1024L*2,
+  ntasks = 1L,
+  ncpus = 1L,
+  nodes = 1L,
+  clusters = "serial")
+
+resources_bigmem = list(
+  walltime = 60L*60*8,
+  memory = 1024L*4,
+  ntasks = 1L,
+  ncpus = 1L,
+  nodes = 1L,
+  clusters = "serial")
+
+
+
+resources = list(
+  walltime = 60L*60*12,
+  memory = 1024L*2,
+  ntasks = 1L,
+  ncpus = 1L,
+  nodes = 1L,
+  clusters = "serial")
diff --git a/demo/bt_post_hoc.R b/demo/bt_post_hoc.R new file mode 100644 index 0000000..bd46007 --- /dev/null +++ b/demo/bt_post_hoc.R @@ -0,0 +1,10 @@
+library(batchtools)
+source("system.R")
+reg = loadRegistry("reg_test", writeable = TRUE, work.dir = getwd())
+refun = function(job, res) {
+  cv5 = mean(res$vec_mpred)
+  list(cv5 = cv5)
+}
+
+res = reduceResultsDataTable(ids = findDone(), fun = refun)
+unwrap(res, sep = ".")
diff --git a/demo/bt_submit.R b/demo/bt_submit.R new file mode 100644 index 0000000..7865061 --- /dev/null +++ b/demo/bt_submit.R @@ -0,0 +1,7 @@
+submitJobs(getJobPars()[algorithm != "reinbo_table", job.id, with = TRUE])
+ids_sk = getJobPars()[algorithm == "auto_sklearn", job.id, with = TRUE]  # don't run testJob for autosklearn, since you cannot easily kill it
+submitJobs(ids_sk)
+ids = getJobPars()[algorithm == "reinbo_table", job.id, with = TRUE]
+submitJobs(ids)
+getJobPars()[(algorithm == "auto_sklearn") & (problem == "diabetes"), job.id, with = TRUE]
+getJobPars()[problem == "diabetes", job.id, with = TRUE]
diff --git a/demo/debug_auto_sklearn_test.R b/demo/debug_auto_sklearn_test.R new file mode 100644 index 0000000..a0c7585 --- /dev/null +++ b/demo/debug_auto_sklearn_test.R @@ -0,0 +1,70 @@
+rm(list = ls())
+library(mlr)
+library(mlrCPO)
+library(reticulate)
+library(BBmisc)
+library(OpenML)
+use_condaenv("w_env")
+source("auto_sklearn_func.R")
+
+g_classifiers = list("random_forest", "k_nearest_neighbors", "libsvm_svc", "xgradient_boosting", "multinomial_nb")
+g_preprocess = list("pca", "no_preprocessing", "select_percentile_classification")
+
+
+
+task = convertOMLTaskToMlr(getOMLTask(3))$mlr.task %>>% cpoDummyEncode(reference.cat = FALSE)
+#head(getTaskData(task, target.extra = TRUE)$target)
+set.seed(1)
+outer_loop_CV5 = makeResampleInstance("CV", iters = 5, task = task)
+
+train_set = outer_loop_CV5$train.inds[[1]]
+test_set = outer_loop_CV5$test.inds[[1]]
+train_data = getTaskData(task, train_set, target.extra = TRUE)
+test_data = getTaskData(task, test_set, target.extra = TRUE)
+
+set.seed(1)
+automl = makeLearner("classif.autosklearn",
+  time_left_for_this_task = 1000000L,
+  per_run_time_limit = 25L,
+  ensemble_size = 0,
+  #include_preprocessors = g_preprocess,
+  #include_estimators = g_classifiers,
+  initial_configurations_via_metalearning = 0L,
+  resampling_strategy = "cv",
+  resampling_strategy_arguments = list(folds = 5L),
+  smac_scenario_args = list(runcount_limit = 5L)
+)
+
+a = Sys.time()
+mod = train(automl, task, subset = train_set)
+prediction = predict(mod, task, subset = test_set)
+pred = performance(prediction)
+print(Sys.time() - a)
+
+
+
+
+
+# autosklearn = import("autosklearn")
+# sklearn = import("sklearn")
+# automl = autosklearn$classification$AutoSklearnClassifier(
+# time_left_for_this_task = 1000L,
+# per_run_time_limit = 200L,
+# ensemble_size = 0,
+# # include_preprocessors = g_preprocess,
+# # include_estimators = g_classifiers,
+#
initial_configurations_via_metalearning = 0L, +# resampling_strategy = "cv", +# resampling_strategy_arguments = list(folds = 5L), +# smac_scenario_args = dict(runcount_limit = 5L)) + + +# automl$fit(train_data$data, train_data$target, metric=autosklearn$metrics$accuracy) +# automl$fit_ensemble(train_data$target, ensemble_size = 1) +# automl$refit(train_data$data, train_data$target) +# predictions = automl$predict(test_data$data) +# pred = sklearn$metrics$accuracy_score(test_data$target, predictions) + + + + diff --git a/demo/demo.R b/demo/demo.R new file mode 100644 index 0000000..2a93181 --- /dev/null +++ b/demo/demo.R @@ -0,0 +1,23 @@ +library(magrittr) +library(mlrCPO) +library(OpenML) +library(ReinBo) +task = convertOMLTaskToMlr(getOMLTask(37))$mlr.task %>>% cpoDummyEncode(reference.cat = FALSE) +outer_loop = makeResampleInstance("CV", iters = 5, stratify = TRUE, task) +train_set = outer_loop$train.inds[[1]] +test_set = outer_loop$test.inds[[1]] + + +# The pipeline search space could also be customized by setting, e.g., +custom_operators = list(preprocess = c("cpoScale()", "NA"), + filter = c("cpoPca(center = FALSE, rank)", "cpoFilterAnova(perc)", "NA"), + classifier = c("classif.kknn", "classif.naiveBayes")) + + + + +best_model = reinbo(task, custom_operators = NULL, budget = 100, train_set = train_set) +# best_model = reinbo(task, custom_operators = custom_operators, budget = 100, train_set = train_set) +pred = lock_eval.reinbo.table(task, measure = list(mmce), train_set, test_set, best_model) +best_model$env$agent$q_tab +best_model$env$agent$act_names_per_state diff --git a/demo/func_smac.R b/demo/func_smac.R new file mode 100644 index 0000000..17fcbf7 --- /dev/null +++ b/demo/func_smac.R @@ -0,0 +1,117 @@ +run = function(cs, budget = 1000) { + hh = reticulate::import("python_smac_space") + #scenario = Scenario({"run_obj": "quality", # we optimize quality (alternatively runtime) + # "runcount-limit": budget, # maximum function evaluations + # "cs": cs, # configuration space + # "deterministic": "true" + # }) + budget = 100 + scenario = hh$Scenario(list("run_obj" = "quality", # we optimize quality (alternatively runtime) + "runcount-limit" = budget, # maximum function evaluations + "cs" = cs, # configuration space + "deterministic" = "true", + "shared_model" = TRUE # deletable + )) + + # scenario$abort_on_first_run_crash = F + + print("Optimizing! 
Depending on your machine, this might take a few minutes.") + np = reticulate::import("numpy") + #fd = hh$ExecuteTAFuncDict(toy_smac_obj) + #smac = hh$SMAC(scenario = scenario, rng = np$random$RandomState(as.integer(4)), tae_runner = toy_smac_obj) + #smac = hh$SMAC(scenario = scenario, rng = np$random$RandomState(as.integer(4)), tae_runner = fd) + reticulate::source_python('smac_obj.py') + source("smac_obj.R") + #smac = hh$SMAC(scenario = scenario, rng = np$random$RandomState(as.integer(4)), tae_runner = smac_obj_from_cfg) + py_fun = reticulate::r_to_py(toy_smac_obj, convert = FALSE) + #py_fun = reticulate::r_to_py(function(x) 1, convert = TRUE) + smac = hh$SMAC(scenario = scenario, rng = np$random$RandomState(as.integer(4)), tae_runner = py_fun) + smac$get_tae_runner() + incumbent = smac$optimize() # problem + #inc_value = svm_from_cfg(incumbent) + incumbent + #print("Optimized Value: %.2f" % (inc_value)) +} + + +test_run = function() { + cfg = reticulate::import("python_smac_space") + cs = cfg$cs + run(cs) +} + +# Predict function: evaluate best model on test dataset +lock_eval.smac = function(task, measure, train_set, test_set, best_model){ + cfg = best_model + lrn = gen_mlrCPOPipe_from_smac_cfg(cfg) + mod = train(lrn, task, subset = train_set) + pred = predict(mod, task, subset = test_set) + mpred = performance(pred, measures = measure) + return(mpred) +} + + +gen_mlrCPOPipe_from_smac_cfg = function(cfg) { + #cfg = cfg.sample_configuration() + # convert ConfigSpace.configuration_space.ConfigurationSpace to ConfigSpace.configuration_space.Configuration + # For deactivated parameters, the configuration stores None-values. so we remove them. + #cfg = list(Model = "xgboost", Preprocess = "cpoScale(center = FALSE)", FeatureFilter = "cpoPca(center = FALSE, rank = rank_val)", lrn_xgboost_max_depth = 3, lrn_xgboost_eta = 0.03, fe_pca_rank = 0.5) # for testing and debug + model = cfg$Model + preprocess = cfg$Preprocess + pfilter = cfg$FeatureFilter + perc_val = NULL + rank_val = NULL + + ## + extract_hyper_prefix = function(prefix = "lrn", cfg) { + names4lrn_hyp = grep(pattern = prefix, x = names(cfg), value = T) + ps.learner = cfg[names4lrn_hyp] # evaluted later by R function eval + pattern = paste0("(", prefix, "_[:alpha:]+_)*") + #ns4hyper = gsub(pattern = pattern, x = names4lrn_hyp, replacement="", ignore.case = T) + ns4hyper = stringr::str_replace(string = names4lrn_hyp, pattern = pattern, replacement="") + names(ps.learner) = ns4hyper + ps.learner + } + ## + ps.learner = extract_hyper_prefix("lrn", cfg) # hyper-parameters for learner must exist + + names4Fe = grep(pattern = "fe", x = names(cfg), value = T) + + p = mlr::getTaskNFeats(subTask) # this subTask relies on global variable + + if(length(names4Fe) > 0) { + ps.Fe = extract_hyper_prefix("fe", cfg) + if(grepl(pattern = "perc", x = names(ps.Fe))) { + name4featureEng_perc = grep(pattern = "perc", x = names(ps.Fe), value = T) + perc_val = ps.Fe[[name4featureEng_perc]] + } + if(grepl(pattern = "rank", x = names(ps.Fe))) { + name4featureEng_rank = grep(pattern = "rank", x = names(ps.Fe), value = T) + rank_val = ceiling(ps.Fe[[name4featureEng_rank]] * p) + } + } + + lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('classif.%s', par.vals = ps.learner)", + preprocess, pfilter, model) + lrn = gsub(pattern = "NA %>>%", x = lrn, replacement = "", fixed = TRUE) + + + # set mtry after reducing the number of dimensions + if (model == "ranger") { + p1 = p + if (!is.null(perc_val)) {p1 = max(1, round(p*perc_val))} + if (!is.null(rank_val)) {p1 = 
rank_val} + ps.learner$mtry = max(1, as.integer(p1*ps.learner$mtry)) + } + lrn = paste0("library(mlrCPO);library(magrittr);", lrn) + obj_lrn = eval(parse(text = lrn)) + return(obj_lrn) +} + +test_gen_mlrCPOPipe_from_smac_cfg = function() { + subTask = mlr::iris.task + cfg = reticulate::import("python_smac_space") + cfg = cfg$stub + lrn = gen_mlrCPOPipe_from_smac_cfg(cfg) + lrn +} diff --git a/demo/header.R b/demo/header.R new file mode 100644 index 0000000..7968bdc --- /dev/null +++ b/demo/header.R @@ -0,0 +1,24 @@ +options(mlr.show.info = FALSE) +library(batchtools) +tosources = c("bt_conf.R", "utility.R") + +depend_reinbo_table = c("algo_reinbo_table.R", "reinbo_table_func.R", "reinbo_table_env.R", "reinbo_table_utils.R", "reinbo_table_hyperpara_space.R") +tosources = c(tosources, depend_reinbo_table) + +depend_auto_sklearn = c("algo_auto_sklearn.R", "auto_sklearn_func.R") +tosources = c(tosources, depend_auto_sklearn) + +depend_tpot = c("algo_tpot.R", "tpot_func.R") +tosources = c(tosources, depend_tpot) + +depend_random_search = c("algo_random_search.R", "random_search_func.R", "random_search_space.R") +tosources = c(tosources, depend_random_search) + +depend_tpe = c("algo_tpe.R", "tpe_func.R") +tosources = c(tosources, depend_tpe) + +depend_irace = c("algo_irace.R", "irace_func.R") +tosources = c(tosources, depend_irace) + + +pkgs = c("reticulate", "mlr", "mlrCPO", "OpenML", "parallelMap", "phng", "rlR", "hash", "mlrMBO", "R6", "foreach", "rlist", "magrittr", "irace") diff --git a/demo/install_depend.R b/demo/install_depend.R new file mode 100644 index 0000000..3babaa5 --- /dev/null +++ b/demo/install_depend.R @@ -0,0 +1,6 @@ +devtools::install_github("smilesun/parabox", ref = "tree") +library(mlr) +library(OpenML) +library(rlR) +library(hash) + diff --git a/demo/irace_func.R b/demo/irace_func.R new file mode 100644 index 0000000..256f69e --- /dev/null +++ b/demo/irace_func.R @@ -0,0 +1,74 @@ +# Irace algorithm: +opt.irace = function(task, budget, measure, train_set = NULL) { + measure <<- measure + subTask <<- task + if (!is.null(train_set)) subTask <<- subsetTask(task, train_set) + irace::irace( + scenario = list( + targetRunner = target.runner, + instances = lapply(1:(getGconf()$NCVInnerIter*budget), function(x) + makeResampleInstance(makeResampleDesc("Holdout", split = 1 - 1/getGconf()$NCVInnerIter, stratify = TRUE), subTask)), + maxExperiments = getGconf()$NCVInnerIter*budget + ), + parameters = readParameters("irace_space.txt", digits = 5, debugLevel = 0, text) + ) + load("./irace.Rdata") + mmodel = getFinalElites(iraceResults = iraceResults, n = 1) + return(mmodel) +} + + +# Target runner of Irace: +target.runner = function(experiment, config = list()) { + rin = experiment$instance ## holdout instance + lrn = genLearner.irace(subTask, experiment$configuration) + res = mlr::resample(lrn, subTask, resampling = rin, measures = measure, show.info = FALSE) + return(list(cost = res$aggr)) +} + + +# Predict function: evaluate best model on test dataset +lock_eval.irace = function(task, measure, train_set, test_set, best_model){ + ps = best_model + ps$.ID. = NULL + ps$.PARENT. 
= NULL + lrn = genLearner.irace(task, ps) + mod = train(lrn, task, subset = train_set) + pred = predict(mod, task, subset = test_set) + perf = performance(pred, measures = measure) + return(perf) +} + +# Generate mlr learner for configuration: +genLearner.irace = function(task, configuration){ + ps = configuration ## hypar-parameters + ps$sigma = 2^(ps$sigma) + ps$C = 2^(ps$C) + lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('classif.%s', par.vals = ps.learner)", + paste0(ps$Preprocess, "()"), paste0(ps$Filter, "()"), ps$Classify) + lrn = gsub(pattern = "NA() %>>%", x = lrn, replacement = "", fixed = TRUE) + # Preprocess: + lrn = gsub(pattern = ".scale()", x = lrn, replacement = "(scale = FALSE)", fixed = TRUE) + lrn = gsub(pattern = ".center()", x = lrn, replacement = "(center = FALSE)", fixed = TRUE) + # Filter: + lrn = gsub(pattern = ".perc()", x = lrn, replacement = "(perc = ps$perc)", fixed = TRUE) + p = getTaskNFeats(task) + lrn = gsub(pattern = ".rank()", x = lrn, replacement = "(center = FALSE, rank = as.integer(max(1, round(p*ps$rank))))", fixed = TRUE) + ## delete parameters irrelevant to classifier + ps.learner = as.list(ps) + ps.learner$Preprocess = NULL + ps.learner$Filter = NULL + ps.learner$Classify = NULL + ps.learner$perc = NULL + ps.learner$rank = NULL + ps.learner[is.na(ps.learner)] = NULL + if (ps$Classify == "ranger") { + p1 = p + if (!is.na(ps$perc)) {p1 = max(1, round(p*ps$perc))} + if (!is.na(ps$rank)) {p1 = max(1, round(p*ps$rank))} + ps.learner$mtry = max(1, as.integer(p1*ps$mtry)) + } + lrn = eval(parse(text = lrn)) + return(lrn) +} + diff --git a/demo/irace_space.txt b/demo/irace_space.txt new file mode 100644 index 0000000..44c8d2a --- /dev/null +++ b/demo/irace_space.txt @@ -0,0 +1,45 @@ +## Template for parameter description file for Iterated Race. +## +## The format is one parameter per line. Each line contains: +## +## 1: Name of the parameter. An unquoted alphanumeric string, +## example: ants + +## 2: Switch to pass the parameter. A quoted (possibly empty) string, +## if the value and the switch must be separated, add a space at +## the end of the string. Example : "--version1 --ants " + +## 3: Type. An unquoted single letter, among +## i: Integer, c: categorical, o: ordinal, r: real. + +## 4: For c and o: All possible values, that is, a variable number of +## quoted or unquoted strings separated by commas within +## parenthesis. Empty strings and strings containing commas or +## spaces must be quoted. +## For i,r: a pair of unquoted numbers representing minimum and +## maximum values. + +## 5: A conditional parameter can be defined according to the values of +## one or several other parameters. This is done by adding a +## character '|' followed by an R expression involving the names of +## other parameters. This expression must return TRUE if the +## condition is satisfied, FALSE otherwise. 
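+## For example, in the space below `perc` is only sampled when the chosen
+## Filter is one of the percentage-based filters, mirroring the `requires`
+## clauses of the mlrMBO and random-search spaces.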
+ +# 1: 2: 3: 4: 5: +Preprocess "--Preprocess" c ("cpoScale", "cpoScale.scale", "cpoScale.center", "cpoSpatialSign", "NA") +Filter "--Filter" c ("cpoFilterAnova.perc", "cpoFilterKruskal.perc", "cpoFilterUnivariate.perc", "cpoPca.rank", "NA") +Classify "--Classify" c ("kknn", "ksvm", "xgboost", "ranger", "naiveBayes") +perc "--perc" r (0.1,1) | Filter %in% c("cpoFilterAnova.perc", "cpoFilterKruskal.perc", "cpoFilterUnivariate.perc") +rank "--rank" r (0.1,1) | Filter == "cpoPca.rank" +k "--k" i (1,20) | Classify == "kknn" +C "--C" r (-15,15) | Classify == "ksvm" +sigma "--sigma" r (-15,15) | Classify == "ksvm" +mtry "--mtry" r (0.1,0.66666) | Classify == "ranger" +sample.fraction "--sample.fraction" r (0.1,1) | Classify == "ranger" +eta "--eta" r (0.001,0.3) | Classify == "xgboost" +max_depth "--max_depth" i (1,15) | Classify == "xgboost" +subsample "--subsample" r (0.5,1) | Classify == "xgboost" +colsample_bytree "--colsample_bytree" r (0.5,1) | Classify == "xgboost" +min_child_weight "--min_child_weight" r (0,50) | Classify == "xgboost" +laplace "--laplace" r (0.01,100) | Classify == "naiveBayes" + diff --git a/demo/lrz.batchtools.conf.R b/demo/lrz.batchtools.conf.R new file mode 100644 index 0000000..966f2fe --- /dev/null +++ b/demo/lrz.batchtools.conf.R @@ -0,0 +1,4 @@ +source("/home/hpc/pr74ze/ri89coc2/lrz_configs/config_files/batchtools/clusterFunctionsSlurmLrz.R") +cluster.functions = makeClusterFunctionsSlurmLrz("/home/hpc/pr74ze/ri89coc2/lrz_configs/config_files/batchtools/slurm_lmulrz.tmpl", array.jobs = FALSE) +default.resources = list(walltime = 3600L * 12, memory = 1024 * 3L, ntasks = 1L, ncpus = 1L, nodes = 1L, clusters = "serial") +max.concurrent.jobs = 999L diff --git a/demo/main.R b/demo/main.R new file mode 100644 index 0000000..b044850 --- /dev/null +++ b/demo/main.R @@ -0,0 +1,59 @@ +rm(list = ls()) +source("system.R") +source("header.R") +Reg_name = "reg" +datestr = stringi::stri_replace_all(Sys.Date(), regex = "-", replacement="_") ## use date as registry name +strhour = stringi::stri_replace_all(format(Sys.time(), "%H-%M"), regex = "-", replacement = "_") +#unlink("reg_table_1", recursive = TRUE) dangerous !! +reg_dir = paste0(Reg_name, datestr, "__", strhour) +reg = makeExperimentRegistry(file.dir = reg_dir, conf.file = mconf.file, + packages = pkgs, + source = tosources) +if (flag_local) reg$cluster.functions = makeClusterFunctionsMulticore(ncpus = 60L) # run on my own workstation +source("problem.R") +# opt = function(task, budget, measure, train_set, ...) { +# UseMethod("opt", task, budget, measure, train_set) +# } +# +# lock_eval = function(task, measure, train_set, test_set, best_model, ...) 
{ +# UseMethod("lock_eval", task, measure, train_set, test_set, best_model) +# } +# + +algo.designs = list() + +algoname = "reinbo_table" +addAlgorithm(name = algoname, fun = algo_fun_reinbo_table) +algo.designs[[algoname]] = data.frame() + +algoname = "auto_sklearn" +addAlgorithm(name = algoname, fun = algo_fun_auto_sklearn) +algo.designs[[algoname]] = data.frame(flag_light = T) + +algoname = "tpot" +addAlgorithm(name = algoname, fun = algo_fun_tpot) +algo.designs[[algoname]] = data.frame() + + +algoname = "random_search" +addAlgorithm(name = algoname, fun = algo_fun_random_search) +algo.designs[[algoname]] = data.frame() + +algoname = "tpe" +addAlgorithm(name = algoname, fun = algo_fun_tpe) +algo.designs[[algoname]] = data.frame() + +algoname = "irace" +addAlgorithm(name = algoname, fun = algo_fun_irace) +algo.designs[[algoname]] = data.frame() + + +source("algo_reinbo_table.R") +source("algo_auto_sklearn.R") +source("algo_tpot.R") +source("algo_random_search.R") +source("algo_tpe.R") +source("algo_irace.R") + +addExperiments(algo.designs = algo.designs, repls = getGconf()$repl) +summarizeExperiments() diff --git a/demo/mlrmbo_func.R b/demo/mlrmbo_func.R new file mode 100644 index 0000000..824268d --- /dev/null +++ b/demo/mlrmbo_func.R @@ -0,0 +1,102 @@ +# mlrMBO algorithm: +opt.mlrmbo = function(task, budget, measure, train_set = NULL) { + subTask = task + if (!is.null(train_set)) subTask = subsetTask(task, train_set) + inner_loop = makeResampleInstance("CV", iters = getGconf()$NCVInnerIter, stratify = TRUE, subTask) + run = mlrMBO_func(subTask, instance = inner_loop, measure, budget) + mmodel = run$x + return(mmodel) +} + + +# Predict function: evaluate best model on test dataset +lock_eval.mlrmbo = function(task, measure, train_set, test_set, best_model){ + best_model$sigma = 2^(best_model$sigma) + best_model$C = 2^(best_model$C) + lrn = genLearner.mbo(task, best_model) + mod = train(lrn, task, subset = train_set) + pred = predict(mod, task, subset = test_set) + perf = performance(pred, measures = measure) + return(perf) +} + + +# hyper-parameter space +par.set = makeParamSet( + makeDiscreteParam('Pre', values = c("cpoScale()", "cpoScale(scale = FALSE)", "cpoScale(center = FALSE)", "cpoSpatialSign()", "no_operator")), + makeDiscreteParam('Filter', values = c("cpoFilterAnova(perc)", "cpoFilterKruskal(perc)", "cpoPca(center = FALSE, rank)", "cpoFilterUnivariate(perc)", "no_operator")), + makeNumericParam('perc', lower = .1, upper = 1, requires = quote(Filter %in% c("cpoFilterAnova(perc)", "cpoFilterKruskal(perc)", "cpoFilterUnivariate(perc)"))), + makeNumericParam('rank', lower = .1, upper = 1, requires = quote(Filter == "cpoPca(center = FALSE, rank)")), + makeDiscreteParam('Learner', values = c("kknn", "ksvm", "xgboost", "ranger", "naiveBayes")), + makeIntegerParam('k', lower = 1L, upper = 20L, requires = quote(Learner == "kknn")), + makeNumericParam("C", lower = -15, upper = 15, trafo = function(x) 2^x, requires = quote(Learner == 'ksvm')), + makeNumericParam("sigma", lower = -15, upper = 15, trafo = function(x) 2^x, requires = quote(Learner == 'ksvm')), + makeNumericParam("mtry", lower = 1/10, upper = 1/1.5, requires = quote(Learner == 'ranger')), + makeNumericParam("sample.fraction", lower = .1, upper = 1, requires = quote(Learner == 'ranger')), + makeNumericParam("eta", lower = .001, upper = .3, requires = quote(Learner == 'xgboost')), + makeIntegerParam("max_depth", lower = 1L, upper = 15L, requires = quote(Learner == 'xgboost')), + makeNumericParam("subsample", lower = .5, upper 
= 1, requires = quote(Learner == 'xgboost')), + makeNumericParam("colsample_bytree", lower = .5, upper = 1, requires = quote(Learner == 'xgboost')), + makeNumericParam("min_child_weight", lower = 0, upper = 50, requires = quote(Learner == 'xgboost')), + makeNumericParam("laplace", lower = .01, upper = 100, requires = quote(Learner == 'naiveBayes')) +) + + +# generate learner for task and specific parameter set +genLearner.mbo <- function(task, param_set){ + p = getTaskNFeats(task) + lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('classif.%s', par.vals = ps.learner)", + param_set$Pre, param_set$Filter, param_set$Learner) + lrn = gsub(pattern = "perc", x = lrn, replacement = "perc = param_set$perc", fixed = TRUE) + lrn = gsub(pattern = "rank", x = lrn, replacement = "rank = as.integer(max(1, round(p*param_set$rank)))", fixed = TRUE) + lrn = gsub(pattern = "no_operator %>>%", x = lrn, replacement = "", fixed = TRUE) + ps.learner = param_set + ps.learner$perc = NULL + ps.learner$rank = NULL + ps.learner$Pre = NULL + ps.learner$Filter = NULL + ps.learner$Learner = NULL + ps.learner[is.na(ps.learner)] = NULL + if (param_set$Learner == "ranger") { + p1 = p + if (!is.na(param_set$perc)) {p1 = max(1, round(p*param_set$perc))} + if (!is.na(param_set$rank)) {p1 = max(1, round(p*param_set$rank))} + ps.learner$mtry = max(1, as.integer(p1*param_set$mtry)) + } + lrn = eval(parse(text = lrn)) + return(lrn) +} + + +# using mlrMBO to optimize pipeline +mlrMBO_func <- function(task, instance, measure, budget){ + objfun = makeSingleObjectiveFunction( + fn = function(x) { + lrn = genLearner.mbo(task, x) + perf = resample(lrn, task, resampling = instance, measures = measure, show.info = FALSE)$aggr + return(perf) + }, + par.set = par.set, + has.simple.signature = FALSE, + minimize = TRUE + ) + ctrl = setMBOControlTermination(makeMBOControl(), iters = budget-4*length(par.set$pars)) + run = mbo(objfun, control = ctrl, show.info = FALSE) + return(run) +} + + + + +# Test: +# measure = list(mmce) +# task = sonar.task +# inner_loop = makeResampleInstance("CV", iters = 3, stratify = TRUE, task) +# outer_loop_rins = makeResampleInstance("CV", iters = 5, stratify = TRUE, task) +# opt_set = outer_loop_rins$train.inds[[1]] +# lock_set = outer_loop_rins$test.inds[[1]] +# mmodel = opt.mlrmbo(task, 66, measure) +# perf = lock_eval.mlrmbo(task, measure, opt_set, lock_set, mmodel) + + + diff --git a/demo/obsolete.batchtools.conf.R b/demo/obsolete.batchtools.conf.R new file mode 100644 index 0000000..8abc74a --- /dev/null +++ b/demo/obsolete.batchtools.conf.R @@ -0,0 +1,5 @@ +sys = Sys.info() +if (as.list(sys)$user != "sunxd") { + source("lrz.batchtools.conf.R") +} + diff --git a/demo/plot2test.R b/demo/plot2test.R new file mode 100644 index 0000000..06bfe3e --- /dev/null +++ b/demo/plot2test.R @@ -0,0 +1,121 @@ +# Plot and test + +library(data.table) +library(ggplot2) +library(tidyr) +library(reshape2) +library(xtable) +library(knitr) + + +## preparation +dt_new = readRDS("reinbo_new_cut_episode.rds") +dt_new$algorithm = "reinbo" + +#dt_reinbo_old = read.csv("../Experiment_results/ML-ReinBo.csv") +#dt_reinbo_old$algo = "reinbo2" +dt_reinbo_old = NULL +temp = read.csv("../Experiment_results/Auto-sklearn.csv") +temp$algorithm = "Autosklearn" +dt_reinbo_old = rbind(dt_reinbo_old, temp) +# +#temp = read.csv("../Experiment_results/Auto-sklearn_light.csv") +#temp$algo = "Autosklearn-light" +#dt_reinbo_old = rbind(dt_reinbo_old, temp) + +temp = read.csv("../Experiment_results/TPE.csv") +temp$algorithm = "TPE" +dt_reinbo_old = 
rbind(dt_reinbo_old, temp)
+
+#
+temp = read.csv("../Experiment_results/TPOT.csv")
+temp$algorithm = "Tpot"
+dt_reinbo_old = rbind(dt_reinbo_old, temp)
+#
+temp = read.csv("../Experiment_results/TPOT_light.csv")
+temp$algorithm = "Tpot-light"
+dt_reinbo_old = rbind(dt_reinbo_old, temp)
+#
+temp = read.csv("../Experiment_results/Random_search.csv")
+temp$algorithm = "RandomSearch"
+dt_reinbo_old = rbind(dt_reinbo_old, temp)
+
+
+
+dt_reinbo_old$prob.name = dt_reinbo_old$name
+dt_res = rbind(dt_reinbo_old[, c("prob.name", "mmce", "algorithm")], dt_new[, c("prob.name", "mmce", "algorithm")])
+
+# check that all jobs finished
+dt_res[, .N, by = "prob.name"]
+
+## table
+dt_light = dt_res[, .(mmce = mean(mmce)), by = .(prob.name, algorithm)]
+dt_light
+
+dt_table = spread(dt_light, key = "algorithm", value = "mmce")
+dt_table
+cns = colnames(dt_table)
+cns[1] = "dataset name"
+colnames(dt_table) = cns
+xtable(dt_table)
+knitr::kable(dt_table)
+ltxtable = xtable(dt_table, align = rep("l", ncol(dt_table) + 1), digits = 4)
+print(ltxtable, floating = TRUE, hline.after = NULL, include.rownames = TRUE, include.colnames = TRUE)  # tested
+
+# example of adding significance asterisks
+# pval <- rev(sort(c(outer(1:6, 10^-(1:3)))))
+# symp <- symnum(pval, corr = FALSE,
+#                cutpoints = c(0, .001, .01, .05, .1, 1),
+#                symbols = c("***", "**", "*", ".", " "))
+# noquote(cbind(P.val = format(pval), Signif = symp))
+
+## plot
+size = 10  # font size
+gp = ggplot() + geom_boxplot(data = as.data.frame(dt_res), aes(x = algorithm, y = mmce, fill = algorithm)) + theme_bw() + theme(axis.text.x = element_text(angle = 90, size = size), axis.text.y = element_text(size = size), axis.title = element_text(size = size), strip.text = element_text(size = size), legend.text = element_text(size = size), legend.position = "bottom") + facet_wrap("prob.name", scales = "free_y", ncol = 3)
+#ggsave(gp, file = "prob_algo_repli_compare.pdf", width = 3, height = 3, units = "in", scale = 5, device = pdf)
+ggsave(gp, file = "prob_algo_repli_compare.pdf", device = "pdf", scale = 0.9)
+
+## test
+fun_best_against_other = function(temp, candidate = NULL) {
+  light = temp[, .(mmce = mean(mmce)), by = "algorithm"]  # take the mean value
+  ind = light[, .(which.min(mmce))]
+  prob_name = unique(temp$prob.name)
+  ref = light$algorithm[as.vector(as.matrix(ind))]
+  #cat(sprintf(" \nprob *** %s*** of best algorithm name ***%s***\n", prob_name, ref))
+  if (!is.null(candidate)) {
+    checkmate::assert_character(candidate)
+    ref = candidate
+  }
+  moptions = unique(temp$algorithm)
+  #moptions = setdiff(moptions, ref)
+  x = temp[algorithm == ref]$mmce
+  res = lapply(moptions, function(name) {
+    y = temp[algorithm == name]$mmce
+    if (length(x) != length(y)) return(100)
+    worse_than_best = (wilcox.test(y, x, alternative = "greater", exact = FALSE)$p.value < 0.05)
+    better_than_best = (wilcox.test(x, y, alternative = "greater", exact = FALSE)$p.value < 0.05)
+    val = temp[algorithm == name, mean(mmce)]
+    strval = as.character(sprintf("%.4f", val))
+    if (is.null(candidate)) {
+      if (worse_than_best) return(strval)  # -1 lose
+      if ((!better_than_best) & (name == ref)) return(paste0("\\underline{\\textbf{", strval, "}}"))  # 0 tie
+      else return(paste0("\\textbf{", strval, "}"))  # win!
+    }
+    # candidate is not NULL
+    # win/lose encoding:
+    if (worse_than_best) return(-1)  # -1 lose
+    if (!better_than_best) return(0)  # 0 tie
+    return(1)  # win!
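+    # summary of the encoding when a candidate is supplied: -1 = significantly
+    # worse than the reference, 0 = no significant difference, 1 = significantly
+    # better (one-sided Wilcoxon rank-sum tests at the 0.05 level)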
+ }) + names(res) = moptions + res$best_algo = ref + as.data.table(res) +} + +dt_winner = dt_res[, fun_best_against_other(.SD), by = .(prob.name), .SDcols = colnames(dt_res)] +res = knitr::kable(dt_winner, format = "latex", digits = 4, escape = F) +capture.output(print(res), file = "latex.txt") + +algos = as.character(unique(dt_res$algorithm)) +name = "reinbo" # suppose if reinbo is the best +dt_res[, fun_best_against_other(.SD, name), by = .(prob.name), .SDcols = colnames(dt_res)] diff --git a/demo/post_analysis.R b/demo/post_analysis.R new file mode 100644 index 0000000..2fde64d --- /dev/null +++ b/demo/post_analysis.R @@ -0,0 +1,7 @@ +result_list = reduceResultsList(ids = findDone(), fun = function(job, res) { + model = res$mmodel$Model + operators = strsplit(as.character(model), "\t")[[1]] + data.frame(pre = operators[1], filter = operators[2], classifier = operators[3]) +}) +result = rbindlist(result_list) +Freq = c(as.list(table(result$pre)), as.list(table(result$filter)), as.list(table(result$classifier))) diff --git a/demo/problem.R b/demo/problem.R new file mode 100644 index 0000000..6c1beb1 --- /dev/null +++ b/demo/problem.R @@ -0,0 +1,12 @@ +tasks = lapply(task_ids, getOMLTask) +tasks = lapply(tasks, convertOMLTaskToMlr) + +prob_fun = function(data, job) { + mlr_task_full = data %>>% cpoDummyEncode(reference.cat = FALSE) + outer_iters = getGconf()$NCVOuterIter + outer_loop_rins = makeResampleInstance("CV", iters = outer_iters, stratify = TRUE, mlr_task_full) + list(rins = outer_loop_rins, mlr_task_full = mlr_task_full) +} + +for (task in tasks) + addProblem(name = getTaskId(task$mlr.task), data = task$mlr.task, fun = prob_fun, seed = getGconf()$prob_seed) diff --git a/demo/python_smac_space.py b/demo/python_smac_space.py new file mode 100644 index 0000000..a92522f --- /dev/null +++ b/demo/python_smac_space.py @@ -0,0 +1,73 @@ +# Import ConfigSpace and different types of parameters +from smac.configspace import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter, \ + UniformFloatHyperparameter, UniformIntegerHyperparameter +from ConfigSpace.conditions import InCondition + +# Import SMAC-utilities +from smac.tae.execute_func import ExecuteTAFuncDict +from smac.scenario.scenario import Scenario +from smac.facade.smac_facade import SMAC + +cs = ConfigurationSpace() + +# We define a few possible types of SVM-kernels and add them as "kernel" to our cs +step1 = CategoricalHyperparameter("Preprocess", ['cpoScale()', 'cpoScale(scale = FALSE)', 'cpoScale(center = FALSE)', 'cpoSpatialSign()', 'NA'], default_value="NA") +cs.add_hyperparameter(step1) + +step2 = CategoricalHyperparameter("FeatureFilter", ['cpoFilterAnova(perc=perc_val)', 'cpoFilterKruskal(perc=perc_val)', 'cpoFilterUnivariate(perc=perc_val)', 'cpoPca(center = FALSE, rank = rank_val)', 'NA'], default_value = "NA") +cs.add_hyperparameter(step2) +anova_perc = UniformFloatHyperparameter("fe_anova_perc", 0.1, 1, default_value = 0.1) +kruskal_perc = UniformFloatHyperparameter("fe_kruskal_perc", 0.1, 1, default_value = 0.1) +univar_perc = UniformFloatHyperparameter("fe_univar_perc", 0.1, 1, default_value = 0.1) +pca_perc = UniformFloatHyperparameter("fe_pca_rank", 0, 0.9, default_value = 0.1) +cs.add_hyperparameters([anova_perc, kruskal_perc, univar_perc, pca_perc]) + +step2_child_anova = InCondition(child=anova_perc, parent=step2, values=["cpoFilterAnova(perc=perc_val)"]) +step2_child_kruskal = InCondition(child=kruskal_perc, parent=step2, values=["cpoFilterKruskal(perc=perc_val)"]) 
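+# An InCondition makes the child hyperparameter active only while the parent
+# takes one of the listed values, which keeps SMAC from sampling, e.g., a PCA
+# rank for configurations that do not use cpoPca.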
+step2_child_univar = InCondition(child=univar_perc, parent=step2, values=["cpoFilterUnivariate(perc=perc_val)"]) +step2_child_pca = InCondition(child=pca_perc, parent=step2, values=["cpoPca(center = FALSE, rank = rank_val)"]) +cs.add_conditions([step2_child_anova, step2_child_kruskal, step2_child_univar, step2_child_pca]) + +step3 = CategoricalHyperparameter("Model", ['kknn', 'ksvm', 'ranger', 'xgboost', 'naiveBayes']) +cs.add_hyperparameter(step3) + +hyper_kknn = UniformIntegerHyperparameter("lrn_kknn_k", 1, 19, default_value = 1) +hyper_ksvm_C = UniformFloatHyperparameter("lrn_svm_C", 2**(-15), 2**(15), default_value = 1) + +hyper_ksvm_sigma = UniformFloatHyperparameter("lrn_svm_sigma", 2**(-15), 2**(15), default_value = 1) + +hyper_ranger_mtry = UniformFloatHyperparameter("lrn_ranger_mtry", 0.1, 0.66666, default_value = 0.1) +hyper_ranger_sample_fraction = UniformFloatHyperparameter("lrn_ranger_sample.fraction", 0.1, 1, default_value = 0.1) +hyper_xgboost_eta = UniformFloatHyperparameter('lrn_xgboost_eta', 0.001, 0.3, default_value = 0.1) +hyper_xgboost_max_depth = UniformIntegerHyperparameter('lrn_xgboost_max_depth', 1, 14, default_value = 5) +hyper_xgboost_subsample = UniformFloatHyperparameter('lrn_xgboost_subsample', 0.5, 1, default_value = 0.5) +hyper_xgboost_colsample_bytree = UniformFloatHyperparameter('lrn_xgboost_colsample_bytree', 0.5, 1, default_value = 0.5) +hyper_xgboost_min_child_weight = UniformFloatHyperparameter('lrn_xgboost_min_child_weight', 0, 50, default_value = 0.5) +hyper_naiveBayes = UniformFloatHyperparameter('lrn_naiveBayes_laplace', 0.01, 100, default_value = 0.01) + +cs.add_hyperparameters([hyper_kknn, hyper_ksvm_C, hyper_ksvm_sigma, hyper_ranger_mtry, hyper_ranger_sample_fraction, hyper_xgboost_eta, hyper_xgboost_max_depth, hyper_xgboost_subsample, hyper_xgboost_colsample_bytree, hyper_xgboost_min_child_weight, hyper_naiveBayes]) + +step3_child_kknn = InCondition(child = hyper_kknn, parent = step3, values = ["kknn"]) +#cs.add_conditions([step3_child_kknn]) +step3_child_ksvm_c = InCondition(child = hyper_ksvm_C, parent = step3, values = ["ksvm"]) +#cs.add_conditions([step3_child_ksvm_c]) +step3_child_ksvm_sigma = InCondition(child = hyper_ksvm_sigma, parent = step3, values = ["ksvm"]) +#cs.add_conditions([step3_child_ksvm_sigma]) +## +step3_child_ranger_mtry = InCondition(child = hyper_ranger_mtry, parent = step3, values = ["ranger"]) +step3_child_ranger_sample_fraction = InCondition(child = hyper_ranger_sample_fraction, parent = step3, values = ["ranger"]) +## + +step3_child__xgboost_eta = InCondition(child = hyper_xgboost_eta, parent = step3, values = ["xgboost"]) +step3_child__xgboost_max_depth = InCondition(child = hyper_xgboost_max_depth, parent = step3, values = ["xgboost"]) +step3_child__xgboost_subsample = InCondition(child = hyper_xgboost_subsample, parent = step3, values = ["xgboost"]) +step3_child__xgboost_colsample_bytree = InCondition(child = hyper_xgboost_colsample_bytree, parent = step3, values = ["xgboost"]) +step3_child__xgboost_min_child_weight = InCondition(child = hyper_xgboost_min_child_weight, parent = step3, values = ["xgboost"]) +## +step3_child__naiveBayes_laplace = InCondition(child = hyper_naiveBayes, parent = step3, values = ["naiveBayes"]) + +cs.add_conditions([step3_child_kknn, step3_child_ksvm_c, step3_child_ksvm_sigma, step3_child_ranger_mtry, step3_child_ranger_sample_fraction, step3_child__xgboost_eta, step3_child__xgboost_subsample, step3_child__xgboost_max_depth, step3_child__xgboost_colsample_bytree, 
step3_child__xgboost_min_child_weight, step3_child__naiveBayes_laplace]) + +cfg = cs.sample_configuration() +stub = {k : cfg[k] for k in cfg if cfg[k]} diff --git a/demo/random_search_func.R b/demo/random_search_func.R new file mode 100644 index 0000000..6b81553 --- /dev/null +++ b/demo/random_search_func.R @@ -0,0 +1,54 @@ +# Random Search algorithm: +opt.random.search = function(task, budget, measure, train_set = NULL) { + subTask = task + if (!is.null(train_set)) subTask = subsetTask(task, train_set) + inner_loop = makeResampleInstance("CV", iters = getGconf()$NCVInnerIter, stratify = TRUE, subTask) + ps <<- step1$sample(budget) + for (i in 1:budget) { + perf = mlr_fun(subTask, ps[i, ], measure, cv_instance = inner_loop) + ps[i, "perf"] = perf + } + mmodel = ps[ps$perf == min(ps$perf),][1,] + mmodel$perf = NULL + return(mmodel) +} + + +# Mlr function: evaluate sampled model +mlr_fun = function(task, model, measure, cv_instance) { + lrn = genLearner(task, model, measure) + perf = resample(lrn, task, resampling = cv_instance, measures = measure, show.info = FALSE)$aggr + return(perf) +} + + +# Predict function: evaluate best model on test dataset +lock_eval.random.search = function(task, measure, train_set, test_set, best_model){ + lrn = genLearner(task, best_model, measure) + mod = train(lrn, task, subset = train_set) + pred = predict(mod, task, subset = test_set) + perf = performance(pred, measures = measure) + return(perf) +} + +# Generate mlr learner for configuration: +genLearner = function(task, model, measure){ + p = getTaskNFeats(task) + lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('classif.%s', par.vals = ps.learner)", + model$Preprocess[1], model$Filter[1], model$Classify[1]) + lrn = gsub(pattern = "perc", x = lrn, replacement = "perc = model$perc", fixed = TRUE) + lrn = gsub(pattern = "rank", x = lrn, replacement = "rank = as.integer(max(1, round(p*model$rank)))", fixed = TRUE) + lrn = gsub(pattern = "NA %>>%", x = lrn, replacement = "", fixed = TRUE) + ps.learner = model[, -(1:5)] ## delete parameters irrelevant to classifier + ps.learner = as.list(ps.learner) + ps.learner[is.na(ps.learner)] = NULL + if (model$Classify[1] == "ranger") { + p1 = p + if (!is.na(model$perc)) {p1 = max(1, round(p*model$perc))} + if (!is.na(model$rank)) {p1 = max(1, round(p*model$rank))} + ps.learner$mtry = max(1, as.integer(p1*model$mtry)) + } + lrn = eval(parse(text = lrn)) + return(lrn) +} + diff --git a/demo/random_search_space.R b/demo/random_search_space.R new file mode 100644 index 0000000..d3631e5 --- /dev/null +++ b/demo/random_search_space.R @@ -0,0 +1,65 @@ +# Parameter set tree: +step1 = ParamSetTree$new("pre", + ParamCategorical$new(id = "Preprocess", + values = c("cpoScale()", "cpoScale(scale = FALSE)", "cpoScale(center = FALSE)", "cpoSpatialSign()", "NA"))) + +step2 = ParamSetTree$new("filter", + ParamCategorical$new(id = "Filter", + values = c("cpoFilterAnova(perc)", "cpoFilterKruskal(perc)", "cpoPca(center = FALSE, rank)", "cpoFilterUnivariate(perc)", "NA")), + addDep(ParamReal$new(id = "perc", lower = .1, upper = 1), + did = "Filter", expr = quote(Filter %in% c("cpoFilterAnova(perc)", "cpoFilterKruskal(perc)", "cpoFilterUnivariate(perc)"))), + addDep(ParamReal$new(id = "rank", lower = .1, upper = 1), + did = "Filter", expr = quote(Filter == "cpoPca(center = FALSE, rank)"))) + +step3 = ParamSetTree$new("class", + ParamCategorical$new(id = "Classify", + values = c("kknn", "ksvm", "xgboost", "ranger", "naiveBayes")), + + addDep(ParamInt$new(id = "k", lower = 1L, upper = 20L), + did 
= "Classify", expr = quote(Classify == "kknn")), + + addDep(ParamReal$new(id = "C", lower = 2^(-15), upper = 2^(15)), + did = "Classify", expr = quote(Classify == "ksvm")), + + addDep(ParamReal$new(id = "sigma", lower = 2^(-15), upper = 2^(15)), + did = "Classify", expr = quote(Classify == "ksvm")), + + addDep(ParamReal$new(id = "mtry", lower = 1/10, upper = 1/1.5), + did = "Classify", expr = quote(Classify == "ranger")), + + addDep(ParamReal$new(id = "sample.fraction", lower = .1, upper = 1), + did = "Classify", expr = quote(Classify == "ranger")), + + addDep(ParamReal$new(id = "eta", lower = .001, upper = .3), + did = "Classify", expr = quote(Classify == "xgboost")), + + addDep(ParamInt$new(id = "max_depth", lower = 1L, upper = 15L), + did = "Classify", expr = quote(Classify == "xgboost")), + + addDep(ParamReal$new(id = "subsample", lower = .5, upper = 1), + did = "Classify", expr = quote(Classify == "xgboost")), + + addDep(ParamReal$new(id = "colsample_bytree", lower = .5, upper = 1), + did = "Classify", expr = quote(Classify == "xgboost")), + + addDep(ParamReal$new(id = "min_child_weight", lower = 0, upper = 50), + did = "Classify", expr = quote(Classify == "xgboost")), + + addDep(ParamReal$new(id = "laplace", lower = 0.01, upper = 100), + did = "Classify", expr = quote(Classify == "naiveBayes")) +) + +step2$setChild(step3) +step1$setChild(step2) + + + + + + + + + + + + diff --git a/demo/reinbo_table_conf.R b/demo/reinbo_table_conf.R new file mode 100644 index 0000000..2577c9a --- /dev/null +++ b/demo/reinbo_table_conf.R @@ -0,0 +1,33 @@ +source("reinbo_utils.R") +# A complete pipeline/model consists of 3 stages: +# Preprocessing --> Feature filtering --> Classification + +# Defult pipeline pool: +# Pre-processors: "Scale()", "cpoScale(scale = FALSE)", "cpoScale(center = FALSE)", "cpoSpatialSign()" +# Feature filters: "cpoFilterAnova()", "cpoFilterKruskal()", "cpoPca()", "cpoFilterUnivariate()" +# Classifiers: "classif.ksvm", "classif.ranger", "classif.kknn", "classif.xgboost", "classif.naiveBayes" + +# "NA" indicates that no operator would be carried out at this stage. 
+
+
+
+# Use the default pipeline pool:
+custom_operators = list()
+# The pipeline search space could also be customized by setting, e.g.,
+custom_operators = list(preprocess = c("cpoScale()", "NA"),
+                        filter = c("cpoPca(center = FALSE, rank)", "cpoFilterAnova(perc)", "NA"),
+                        classifier = c("classif.kknn", "classif.naiveBayes"))
+
+
+
+
+## Parameters for the RL environment:
+g_operators = g_getOperatorList(custom_operators)
+g_max_depth = length(g_operators)  # stages: Scaling --> Feature filtering --> Classification
+g_act_cnt = max(sapply(g_operators, length))  # max number of available operators at each stage
+g_state_names = g_genStateList(g_operators)
+g_state_dim = length(g_state_names)
+
+## Parameters for BO_PROBE:
+g_init_design = 4  # initial design size for MBO: g_init_design*sum(getParamLengths(par.set))
+g_mbo_iter = 2  # iterations of MBO in each episode: g_mbo_iter*sum(getParamLengths(ps))
diff --git a/demo/reinbo_table_demo.R b/demo/reinbo_table_demo.R new file mode 100644 index 0000000..5f2c73c --- /dev/null +++ b/demo/reinbo_table_demo.R @@ -0,0 +1,41 @@
+rm(list = ls())
+library(mlr)
+library(mlrCPO)
+library(reticulate)
+library(BBmisc)
+library(OpenML)
+library(hash)
+library(rlR)
+library(mlrMBO)
+library(phng)
+library(R6)
+
+source("reinbo_table_hyperpara_space.R")
+source("reinbo_table_utils.R")
+source("reinbo_table_env.R")
+source("reinbo_table_func.R")
+source("system.R")
+source("bt_conf.R")
+
+library(magrittr)
+library(mlrCPO)
+library(OpenML)
+task = convertOMLTaskToMlr(getOMLTask(37))$mlr.task %>>% cpoDummyEncode(reference.cat = FALSE)
+outer_loop = makeResampleInstance("CV", iters = 5, stratify = TRUE, task)
+train_set = outer_loop$train.inds[[1]]
+test_set = outer_loop$test.inds[[1]]
+
+
+# The pipeline search space could also be customized by setting, e.g.,
+custom_operators = list(preprocess = c("cpoScale()", "NA"),
+                        filter = c("cpoPca(center = FALSE, rank)", "cpoFilterAnova(perc)", "NA"),
+                        classifier = c("classif.kknn", "classif.naiveBayes"))
+
+
+
+
+best_model = reinbo(task, custom_operators = NULL, budget = 100, train_set = train_set)
+# best_model = reinbo(task, custom_operators = custom_operators, budget = 100, train_set = train_set)
+pred = lock_eval.reinbo.table(task, measure = list(mmce), train_set, test_set, best_model)
+best_model$env$agent$q_tab
+best_model$env$agent$act_names_per_state
diff --git a/demo/reinbo_table_env.R b/demo/reinbo_table_env.R new file mode 100644 index 0000000..3206439 --- /dev/null +++ b/demo/reinbo_table_env.R @@ -0,0 +1,132 @@
+Q_table_Env = R6::R6Class(
+  "Q_table_Env",
+  inherit = rlR::Environment,
+  public = list(
+    step_cnt = NULL,
+    s_r_d_info = NULL,
+    task = NULL,
+    mbo_cache = NULL,  # store pipeline, hyperparameter set and corresponding performance for MBO
+    model_best_perf = NULL,  # best performance of the sampled model so far
+    model_trained = NULL,  # store all trained models (limited by the budget)
+    budget = NULL,  # maximum number of models to be evaluated
+    measure = NULL,
+    cv_instance = NULL,
+    ctrl = NULL,
+    initialize = function(task, budget, measure, cv_instance, ctrl) {
+      self$flag_continous = FALSE  # discrete action space
+      self$flag_tensor = FALSE  # no use of cnn
+      self$ctrl = ctrl
+      self$act_cnt = self$ctrl$g_act_cnt  # available operators/actions at each stage
+      self$state_dim = self$ctrl$g_state_dim
+      self$step_cnt = 0L
+      self$s_r_d_info = list(
+        state = "s",
+        reward = 0,
+        done = FALSE,
+        info = list())
+      self$task = task
+      self$mbo_cache = hash()
+      self$model_trained = NULL
+      self$budget = budget
+      self$measure = measure
+      self$cv_instance = cv_instance
+    },
+
+    evaluateArm = function(vec_arm) {
+      return(vec_arm)
+    },
+
+    # This function will be called at each step of the learning
+    step = function(action) {
+      operators = self$ctrl$g_operators[[names(self$ctrl$g_operators)[self$step_cnt + 1]]]
+      # map the action index onto an operator; indexing with action %% length(operators)
+      # alone would return an empty vector whenever the modulus is zero
+      mod = action %% length(operators)
+      if (mod == 0) {
+        operator = operators[length(operators)]
+      } else {
+        operator = operators[mod]
+      }
+      self$s_r_d_info[["state"]] = paste0(self$s_r_d_info[["state"]], "-[", operator, "]")
+      #print(self$s_r_d_info[["state"]])
+      self$s_r_d_info[["reward"]] = 0
+      self$step_cnt = self$step_cnt + 1L
+      if (self$step_cnt >= self$ctrl$g_max_depth) {
+        model = g_getRLPipeline(self$s_r_d_info[["state"]])
+        #print(paste(model, collapse = " --> "))
+        # stop the RL agent if there is not enough budget for this episode:
+        model_id = paste(model, collapse = "\t")
+        if (has.key(model_id, self$mbo_cache)) {
+          require_budget = self$ctrl$g_mbo_iter*sum(getParamLengths(g_getParamSetFun(model)))
+        } else {
+          require_budget = (self$ctrl$g_init_design + self$ctrl$g_mbo_iter)*sum(getParamLengths(g_getParamSetFun(model)))
+        }
+        if (self$budget < require_budget) stop("total budget too small for reinbo table!")
+        if (self$budget - length(self$model_trained) < require_budget) {
+          self$agent$interact$idx_episode = self$agent$interact$maxiter
+          self$s_r_d_info[["done"]] = TRUE
+        } else {
+          # train the model with hyperparameter tuning:
+          self$tuning(model)
+          self$s_r_d_info[["reward"]] = self$model_best_perf  # best performance of the model so far
+          self$s_r_d_info[["done"]] = TRUE
+          #print(paste("Best Performance:", self$model_best_perf))
+        }
+      }
+      return(self$s_r_d_info)
+    },
+
+
+    # This function will be called at the beginning of the learning and at the end of each episode
+    reset = function() {
+      self$step_cnt = 0
+      self$s_r_d_info[["state"]] = "s"
+      self$s_r_d_info[["done"]] = FALSE
+      self$s_r_d_info
+    },
+
+
+    # Hyperparameter tuning for the generated model; returns the best performance as reward and updates mbo_cache
+    tuning = function(model) {
+      model_id = paste(model, collapse = "\t")  # model_id for lookup in mbo_cache
+      ps = g_getParamSetFun(model)  # generate the parameter set
+
+      # check if we have already evaluated this model
+
+      # if already in mbo_cache:
+      if (has.key(model_id, self$mbo_cache)) {
+        previous_perf = max(self$mbo_cache[[model_id]][, "y"])  # best performance so far
+        epis_unimproved = self$mbo_cache[[model_id]][1, "epis_unimproved"]  # number of episodes without improvement
+        # if the performance of this model has not improved in more than 2 episodes,
+        # stop further hyperparameter tuning:
+        if (epis_unimproved > 2) {
+          self$model_best_perf = previous_perf
+        } else {
+          # else: use the parameter sets and performances in memory as initial design
+          design = self$mbo_cache[[model_id]][, -length(self$mbo_cache[[model_id]])]
+          # run several iterations of MBO:
+          run = mbo_fun(self$task, model, design, self$measure, self$cv_instance, self$ctrl)
+          # best accuracy:
+          self$model_best_perf = run$y
+          # update mbo_cache:
+          self$mbo_cache[[model_id]] = run$opt.path$env$path
+          # add the results to self$model_trained:
+          new = run$opt.path$env$path$y[run$opt.path$env$dob != 0]
+          self$model_trained = c(self$model_trained, new)
+          # check if the performance of this model improved in this episode:
+          if (run$y <= previous_perf) {
+            self$mbo_cache[[model_id]]["epis_unimproved"] = epis_unimproved + 1
+          } else {
+            self$mbo_cache[[model_id]]["epis_unimproved"] = 0
+          }
+        }
+      } else {
+
+        # if not in mbo_cache:
+        design =
generateDesign(n = self$ctrl$g_init_design*sum(getParamLengths(ps)), par.set = ps) + run = mbo_fun(self$task, model, design, self$measure, self$cv_instance, self$ctrl) # potential warning: generateDesign could only produce 3 points instead of 1000, see issue 442 of mlrMBO + self$model_best_perf = run$y + self$mbo_cache[[model_id]] = run$opt.path$env$path + self$mbo_cache[[model_id]]["epis_unimproved"] = 0 + new = run$opt.path$env$path$y + self$model_trained = c(self$model_trained, new) + } + } + ) +) + + diff --git a/demo/reinbo_table_func.R b/demo/reinbo_table_func.R new file mode 100644 index 0000000..eedfd4a --- /dev/null +++ b/demo/reinbo_table_func.R @@ -0,0 +1,109 @@ +# ML_ReinBo algorithm: +opt.reinbo.table = function(task, budget, measure, init_val, train_set = NULL, conf, ctrl) { + subTask = task + if (!is.null(train_set)) subTask = subsetTask(task, train_set) + inner_loop = makeResampleInstance("CV", iters = getGconf()$NCVInnerIter, stratify = TRUE, subTask) + env = runQTable(subTask, budget, measure, inner_loop, init_val, conf, ctrl) + mmodel = getBestModel(env$mbo_cache) + return(list(mmodel = mmodel, env = env)) +} + +# Predict function: evaluate best model on test dataset +lock_eval.reinbo.table = function(task, measure, train_set, test_set, best_model){ + best_model = best_model$mmodel + lrn = genLearnerForBestModel(task, best_model, measure) + mod = train(lrn, task, subset = train_set) + pred = predict(mod, task, subset = test_set) + perf = performance(pred, measures = measure) + return(perf) +} + + +# Reinforcement learning part: +#' @param ctrl pipeline configuration +runQTable <- function(task, budget, measure, instance, init_val, conf, ctrl) { + env = Q_table_Env$new(task, budget, measure, instance, ctrl) + agent = initAgent(name = "AgentTable", env = env, conf = conf, q_init = init_val, + state_names = ctrl$g_state_names, + act_names_per_state = get_act_names_perf_state(ctrl$g_operators), + vis_after_episode = TRUE) + agent$learn(getGconf()$RLMaxEpisode) + return(env) +} + +# MBO function: hyperparameter tuning +#' @param model character vector +mbo_fun = function(task, model, design, measure, cv_instance, ctrl) { + ps = g_getParamSetFun(model) # get parameter set from string representation of a model + object = makeSingleObjectiveFunction( + fn = function(x) { + -reinbo_mlr_fun(task, model, x, measure, cv_instance) + runif(1)/100000 + }, + par.set = ps, + has.simple.signature = FALSE, + minimize = FALSE + ) + ctrlmbo = setMBOControlTermination(makeMBOControl(), iters = ctrl$g_mbo_iter * sum(getParamLengths(ps))) # 2 times the parameter set size + run = mbo(object, design = design, control = ctrlmbo, show.info = FALSE) + ## in (function (fn, nvars, max = FALSE, pop.size = 1000, max.generations = 100, : Stopped because hard maximum generation limit was hit. + ## Genoud is a function that combines evolutionary search algorithms with derivative-based (Newton or quasi-Newton) methods to solve difficult optimization problems. + ## not always occur: Warning in generateDesign(control$infill.opt.focussearch.points, ps.local,: generateDesign could only produce 20 points instead of 1000! 
+  ## see https://github.com/mlr-org/mlrMBO/issues/442, being worked on in https://github.com/mlr-org/mlrMBO/pull/444
+  return(run)
+}
+
+
+# Mlr function: calculate the performance of a generated model given a specific param_set
+reinbo_mlr_fun = function(task, model, param_set, measure, cv_instance) {
+  lrn = genLearner.reinbo(task, model, param_set, measure)
+  perf = resample(lrn, task, resampling = cv_instance, measures = measure, show.info = FALSE)$aggr
+  return(perf)
+}
+
+
+
+# Get the best model from the mbo_cache of the environment:
+getBestModel = function(cache) {
+  models = keys(cache)
+  results = data.frame(model = 0, y = 0)
+  for (i in 1:length(models)) {
+    results[i, 1] = models[i]
+    results[i, 2] = max(cache[[models[i]]][, "y"])
+  }
+  key = results[results$y == max(results$y), "model"][1]
+  ps = cache[[key]]
+  ps = ps[(ps$y == max(ps$y)), (colnames(ps) != "epis_unimproved")][1, ]
+  return(data.frame(Model = key, ps))
+}
+
+genLearnerForBestModel = function(task, best_model, measure) {
+  model = strsplit(as.character(best_model$Model), "\t")[[1]]
+  param_set = as.list(best_model)
+  param_set$Model = NULL
+  param_set$y = NULL
+  if (!is.null(param_set$C)) { param_set$C = 2^param_set$C }
+  if (!is.null(param_set$sigma)) { param_set$sigma = 2^param_set$sigma }
+  lrn = genLearner.reinbo(task, model, param_set, measure)
+  return(lrn)
+}
+
+
+genLearner.reinbo = function(task, model, param_set, measure) {
+  p = getTaskNFeats(task)
+  lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('%s', par.vals = ps.learner)",
+                model[1], model[2], model[3])
+  lrn = gsub(pattern = "perc", x = lrn, replacement = "perc = param_set$perc", fixed = TRUE)
+  lrn = gsub(pattern = "rank", x = lrn, replacement = "rank = as.integer(max(1, round(p*param_set$rank)))", fixed = TRUE)
+  lrn = gsub(pattern = "NA %>>%", x = lrn, replacement = "", fixed = TRUE)
+  ps.learner = param_set
+  ps.learner$perc = NULL
+  ps.learner$rank = NULL
+  if (model[3] == "classif.ranger") {
+    p1 = p
+    if (!is.null(param_set$perc)) {p1 = max(1, round(p*param_set$perc))}
+    if (!is.null(param_set$rank)) {p1 = max(1, round(p*param_set$rank))}
+    ps.learner$mtry = max(1, as.integer(p1*param_set$mtry))
+  }
+  lrn = eval(parse(text = lrn))
+  return(lrn)
+}
diff --git a/demo/reinbo_table_hyperpara_space.R b/demo/reinbo_table_hyperpara_space.R new file mode 100644 index 0000000..404f31d --- /dev/null +++ b/demo/reinbo_table_hyperpara_space.R @@ -0,0 +1,42 @@
+##### Parameter sets of operators for hyperparameter tuning:
+ps.ksvm = makeParamSet(
+  makeNumericParam("C", lower = -15, upper = 15, trafo = function(x) 2^x),
+  makeNumericParam("sigma", lower = -15, upper = 15, trafo = function(x) 2^x))
+
+ps.ranger = makeParamSet(
+  makeNumericParam("mtry", lower = 1/10, upper = 1/1.5),  ## range (p/10, p/1.5), p is the number of features
+  makeNumericParam("sample.fraction", lower = .1, upper = 1))
+
+ps.xgboost = makeParamSet(
+  makeNumericParam("eta", lower = .001, upper = .3),
+  makeIntegerParam("max_depth", lower = 1L, upper = 15L),
+  makeNumericParam("subsample", lower = 0.5, upper = 1),
+  makeNumericParam("colsample_bytree", lower = 0.5, upper = 1),
+  makeNumericParam("min_child_weight", lower = 0, upper = 50)
+)
+
+ps.kknn = makeParamSet(makeIntegerParam("k", lower = 1L, upper = 20L))
+
+ps.naiveBayes = makeParamSet(makeNumericParam("laplace", lower = 0.01, upper = 100))
+
+ps.filter = makeParamSet(makeNumericParam("perc", lower = .1, upper = 1))
+
+ps.pca = makeParamSet(makeNumericParam("rank", lower = .1, upper = 1))  ## range (p/10, p), p is the number of features
+
+
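+# A minimal sanity check of the trafo handling, assuming only ParamHelpers is
+# loaded (illustrative sketch, not part of the package): sampling ps.ksvm with
+# trafo = TRUE should yield C and sigma inside [2^-15, 2^15].
+# library(ParamHelpers)
+# x = sampleValue(ps.ksvm, trafo = TRUE)
+# stopifnot(x$C >= 2^-15, x$C <= 2^15)
+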
+##### Get the parameter set for a generated model:
+g_getParamSetFun = function(model) {
+  ps.classif = sub(pattern = "classif", replacement = "ps", x = model[3]) # e.g. "classif.ksvm" -> "ps.ksvm"
+  ps.classif = eval(parse(text = ps.classif)) # hyperparameter set for the classifier
+  if (model[2] == "NA") {
+    return(ps.classif)
+  } else if (length(grep(pattern = "perc", x = model)) > 0) {
+    return(c(ps.classif, ps.filter))
+  } else {
+    return(c(ps.classif, ps.pca))
+  }
+}
+
+
diff --git a/demo/reinbo_table_reinbo.R b/demo/reinbo_table_reinbo.R new file mode 100644 index 0000000..66adc9d --- /dev/null +++ b/demo/reinbo_table_reinbo.R @@ -0,0 +1,19 @@
+#### Entry point: assemble the pipeline control list and run ReinBo
+#' @param task an mlr task
+#' @param custom_operators optional named list overriding the default operators per stage
+#' @param budget maximum number of model evaluations
+#' @param train_set indices of the training observations
+reinbo = function(task, custom_operators, budget, train_set) {
+  ## Parameters for RL environment:
+  ctrl = list()
+  ctrl$g_operators = g_getOperatorList(custom_operators)
+  ctrl$g_max_depth = length(ctrl$g_operators) # stages: Scaling --> Feature filtering --> Classification
+  ctrl$g_act_cnt = max(sapply(ctrl$g_operators, length)) # max number of available operators at each stage
+  ctrl$g_state_names = g_genStateList(ctrl$g_operators)
+  ctrl$g_state_dim = length(ctrl$g_state_names)
+  ## Parameters for BO_PROBE:
+  ctrl$g_init_design = 4 # initial design size for MBO: g_init_design*sum(getParamLengths(par.set))
+  ctrl$g_mbo_iter = 2 # iterations of MBO in each episode: g_mbo_iter*sum(getParamLengths(ps))
+
+  conf = rlR::getDefaultConf("AgentTable")
+  conf$set(policy.maxEpsilon = 1, policy.minEpsilon = 0.01, policy.aneal.steps = 60)
+  best_model = opt.reinbo.table(task, budget = budget, measure = list(mmce), train_set = train_set, init_val = -1, conf = conf, ctrl = ctrl)
+  best_model
+} diff --git a/demo/reinbo_table_test.R b/demo/reinbo_table_test.R new file mode 100644 index 0000000..fbe3b0c --- /dev/null +++ b/demo/reinbo_table_test.R @@ -0,0 +1,31 @@
+rm(list = ls())
+library(mlr)
+library(mlrCPO)
+library(reticulate)
+library(BBmisc)
+library(OpenML)
+library(hash)
+library(rlR)
+library(mlrMBO)
+library(phng)
+library(R6)
+
+source("reinbo_table_hyperpara_space.R")
+source("reinbo_table_utils.R")
+source("reinbo_table_env.R")
+source("reinbo_table_func.R")
+source("system.R")
+source("bt_conf.R")
+
+task = convertOMLTaskToMlr(getOMLTask(37))$mlr.task %>>% cpoDummyEncode(reference.cat = FALSE)
+outer_loop = makeResampleInstance("CV", iters = 5, stratify = TRUE, task)
+
+train_set = outer_loop$train.inds[[1]]
+test_set = outer_loop$test.inds[[1]]
+
+# reinbo() assembles the pipeline ctrl list and the agent conf (maxEpsilon = 1,
+# minEpsilon = 0.01, aneal.steps = 60) internally; calling opt.reinbo.table()
+# directly would additionally require a ctrl argument
+best_model = reinbo(task, custom_operators = NULL, budget = 100L, train_set = train_set)
+pred = lock_eval.reinbo.table(task, measure = list(mmce), train_set, test_set, best_model)
+best_model$env$agent$q_tab
+best_model$env$agent$act_names_per_state diff --git a/demo/reinbo_table_utils.R b/demo/reinbo_table_utils.R new file mode 100644 index 0000000..af90408 --- /dev/null +++ b/demo/reinbo_table_utils.R @@ -0,0 +1,57 @@
+source("reinbo_table_reinbo.R")
+# Get the list of operators per stage, with user overrides:
+#' @examples g_getOperatorList(NULL)
+g_getOperatorList = function(custom_operators) {
+  default_operators = list(
+    preprocess = c("cpoScale()", "cpoScale(scale = FALSE)", "cpoScale(center = FALSE)", "cpoSpatialSign()", "NA"),
+    filter = c("cpoFilterAnova(perc)", "cpoFilterKruskal(perc)", "cpoPca(center = FALSE, rank)", "cpoFilterUnivariate(perc)", "NA"),
+    classifier = c("classif.ksvm", "classif.ranger", "classif.kknn", "classif.xgboost", "classif.naiveBayes"))
+  for
(stage in names(default_operators)){ + if (!is.null(custom_operators[[stage]])) { + default_operators[stage] = custom_operators[stage] + } + } + return(default_operators) +} + +# Generate list of all potential states in Q table: +g_genStateList = function(operators) { + state_list = c("s") + last_stage = state_list + for (stage in c("preprocess", "filter")){ + current_stage = c() + for (i in last_stage){ + for (j in operators[stage]){ + current_stage = c(current_stage, paste0(i, "-[", j, "]")) + } + } + state_list = c(state_list, current_stage) + last_stage = current_stage + } + return(state_list) +} + + +# Get list of all potential actions at each state: +get_act_names_perf_state = function(g_operators){ + list = list("s" = g_operators$preprocess) + step1_states = sprintf("s-[%s]", g_operators$preprocess) + for (i in step1_states) { + text = sprintf("list$'%s' = g_operators$filter", i) + eval(parse(text = text)) + for (j in sprintf("%s-[%s]", i, g_operators$filter)) { + text = sprintf("list$'%s' = g_operators$classifier", j) + eval(parse(text = text)) + } + } + return(list) +} + +# Get model at end of each episode: +g_getRLPipeline = function(last_state) { + model = unlist(lapply(strsplit(last_state, "-")[[1]][-1], + function(x) { + x = gsub("[", x, replacement = "", fixed = TRUE) + gsub("]", x, replacement = "", fixed = TRUE)})) + return(model) +} diff --git a/demo/smac_obj.R b/demo/smac_obj.R new file mode 100644 index 0000000..b62ff2f --- /dev/null +++ b/demo/smac_obj.R @@ -0,0 +1,17 @@ +# Objective to optimize: +toy_smac_obj = function(cfg) { + print(cfg) + runif(1) +} +smac_objective = function(cfg) { + # some variables are defined in the scope where this function is called + model_index <<- model_index + 1 + model_list[[model_index]] <<- cfg + lrn = gen_mlrCPOPipe_from_smac_cfg(cfg) + perf = resample(lrn, subTask, resampling = inner_loop, measures = measure, show.info = FALSE)$aggr + perf_list <<- c(perf_list, as.numeric(perf)) + return(perf) +} + + + diff --git a/demo/smac_obj.py b/demo/smac_obj.py new file mode 100644 index 0000000..d6ec23a --- /dev/null +++ b/demo/smac_obj.py @@ -0,0 +1,30 @@ +import rpy2 +import rpy2.robjects as robjects +import rpy2.robjects.numpy2ri +rpy2.robjects.numpy2ri.activate() +robjects.conversion.py2ri = rpy2.robjects.numpy2ri +from rpy2.robjects.packages import STAP +# if rpy2 < 2.6.1 do: +# from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage +# STAP = SignatureTranslatedAnonymousPackage +with open('smac_obj.R', 'r') as f: + string = f.read() +myfunc = STAP(string, "toy_smac_obj") +def smac_obj_from_cfg(cfg): + """ Creates a SVM based on a configuration and evaluates it on the + iris-dataset using cross-validation. + + Parameters: + ----------- + cfg: Configuration (ConfigSpace.ConfigurationSpace.Configuration) + Configuration containing the parameters. + Configurations are indexable! + + Returns: + -------- + A crossvalidated mean score for the svm on the loaded data-set. + """ + # For deactivated parameters, the configuration stores None-values. + # This is not accepted by the SVM, so we remove them. 
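+    # (the docstring above is inherited from the SMAC SVM quickstart; here the cfg is
+    # simply forwarded to the R function toy_smac_obj)
+    # e.g. a hypothetical {'Model': 'kknn', 'lrn_kknn_k': 7, 'fe_anova_perc': None}
+    # becomes {'Model': 'kknn', 'lrn_kknn_k': 7}; note the truthiness test would also
+    # drop a parameter whose value is exactly 0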
+ cfg = {k : cfg[k] for k in cfg if cfg[k]} + return myfunc.toy_smac_obj(cfg) diff --git a/demo/system.R b/demo/system.R new file mode 100644 index 0000000..e28875f --- /dev/null +++ b/demo/system.R @@ -0,0 +1,12 @@ +sys = Sys.info() +flag_local = as.list(sys)$user == "JialiLin" +mconf.file = NULL + +if (flag_local) { + reticulate::use_python("/usr/local/bin/python3") + mconf.file = NA +} else { + reticulate::use_condaenv("w_env") + mconf.file = "lrz.batchtools.conf.R" +} + diff --git a/demo/tpe_func.R b/demo/tpe_func.R new file mode 100644 index 0000000..75e8fb9 --- /dev/null +++ b/demo/tpe_func.R @@ -0,0 +1,62 @@ +# TPE algorithm: +opt.tpe = function(task, budget, measure, train_set = NULL) { + subTask <<- task + if (!is.null(train_set)) subTask <<- subsetTask(task, train_set) + inner_loop <<- makeResampleInstance("CV", iters = getGconf()$NCVInnerIter, stratify = TRUE, subTask) + source_python("tpe_space.py") + hp = import("hyperopt") + model_index <<- 0 + model_list <<- list() + perf_list <<- NULL + measure <<- measure + best = hp$fmin(objective, space = space, algo = hp$tpe$suggest, max_evals = budget) + best_model_index = which(perf_list == min(perf_list))[1] + mmodel = model_list[[best_model_index]] + return(mmodel) +} + + +# Predict function: evaluate best model on test dataset +lock_eval.tpe = function(task, measure, train_set, test_set, best_model){ + lrn = genLearner.tpe(best_model) + mod = train(lrn, task, subset = train_set) + pred = predict(mod, task, subset = test_set) + mpred = performance(pred, measures = measure) + return(mpred) +} + + +# Objective to optimize: +objective = function(args) { + model_index <<- model_index + 1 + model_list[[model_index]] <<- args + lrn = genLearner.tpe(args) + perf = resample(lrn, subTask, resampling = inner_loop, measures = measure, show.info = FALSE)$aggr + perf_list <<- c(perf_list, as.numeric(perf)) + return(perf) +} +# one sample of args: args = hp$pyll$stochastic$sample(py$space) + + +# Generate mlr learner for configuration: +genLearner.tpe = function(args){ + model = args$Classifier$model + args$Classifier$model = NULL + ps.learner = args$Classifier # list in R, which can be evaluated by eval + filter = args$FeatureFilter$filter + lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('classif.%s', par.vals = ps.learner)", + args$Preprocess, filter, model) + lrn = gsub(pattern = "NA %>>%", x = lrn, replacement = "", fixed = TRUE) + lrn = gsub(pattern = "perc", x = lrn, replacement = "perc = args$FeatureFilter$perc", fixed = TRUE) + p = getTaskNFeats(subTask) + lrn = gsub(pattern = "rank", x = lrn, replacement = "rank = as.integer(max(1, round(p*args$FeatureFilter$rank)))", fixed = TRUE) + if (model == "ranger") { + p1 = p + if (!is.null(args$FeatureFilter$perc)) {p1 = max(1, round(p*args$FeatureFilter$perc))} + if (!is.null(args$FeatureFilter$rank)) {p1 = max(1, round(p*args$FeatureFilter$rank))} + ps.learner$mtry = max(1, as.integer(p1*args$FeatureFilter$mtry)) + } + lrn = eval(parse(text = lrn)) + return(lrn) +} + diff --git a/demo/tpe_space.py b/demo/tpe_space.py new file mode 100644 index 0000000..2a93357 --- /dev/null +++ b/demo/tpe_space.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on 15.2.2019 + +@author: Jiali Lin +""" + +from hyperopt import hp +import hyperopt.pyll.stochastic + +# Define the search space +space = { + # Step 1: + 'Preprocess': hp.choice('pre', + ['cpoScale()', + 'cpoScale(scale = FALSE)', + 'cpoScale(center = FALSE)', + 'cpoSpatialSign()', + 'NA']), + + + # Step 2: + 
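+    # (mirrors ps.filter / ps.pca on the R side: perc and rank are fractions in
+    # [0.1, 1]; genLearner.tpe later rescales rank to an integer column count)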
'FeatureFilter': hp.choice('feature', [ + {'filter': 'cpoFilterAnova(perc)', + 'perc': hp.uniform('ano_per', 0.1, 1)}, + + {'filter': 'cpoFilterKruskal(perc)', + 'perc': hp.uniform('kru_per', 0.1, 1)}, + + {'filter': 'cpoFilterUnivariate(perc)', + 'perc': hp.uniform('uni_per', 0.1, 1)}, + + {'filter': 'cpoPca(center = FALSE, rank)', + 'rank': hp.uniform('pca_rank', 0.1, 1)}, + + {'filter': 'NA'}]), + + + # Step 3: + 'Classifier': hp.choice('classify_model', [ + {'model': 'kknn', + 'k': 1 + hp.randint('kknn_k', 19)}, + + {'model': 'ksvm', + 'C': hp.uniform('ksvm_C', 2**(-15), 2**(15)), + 'sigma': hp.uniform('ksvm_sigma', 2**(-15), 2**(15))}, + + {'model': 'ranger', + 'mtry': hp.uniform('ranger_mtry', 0.1, 0.66666), + 'sample.fraction': hp.uniform('ranger_fra', 0.1, 1)}, + + {'model': 'xgboost', + 'eta': hp.uniform('xgboost_eta', 0.001, 0.3), + 'max_depth': 1 + hp.randint('xgboost_depth', 14), + 'subsample': hp.uniform('xgboost_sub', 0.5, 1), + 'colsample_bytree': hp.uniform('xgboost_col', 0.5, 1), + 'min_child_weight': hp.uniform('xgboost_min', 0, 50)}, + + {'model': 'naiveBayes', + 'laplace': hp.uniform('bay_laplace', 0.01, 100)} + + ])} + + + +# Sample one configuration: +# print(hyperopt.pyll.stochastic.sample(space)) +#print(hyperopt.pyll.stochastic.sample(space)) +#106 {'Classifier': {'model': 'ranger', 'mtry': 0.574453305013119, 'sample.fracti +#107 on': 0.8656502995483121}, 'FeatureFilter': {'filter': 'cpoFilterAnova(perc)' +#108 , 'perc': 0.3726989872044636}, 'Preprocess': 'NA'} diff --git a/demo/tpot_func.R b/demo/tpot_func.R new file mode 100644 index 0000000..1b63286 --- /dev/null +++ b/demo/tpot_func.R @@ -0,0 +1,17 @@ +source("system.R") +tpot = import("tpot") +opt.tpot = function(task, budget, measure, train_set) { + conf_tpot = getGconf()$conf_tpot + pipeline_optimizer = tpot$TPOTClassifier(generations = conf_tpot$generations, population_size = conf_tpot$population_size, + offspring_size = conf_tpot$offspring_size, cv = getGconf()$NCVInnerIter, + config_dict = conf_tpot$config_dict) + train_data = getTaskData(task, train_set, target.extra = TRUE) + pipeline_optimizer$fit(train_data$data, as.numeric(train_data$target)) + return(pipeline_optimizer) +} + +lock_eval.tpot = function(task, measure = NULL, train_set = NULL, test_set, best_model) { + test_data = getTaskData(task, test_set, target.extra = TRUE) + mpred = 1 - best_model$score(test_data$data, as.numeric(test_data$target)) + return(mpred) +} \ No newline at end of file diff --git a/demo/tpot_test.R b/demo/tpot_test.R new file mode 100644 index 0000000..a542881 --- /dev/null +++ b/demo/tpot_test.R @@ -0,0 +1,27 @@ +rm(list = ls()) +library(mlr) +library(mlrCPO) +library(reticulate) +library(BBmisc) +library(OpenML) +source("system.R") + +set.seed(1) +task = convertOMLTaskToMlr(getOMLTask(37))$mlr.task %>>% cpoDummyEncode(reference.cat = FALSE) +outer_loop_CV5 = makeResampleInstance("CV", iters = 5, task = task) +train_set = outer_loop_CV5$train.inds[[1]] +test_set = outer_loop_CV5$test.inds[[1]] +train_data = getTaskData(task, train_set, target.extra = TRUE) +test_data = getTaskData(task, test_set, target.extra = TRUE) + +tpot = import("tpot") +pipeline_optimizer = tpot$TPOTClassifier(generations = 2L, population_size = 3L, + offspring_size = 3L, cv = 5L, + config_dict = 'TPOT light') + +pipeline_optimizer$fit(train_data$data, as.numeric(train_data$target)) +pred = 1 - pipeline_optimizer$score(test_data$data, as.numeric(test_data$target)) + + + + diff --git a/demo/utility.R b/demo/utility.R new file mode 100644 index 
0000000..f4f09d9 --- /dev/null +++ b/demo/utility.R @@ -0,0 +1,14 @@ +resample_opt_lock = function(mlr_task_full, outer_loop_rins, func_opt, func_eval, args_opt = list(), args_eval = list()) { + outer_iters = getGconf()$NCVOuterIter + measure = getGconf()$measures + list_lock = foreach(outer_iter = 1:outer_iters) %do% { + opt_set = outer_loop_rins$train.inds[[outer_iter]] + lock_set = outer_loop_rins$test.inds[[outer_iter]] + mmodel = do.call(func_opt, args = c(list(task = mlr_task_full, train_set = opt_set, measure = measure), args_opt)) + mpred = do.call(func_eval, args = c(list(task = mlr_task_full, train_set = opt_set, test_set = lock_set, measure = measure, best_model = mmodel), args_eval)) + return(list(mmodel = mmodel, mpred = mpred)) + } + list_mmodel = rlist::list.map(list_lock, mmodel) + vec_mpred = unlist(rlist::list.map(list_lock, mpred)) + return(list(list_mmodel = list_mmodel, vec_mpred = vec_mpred)) +} diff --git a/lte/agg.R b/lte/agg.R new file mode 100644 index 0000000..588735c --- /dev/null +++ b/lte/agg.R @@ -0,0 +1,19 @@ +tb = getJobPars(findDone()) +getJobPars(findQueued()) +getJobPars(findSubmitted()) +unique(tb[, "problem"]) +ids = tb[(algorithm == "reinbo_table") & (problem == "LED-display-domain-7digit"), job.id, with = T] +ids = tb[(algorithm == "reinbo_table") & (problem == "wdbc"), job.id, with = T] +ids = findDone() +reslist = reduceResultsList(ids = ids, fun = function(job, res) { + res2 = list() + res2$prob.name = job$prob.name + res2$algo.name = job$algo.name + res2$job.id = job$job.id + res2$repl = job$repl + res2$mmce = mean(res$vec_mpred) + #res2$model = res$list_mmodel + res2 +}) +dt_res = rbindlist(reslist) +saveRDS(dt_res, file = "reinbo_new_cut_episode.rds") diff --git a/lte/algo_auto_sklearn.R b/lte/algo_auto_sklearn.R new file mode 100644 index 0000000..cdc0fe9 --- /dev/null +++ b/lte/algo_auto_sklearn.R @@ -0,0 +1,8 @@ +algo_fun_auto_sklearn = function(job, data, instance, measure = list(mmce), flag_light) { + resample_opt_lock(instance$mlr_task_full, + outer_loop_rins = instance$rins, + func_opt = opt.auto.sklearn, + args_opt = list(budget = getGconf()$budget, job_id = job$job.id, flag_light = flag_light), + func_eval = lock_eval.auto.sklearn, + args_eval = list()) +} diff --git a/lte/algo_irace.R b/lte/algo_irace.R new file mode 100644 index 0000000..296da1b --- /dev/null +++ b/lte/algo_irace.R @@ -0,0 +1,10 @@ +library(irace) +algo_fun_irace = function(job, data, instance, measure = list(mmce)) { + resample_opt_lock( + instance$mlr_task_full, + outer_loop_rins = instance$rins, + func_opt = opt.irace, + args_opt = list(budget = getGconf()$budget), + func_eval = lock_eval.irace, + args_eval = list()) +} \ No newline at end of file diff --git a/lte/algo_mlrmbo.R b/lte/algo_mlrmbo.R new file mode 100644 index 0000000..d982d6d --- /dev/null +++ b/lte/algo_mlrmbo.R @@ -0,0 +1,9 @@ +algo_fun_mlrmbo = function(job, data, instance, measure = list(mmce)) { + resample_opt_lock( + instance$mlr_task_full, + outer_loop_rins = instance$rins, + func_opt = opt.mlrmbo, + args_opt = list(budget = getGconf()$budget), + func_eval = lock_eval.mlrmbo, + args_eval = list()) +} \ No newline at end of file diff --git a/lte/algo_random_search.R b/lte/algo_random_search.R new file mode 100644 index 0000000..36e3080 --- /dev/null +++ b/lte/algo_random_search.R @@ -0,0 +1,9 @@ +algo_fun_random_search = function(job, data, instance, measure = list(mmce)) { + resample_opt_lock( + instance$mlr_task_full, + outer_loop_rins = instance$rins, + func_opt = opt.random.search, + 
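+    # all optimizers draw the same evaluation allowance from the shared getGconf() config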
args_opt = list(budget = getGconf()$budget), + func_eval = lock_eval.random.search, + args_eval = list()) +} \ No newline at end of file diff --git a/lte/algo_reinbo_table.R b/lte/algo_reinbo_table.R new file mode 100644 index 0000000..dd3fa36 --- /dev/null +++ b/lte/algo_reinbo_table.R @@ -0,0 +1,17 @@ +algo_fun_reinbo_table = function(job, data, instance, measure, init_val = -1, conf4agent = NULL) { + if (is.null(conf4agent)) { + conf = rlR::getDefaultConf("AgentTable") + if (init_val == -1) { + conf$set(policy.maxEpsilon = 1, policy.minEpsilon = 0.01, policy.aneal.steps = 60) + } + conf4agent = conf + } + + resample_opt_lock( + instance$mlr_task_full, + outer_loop_rins = instance$rins, + func_opt = opt.reinbo.table, + args_opt = list(budget = getGconf()$budget, init_val = init_val, conf = conf4agent), + func_eval = lock_eval.reinbo.table, + args_eval = list()) +} diff --git a/lte/algo_tpe.R b/lte/algo_tpe.R new file mode 100644 index 0000000..69d5895 --- /dev/null +++ b/lte/algo_tpe.R @@ -0,0 +1,9 @@ +algo_fun_tpe = function(job, data, instance, measure = list(mmce)) { + resample_opt_lock( + instance$mlr_task_full, + outer_loop_rins = instance$rins, + func_opt = opt.tpe, + args_opt = list(budget = getGconf()$budget), + func_eval = lock_eval.tpe, + args_eval = list()) +} diff --git a/lte/algo_tpot.R b/lte/algo_tpot.R new file mode 100644 index 0000000..be450a2 --- /dev/null +++ b/lte/algo_tpot.R @@ -0,0 +1,8 @@ +algo_fun_tpot = function(job, data, instance, measure = list(mmce)) { + resample_opt_lock(instance$mlr_task_full, + outer_loop_rins = instance$rins, + func_opt = opt.tpot, + args_opt = list(budget = getGconf()$budget), + func_eval = lock_eval.tpot, + args_eval = list()) +} diff --git a/lte/auto_sklearn_func.R b/lte/auto_sklearn_func.R new file mode 100644 index 0000000..fa25811 --- /dev/null +++ b/lte/auto_sklearn_func.R @@ -0,0 +1,99 @@ +makeRLearner.classif.autosklearn = function() { + makeRLearnerClassif( + cl = "classif.autosklearn", + package = "reticulate", + # For full paramset see https://automl.github.io/auto-sklearn/master/api.html + # Attention: Defaults are not exactly as in autosklearn + par.set = makeParamSet( + makeIntegerLearnerParam("time_left_for_this_task", lower = 1L, upper = Inf, default = 3600L), + makeIntegerLearnerParam("per_run_time_limit", lower = 1L, upper = Inf, default = 360L), + makeIntegerLearnerParam("initial_configurations_via_metalearning", lower = 0L, upper = Inf, default = 25L), + makeUntypedLearnerParam("include_estimators", default = NULL, special.vals = list(NULL)), + makeUntypedLearnerParam("include_preprocessors", default = NULL, special.vals = list(NULL)), + makeUntypedLearnerParam("exclude_estimators", default = NULL, special.vals = list(NULL)), + makeUntypedLearnerParam("exclude_preprocessors", default = NULL, special.vals = list(NULL)), + makeIntegerLearnerParam("ensemble_size", lower = 0L, upper = Inf, default = 0L), + makeIntegerLearnerParam("ensemble_nbest", lower = 0L, upper = Inf, default = 50L), + makeLogicalLearnerParam("delete_tmp_folder_after_terminate", default = FALSE), + makeLogicalLearnerParam("delete_output_folder_after_terminate", default = FALSE), + makeLogicalLearnerParam("shared_mode", default = FALSE), + makeUntypedLearnerParam("tmp_folder", default = NULL, special.vals = list(NULL)), + makeUntypedLearnerParam("output_folder", default = NULL, special.vals = list(NULL)), + makeIntegerLearnerParam("runcount_limit", lower = 1L, upper = 10L, default = 5L), + makeUntypedLearnerParam("smac_scenario_args", default = NULL, 
special.vals = list(NULL)), + makeDiscreteLearnerParam("resampling_strategy", default = "cv", values = c("cv", "partial-cv", "holdout-iterative-fit", "holdout")), + makeUntypedLearnerParam("resampling_strategy_arguments", default = NULL, special.vals = list(NULL)) + ), + properties = c("twoclass", "multiclass", "numerics", "prob", "missings", "factors"), + name = "Autosklearn", + short.name = "autosklearn", + note = "Defaults deviate from autosklearn defaults" + ) +} + + +trainLearner.classif.autosklearn = function(.learner, .task, .subset, .weights = NULL, ...) { + + autosklearn = import("autosklearn") + classifier = autosklearn$classification$AutoSklearnClassifier(...) + + train = getTaskData(.task, .subset, target.extra = TRUE) + feat.type = ifelse(vlapply(train$data, is.factor), "Categorical", "Numerical") + + classifier$fit(as.matrix(train$data), train$target, feat_type = feat.type) + classifier$fit_ensemble(train$target, ensemble_size = 1) + classifier$refit(as.matrix(train$data), train$target) ## Refit for cv method + return(classifier) +} + +predictLearner.classif.autosklearn = function(.learner, .model, .newdata, ...) { + as.factor(.model$learner.model$predict(as.matrix(.newdata))) +} + + + +# Auto-sklearn algorithm: +opt.auto.sklearn = function(task, budget, measure, job_id, train_set, flag_light) { + # job_id used for folder name + #randstr = stringi::stri_replace(toString(rnorm(1)), replacement = "", regex ="\\.") + if(flag_light) { + g_classifiers = list("random_forest", "k_nearest_neighbors", "libsvm_svc", "xgradient_boosting", "multinomial_nb") + #g_preprocess = list("pca", "no_preprocessing", "select_percentile_classification", "normalize", "standardize", "none", "minmax", "variance_threshold") data preprocessing methods does not work + g_preprocess = list("pca", "no_preprocessing", "select_percentile_classification") + } + else { + g_classifiers = NULL + g_preprocess = NULL + } + + automl = makeLearner("classif.autosklearn", + time_left_for_this_task = 100000L, + per_run_time_limit = 25L, + ensemble_size = 0, + initial_configurations_via_metalearning = 0L, + resampling_strategy = "cv", + include_preprocessors = g_preprocess, + include_estimators = g_classifiers, + # default tmp_folder name will cause no space left error + tmp_folder = paste0("../autosklearn_tmp/autosklearn_tmp", job_id), # it makes more sense touse seperate folder since different job_id are different problems + output_folder = paste0("../autosklearn_tmp/autosklearn_out", job_id), + delete_tmp_folder_after_terminate = FALSE, # to use together with shared_mode = T + #task 2 failed - \"FileExistsError: [Errno 17] File exists: '../autosklearn_tmp/autosklearn_tmp1' + #delete_tmp_folder_after_terminate=T, ## will cause error file exist + delete_output_folder_after_terminate = FALSE, + #delete_output_folder_after_terminate=T, + shared_mode = TRUE, + resampling_strategy_arguments = list(folds = getGconf()$NCVInnerIter), + smac_scenario_args = list(runcount_limit = budget) + ) + mmodel = train(automl, task, subset = train_set) + return(mmodel) +} + +# Predict performance of the best model on test/lock dataset: +lock_eval.auto.sklearn = function(task, measure, train_set = NULL, test_set, best_model) { + prediction = predict(best_model, task, subset = test_set) + mpred = performance(prediction, measures = measure) + return(mpred) +} + diff --git a/lte/auto_sklearn_test.R b/lte/auto_sklearn_test.R new file mode 100644 index 0000000..bc808b3 --- /dev/null +++ b/lte/auto_sklearn_test.R @@ -0,0 +1,66 @@ +rm(list = ls()) 
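+# Interactive smoke test for the classif.autosklearn wrapper: one outer CV fold on
+# OML task 37 (the diabetes data used throughout these scripts); needs a python
+# environment where the autosklearn module is importable via reticulate (system.R).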
+library(mlr) +library(mlrCPO) +library(reticulate) +library(BBmisc) +library(OpenML) +source("system.R") +source("auto_sklearn_func.R") + +set.seed(1) +g_classifiers = list("random_forest", "k_nearest_neighbors", "libsvm_svc", "xgradient_boosting", "multinomial_nb") +g_preprocess = list("pca", "no_preprocessing", "select_percentile_classification") +#g_preprocess = list("pca", "no_preprocessing", "select_percentile_classification", "standardize", "none", "minmax", "variance_threshold") data preprocessing methods does not work +task = convertOMLTaskToMlr(getOMLTask(37))$mlr.task %>>% cpoDummyEncode(reference.cat = FALSE) +outer_loop_CV5 = makeResampleInstance("CV", iters = 5, task = task) + +train_set = outer_loop_CV5$train.inds[[1]] +test_set = outer_loop_CV5$test.inds[[1]] +train_data = getTaskData(task, train_set, target.extra = TRUE) +test_data = getTaskData(task, test_set, target.extra = TRUE) + +automl = makeLearner("classif.autosklearn", + time_left_for_this_task = 1000000L, + per_run_time_limit = 25L, + ensemble_size = 0, + include_preprocessors = g_preprocess, + include_estimators = g_classifiers, + initial_configurations_via_metalearning = 0L, + resampling_strategy = "cv", + resampling_strategy_arguments = list(folds = 5L), + smac_scenario_args = list(runcount_limit = 5L) +) + +a = Sys.time() +mod = train(automl, task, subset = train_set) +prediction = predict(mod, task, subset = test_set) +pred = performance(prediction) +print(Sys.time() - a) + + + + + +# autosklearn = import("autosklearn") +# sklearn = import("sklearn") +# automl = autosklearn$classification$AutoSklearnClassifier( +# time_left_for_this_task = 1000L, +# per_run_time_limit = 200L, +# ensemble_size = 0, +# # include_preprocessors = g_preprocess, +# # include_estimators = g_classifiers, +# initial_configurations_via_metalearning = 0L, +# resampling_strategy = "cv", +# resampling_strategy_arguments = list(folds = 5L), +# smac_scenario_args = dict(runcount_limit = 5L)) + + +# automl$fit(train_data$data, train_data$target, metric=autosklearn$metrics$accuracy) +# automl$fit_ensemble(train_data$target, ensemble_size = 1) +# automl$refit(train_data$data, train_data$target) +# predictions = automl$predict(test_data$data) +# pred = sklearn$metrics$accuracy_score(test_data$target, predictions) + + + + diff --git a/lte/bt_conf.R b/lte/bt_conf.R new file mode 100644 index 0000000..541c5bd --- /dev/null +++ b/lte/bt_conf.R @@ -0,0 +1,52 @@ +flag_debug = F +task_ids = 37 +if (!flag_debug) task_ids = c(14, 23, 37, 53, 3917, 9946, 9952, 9978, 146817, 146820) +getGconf = function() { + conf_common = list( + NCVOuterIter = 5L, + NCVInnerIter = 5L, + measures = list(mlr::mmce), + repl = 10L, + prob_seed = 1L, + RLMaxEpisode = 2000L # this number does not play a role, it only ensures RL could run for sufficient time + ) + + conf_debug = list( + budget = 40L, + conf_tpot = list(generations = 1L, population_size = 3L, offspring_size = 3L, config_dict = 'TPOT light') + ) + + conf_full = list( + budget = 1000L, + # TPOT will evaluate population_size + generations × offspring_size pipelines in total. 
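+    # with the full settings below that is 10 + 20 * 50 = 1010 pipelines, roughly
+    # matching the budget = 1000L granted to the other optimizers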
+ conf_tpot = list(generations = 20L, population_size = 10L, offspring_size = 50L) + ) + if (flag_debug) return(c(conf_debug, conf_common)) + return(c(conf_full, conf_common)) +} + +resources_light = list( + walltime = 60L*60*8, + memory = 1024L*2, + ntasks = 1L, + ncpus = 1L, + nodes = 1L, + clusters = "serial") + +resources_bigmem = list( + walltime = 60L*60*8, + memory = 1024L*4, + ntasks = 1L, + ncpus = 1L, + nodes = 1L, + clusters = "serial") + + + +resources = list( + walltime = 60L*60*12, + memory = 1024L*2, + ntasks = 1L, + ncpus = 1L, + nodes = 1L, + clusters = "serial") diff --git a/lte/bt_post_hoc.R b/lte/bt_post_hoc.R new file mode 100644 index 0000000..bd46007 --- /dev/null +++ b/lte/bt_post_hoc.R @@ -0,0 +1,10 @@ +library(batchtools) +source("system.R") +reg = loadRegistry("reg_test", writeable = T, work.dir = getwd()) +refun = function(job, res) { + cv5 = mean(res$vec_mpred) + list(cv5 = cv5) +} + +res = reduceResultsDataTable(ids = findDone(), fun = refun) +unwrap(res, sep = ".") diff --git a/lte/bt_submit.R b/lte/bt_submit.R new file mode 100644 index 0000000..7865061 --- /dev/null +++ b/lte/bt_submit.R @@ -0,0 +1,7 @@ +submitJobs(getJobPars()[algorithm != "reinbo_table", job.id, with = T]) +ids_sk = getJobPars()[algorithm == "auto_sklearn", job.id, with = T] # dont' run testJob for autosklearn since you can not easily kill it +submitJobs(ids_sk) +ids = getJobPars()[algorithm == "reinbo_table", job.id, with = T] +submitJobs(ids) +getJobPars()[(algorithm == "auto_sklearn") & (problem == "diabetes"), job.id, with = T] +getJobPars()[problem == "diabetes", job.id, with = T] diff --git a/lte/debug_auto_sklearn_test.R b/lte/debug_auto_sklearn_test.R new file mode 100644 index 0000000..a0c7585 --- /dev/null +++ b/lte/debug_auto_sklearn_test.R @@ -0,0 +1,70 @@ +rm(list = ls()) +library(mlr) +library(mlrCPO) +library(reticulate) +library(BBmisc) +library(OpenML) +use_condaenv("w_env") +source("auto-sklearn_fun.R") + +g_classifiers = list("random_forest", "k_nearest_neighbors", "libsvm_svc", "xgradient_boosting", "multinomial_nb") +g_preprocess = list("pca", "no_preprocessing", "select_percentile_classification") + + + +task = convertOMLTaskToMlr(getOMLTask(3))$mlr.task %>>% cpoDummyEncode(reference.cat = FALSE) +#head(getTaskData(task, target.extra = TRUE)$target) +set.seed(1) +outer_loop_CV5 = makeResampleInstance("CV", iters = 5, task = task) + +train_set = outer_loop_CV5$train.inds[[1]] +test_set = outer_loop_CV5$test.inds[[1]] +train_data = getTaskData(task, train_set, target.extra = TRUE) +test_data = getTaskData(task, test_set, target.extra = TRUE) + +set.seed(1) +automl = makeLearner("classif.autosklearn", + time_left_for_this_task = 1000000L, + per_run_time_limit = 25L, + ensemble_size = 0, + #include_preprocessors = g_preprocess, + #include_estimators = g_classifiers, + initial_configurations_via_metalearning = 0L, + resampling_strategy = "cv", + resampling_strategy_arguments = list(folds = 5L), + smac_scenario_args = list(runcount_limit = 5L) +) + +a = Sys.time() +mod = train(automl, task, subset = train_set) +prediction = predict(mod, task, subset = test_set) +pred = performance(prediction) +print(Sys.time() - a) + + + + + +# autosklearn = import("autosklearn") +# sklearn = import("sklearn") +# automl = autosklearn$classification$AutoSklearnClassifier( +# time_left_for_this_task = 1000L, +# per_run_time_limit = 200L, +# ensemble_size = 0, +# # include_preprocessors = g_preprocess, +# # include_estimators = g_classifiers, +# initial_configurations_via_metalearning = 0L, 
+# resampling_strategy = "cv", +# resampling_strategy_arguments = list(folds = 5L), +# smac_scenario_args = dict(runcount_limit = 5L)) + + +# automl$fit(train_data$data, train_data$target, metric=autosklearn$metrics$accuracy) +# automl$fit_ensemble(train_data$target, ensemble_size = 1) +# automl$refit(train_data$data, train_data$target) +# predictions = automl$predict(test_data$data) +# pred = sklearn$metrics$accuracy_score(test_data$target, predictions) + + + + diff --git a/lte/func_smac.R b/lte/func_smac.R new file mode 100644 index 0000000..17fcbf7 --- /dev/null +++ b/lte/func_smac.R @@ -0,0 +1,117 @@ +run = function(cs, budget = 1000) { + hh = reticulate::import("python_smac_space") + #scenario = Scenario({"run_obj": "quality", # we optimize quality (alternatively runtime) + # "runcount-limit": budget, # maximum function evaluations + # "cs": cs, # configuration space + # "deterministic": "true" + # }) + budget = 100 + scenario = hh$Scenario(list("run_obj" = "quality", # we optimize quality (alternatively runtime) + "runcount-limit" = budget, # maximum function evaluations + "cs" = cs, # configuration space + "deterministic" = "true", + "shared_model" = TRUE # deletable + )) + + # scenario$abort_on_first_run_crash = F + + print("Optimizing! Depending on your machine, this might take a few minutes.") + np = reticulate::import("numpy") + #fd = hh$ExecuteTAFuncDict(toy_smac_obj) + #smac = hh$SMAC(scenario = scenario, rng = np$random$RandomState(as.integer(4)), tae_runner = toy_smac_obj) + #smac = hh$SMAC(scenario = scenario, rng = np$random$RandomState(as.integer(4)), tae_runner = fd) + reticulate::source_python('smac_obj.py') + source("smac_obj.R") + #smac = hh$SMAC(scenario = scenario, rng = np$random$RandomState(as.integer(4)), tae_runner = smac_obj_from_cfg) + py_fun = reticulate::r_to_py(toy_smac_obj, convert = FALSE) + #py_fun = reticulate::r_to_py(function(x) 1, convert = TRUE) + smac = hh$SMAC(scenario = scenario, rng = np$random$RandomState(as.integer(4)), tae_runner = py_fun) + smac$get_tae_runner() + incumbent = smac$optimize() # problem + #inc_value = svm_from_cfg(incumbent) + incumbent + #print("Optimized Value: %.2f" % (inc_value)) +} + + +test_run = function() { + cfg = reticulate::import("python_smac_space") + cs = cfg$cs + run(cs) +} + +# Predict function: evaluate best model on test dataset +lock_eval.smac = function(task, measure, train_set, test_set, best_model){ + cfg = best_model + lrn = gen_mlrCPOPipe_from_smac_cfg(cfg) + mod = train(lrn, task, subset = train_set) + pred = predict(mod, task, subset = test_set) + mpred = performance(pred, measures = measure) + return(mpred) +} + + +gen_mlrCPOPipe_from_smac_cfg = function(cfg) { + #cfg = cfg.sample_configuration() + # convert ConfigSpace.configuration_space.ConfigurationSpace to ConfigSpace.configuration_space.Configuration + # For deactivated parameters, the configuration stores None-values. so we remove them. 
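+  # naming convention assumed by extract_hyper_prefix() below: "lrn_<learner>_" prefixes
+  # mark learner hyperparameters, "fe_<filter>_" prefixes mark feature-filter parameters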
+ #cfg = list(Model = "xgboost", Preprocess = "cpoScale(center = FALSE)", FeatureFilter = "cpoPca(center = FALSE, rank = rank_val)", lrn_xgboost_max_depth = 3, lrn_xgboost_eta = 0.03, fe_pca_rank = 0.5) # for testing and debug + model = cfg$Model + preprocess = cfg$Preprocess + pfilter = cfg$FeatureFilter + perc_val = NULL + rank_val = NULL + + ## + extract_hyper_prefix = function(prefix = "lrn", cfg) { + names4lrn_hyp = grep(pattern = prefix, x = names(cfg), value = T) + ps.learner = cfg[names4lrn_hyp] # evaluted later by R function eval + pattern = paste0("(", prefix, "_[:alpha:]+_)*") + #ns4hyper = gsub(pattern = pattern, x = names4lrn_hyp, replacement="", ignore.case = T) + ns4hyper = stringr::str_replace(string = names4lrn_hyp, pattern = pattern, replacement="") + names(ps.learner) = ns4hyper + ps.learner + } + ## + ps.learner = extract_hyper_prefix("lrn", cfg) # hyper-parameters for learner must exist + + names4Fe = grep(pattern = "fe", x = names(cfg), value = T) + + p = mlr::getTaskNFeats(subTask) # this subTask relies on global variable + + if(length(names4Fe) > 0) { + ps.Fe = extract_hyper_prefix("fe", cfg) + if(grepl(pattern = "perc", x = names(ps.Fe))) { + name4featureEng_perc = grep(pattern = "perc", x = names(ps.Fe), value = T) + perc_val = ps.Fe[[name4featureEng_perc]] + } + if(grepl(pattern = "rank", x = names(ps.Fe))) { + name4featureEng_rank = grep(pattern = "rank", x = names(ps.Fe), value = T) + rank_val = ceiling(ps.Fe[[name4featureEng_rank]] * p) + } + } + + lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('classif.%s', par.vals = ps.learner)", + preprocess, pfilter, model) + lrn = gsub(pattern = "NA %>>%", x = lrn, replacement = "", fixed = TRUE) + + + # set mtry after reducing the number of dimensions + if (model == "ranger") { + p1 = p + if (!is.null(perc_val)) {p1 = max(1, round(p*perc_val))} + if (!is.null(rank_val)) {p1 = rank_val} + ps.learner$mtry = max(1, as.integer(p1*ps.learner$mtry)) + } + lrn = paste0("library(mlrCPO);library(magrittr);", lrn) + obj_lrn = eval(parse(text = lrn)) + return(obj_lrn) +} + +test_gen_mlrCPOPipe_from_smac_cfg = function() { + subTask = mlr::iris.task + cfg = reticulate::import("python_smac_space") + cfg = cfg$stub + lrn = gen_mlrCPOPipe_from_smac_cfg(cfg) + lrn +} diff --git a/lte/header.R b/lte/header.R new file mode 100644 index 0000000..7968bdc --- /dev/null +++ b/lte/header.R @@ -0,0 +1,24 @@ +options(mlr.show.info = FALSE) +library(batchtools) +tosources = c("bt_conf.R", "utility.R") + +depend_reinbo_table = c("algo_reinbo_table.R", "reinbo_table_func.R", "reinbo_table_env.R", "reinbo_table_utils.R", "reinbo_table_hyperpara_space.R") +tosources = c(tosources, depend_reinbo_table) + +depend_auto_sklearn = c("algo_auto_sklearn.R", "auto_sklearn_func.R") +tosources = c(tosources, depend_auto_sklearn) + +depend_tpot = c("algo_tpot.R", "tpot_func.R") +tosources = c(tosources, depend_tpot) + +depend_random_search = c("algo_random_search.R", "random_search_func.R", "random_search_space.R") +tosources = c(tosources, depend_random_search) + +depend_tpe = c("algo_tpe.R", "tpe_func.R") +tosources = c(tosources, depend_tpe) + +depend_irace = c("algo_irace.R", "irace_func.R") +tosources = c(tosources, depend_irace) + + +pkgs = c("reticulate", "mlr", "mlrCPO", "OpenML", "parallelMap", "phng", "rlR", "hash", "mlrMBO", "R6", "foreach", "rlist", "magrittr", "irace") diff --git a/lte/install_depend.R b/lte/install_depend.R new file mode 100644 index 0000000..3babaa5 --- /dev/null +++ b/lte/install_depend.R @@ -0,0 +1,6 @@ 
+devtools::install_github("smilesun/parabox", ref = "tree") +library(mlr) +library(OpenML) +library(rlR) +library(hash) + diff --git a/lte/irace_func.R b/lte/irace_func.R new file mode 100644 index 0000000..256f69e --- /dev/null +++ b/lte/irace_func.R @@ -0,0 +1,74 @@ +# Irace algorithm: +opt.irace = function(task, budget, measure, train_set = NULL) { + measure <<- measure + subTask <<- task + if (!is.null(train_set)) subTask <<- subsetTask(task, train_set) + irace::irace( + scenario = list( + targetRunner = target.runner, + instances = lapply(1:(getGconf()$NCVInnerIter*budget), function(x) + makeResampleInstance(makeResampleDesc("Holdout", split = 1 - 1/getGconf()$NCVInnerIter, stratify = TRUE), subTask)), + maxExperiments = getGconf()$NCVInnerIter*budget + ), + parameters = readParameters("irace_space.txt", digits = 5, debugLevel = 0, text) + ) + load("./irace.Rdata") + mmodel = getFinalElites(iraceResults = iraceResults, n = 1) + return(mmodel) +} + + +# Target runner of Irace: +target.runner = function(experiment, config = list()) { + rin = experiment$instance ## holdout instance + lrn = genLearner.irace(subTask, experiment$configuration) + res = mlr::resample(lrn, subTask, resampling = rin, measures = measure, show.info = FALSE) + return(list(cost = res$aggr)) +} + + +# Predict function: evaluate best model on test dataset +lock_eval.irace = function(task, measure, train_set, test_set, best_model){ + ps = best_model + ps$.ID. = NULL + ps$.PARENT. = NULL + lrn = genLearner.irace(task, ps) + mod = train(lrn, task, subset = train_set) + pred = predict(mod, task, subset = test_set) + perf = performance(pred, measures = measure) + return(perf) +} + +# Generate mlr learner for configuration: +genLearner.irace = function(task, configuration){ + ps = configuration ## hypar-parameters + ps$sigma = 2^(ps$sigma) + ps$C = 2^(ps$C) + lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('classif.%s', par.vals = ps.learner)", + paste0(ps$Preprocess, "()"), paste0(ps$Filter, "()"), ps$Classify) + lrn = gsub(pattern = "NA() %>>%", x = lrn, replacement = "", fixed = TRUE) + # Preprocess: + lrn = gsub(pattern = ".scale()", x = lrn, replacement = "(scale = FALSE)", fixed = TRUE) + lrn = gsub(pattern = ".center()", x = lrn, replacement = "(center = FALSE)", fixed = TRUE) + # Filter: + lrn = gsub(pattern = ".perc()", x = lrn, replacement = "(perc = ps$perc)", fixed = TRUE) + p = getTaskNFeats(task) + lrn = gsub(pattern = ".rank()", x = lrn, replacement = "(center = FALSE, rank = as.integer(max(1, round(p*ps$rank))))", fixed = TRUE) + ## delete parameters irrelevant to classifier + ps.learner = as.list(ps) + ps.learner$Preprocess = NULL + ps.learner$Filter = NULL + ps.learner$Classify = NULL + ps.learner$perc = NULL + ps.learner$rank = NULL + ps.learner[is.na(ps.learner)] = NULL + if (ps$Classify == "ranger") { + p1 = p + if (!is.na(ps$perc)) {p1 = max(1, round(p*ps$perc))} + if (!is.na(ps$rank)) {p1 = max(1, round(p*ps$rank))} + ps.learner$mtry = max(1, as.integer(p1*ps$mtry)) + } + lrn = eval(parse(text = lrn)) + return(lrn) +} + diff --git a/lte/irace_space.txt b/lte/irace_space.txt new file mode 100644 index 0000000..44c8d2a --- /dev/null +++ b/lte/irace_space.txt @@ -0,0 +1,45 @@ +## Template for parameter description file for Iterated Race. +## +## The format is one parameter per line. Each line contains: +## +## 1: Name of the parameter. An unquoted alphanumeric string, +## example: ants + +## 2: Switch to pass the parameter. 
A quoted (possibly empty) string, +## if the value and the switch must be separated, add a space at +## the end of the string. Example : "--version1 --ants " + +## 3: Type. An unquoted single letter, among +## i: Integer, c: categorical, o: ordinal, r: real. + +## 4: For c and o: All possible values, that is, a variable number of +## quoted or unquoted strings separated by commas within +## parenthesis. Empty strings and strings containing commas or +## spaces must be quoted. +## For i,r: a pair of unquoted numbers representing minimum and +## maximum values. + +## 5: A conditional parameter can be defined according to the values of +## one or several other parameters. This is done by adding a +## character '|' followed by an R expression involving the names of +## other parameters. This expression must return TRUE if the +## condition is satisfied, FALSE otherwise. + +# 1: 2: 3: 4: 5: +Preprocess "--Preprocess" c ("cpoScale", "cpoScale.scale", "cpoScale.center", "cpoSpatialSign", "NA") +Filter "--Filter" c ("cpoFilterAnova.perc", "cpoFilterKruskal.perc", "cpoFilterUnivariate.perc", "cpoPca.rank", "NA") +Classify "--Classify" c ("kknn", "ksvm", "xgboost", "ranger", "naiveBayes") +perc "--perc" r (0.1,1) | Filter %in% c("cpoFilterAnova.perc", "cpoFilterKruskal.perc", "cpoFilterUnivariate.perc") +rank "--rank" r (0.1,1) | Filter == "cpoPca.rank" +k "--k" i (1,20) | Classify == "kknn" +C "--C" r (-15,15) | Classify == "ksvm" +sigma "--sigma" r (-15,15) | Classify == "ksvm" +mtry "--mtry" r (0.1,0.66666) | Classify == "ranger" +sample.fraction "--sample.fraction" r (0.1,1) | Classify == "ranger" +eta "--eta" r (0.001,0.3) | Classify == "xgboost" +max_depth "--max_depth" i (1,15) | Classify == "xgboost" +subsample "--subsample" r (0.5,1) | Classify == "xgboost" +colsample_bytree "--colsample_bytree" r (0.5,1) | Classify == "xgboost" +min_child_weight "--min_child_weight" r (0,50) | Classify == "xgboost" +laplace "--laplace" r (0.01,100) | Classify == "naiveBayes" + diff --git a/lte/lrz.batchtools.conf.R b/lte/lrz.batchtools.conf.R new file mode 100644 index 0000000..966f2fe --- /dev/null +++ b/lte/lrz.batchtools.conf.R @@ -0,0 +1,4 @@ +source("/home/hpc/pr74ze/ri89coc2/lrz_configs/config_files/batchtools/clusterFunctionsSlurmLrz.R") +cluster.functions = makeClusterFunctionsSlurmLrz("/home/hpc/pr74ze/ri89coc2/lrz_configs/config_files/batchtools/slurm_lmulrz.tmpl", array.jobs = FALSE) +default.resources = list(walltime = 3600L * 12, memory = 1024 * 3L, ntasks = 1L, ncpus = 1L, nodes = 1L, clusters = "serial") +max.concurrent.jobs = 999L diff --git a/lte/main.R b/lte/main.R new file mode 100644 index 0000000..b044850 --- /dev/null +++ b/lte/main.R @@ -0,0 +1,59 @@ +rm(list = ls()) +source("system.R") +source("header.R") +Reg_name = "reg" +datestr = stringi::stri_replace_all(Sys.Date(), regex = "-", replacement="_") ## use date as registry name +strhour = stringi::stri_replace_all(format(Sys.time(), "%H-%M"), regex = "-", replacement = "_") +#unlink("reg_table_1", recursive = TRUE) dangerous !! +reg_dir = paste0(Reg_name, datestr, "__", strhour) +reg = makeExperimentRegistry(file.dir = reg_dir, conf.file = mconf.file, + packages = pkgs, + source = tosources) +if (flag_local) reg$cluster.functions = makeClusterFunctionsMulticore(ncpus = 60L) # run on my own workstation +source("problem.R") +# opt = function(task, budget, measure, train_set, ...) { +# UseMethod("opt", task, budget, measure, train_set) +# } +# +# lock_eval = function(task, measure, train_set, test_set, best_model, ...) 
{ +# UseMethod("lock_eval", task, measure, train_set, test_set, best_model) +# } +# + +algo.designs = list() + +algoname = "reinbo_table" +addAlgorithm(name = algoname, fun = algo_fun_reinbo_table) +algo.designs[[algoname]] = data.frame() + +algoname = "auto_sklearn" +addAlgorithm(name = algoname, fun = algo_fun_auto_sklearn) +algo.designs[[algoname]] = data.frame(flag_light = T) + +algoname = "tpot" +addAlgorithm(name = algoname, fun = algo_fun_tpot) +algo.designs[[algoname]] = data.frame() + + +algoname = "random_search" +addAlgorithm(name = algoname, fun = algo_fun_random_search) +algo.designs[[algoname]] = data.frame() + +algoname = "tpe" +addAlgorithm(name = algoname, fun = algo_fun_tpe) +algo.designs[[algoname]] = data.frame() + +algoname = "irace" +addAlgorithm(name = algoname, fun = algo_fun_irace) +algo.designs[[algoname]] = data.frame() + + +source("algo_reinbo_table.R") +source("algo_auto_sklearn.R") +source("algo_tpot.R") +source("algo_random_search.R") +source("algo_tpe.R") +source("algo_irace.R") + +addExperiments(algo.designs = algo.designs, repls = getGconf()$repl) +summarizeExperiments() diff --git a/lte/mlrmbo_func.R b/lte/mlrmbo_func.R new file mode 100644 index 0000000..824268d --- /dev/null +++ b/lte/mlrmbo_func.R @@ -0,0 +1,102 @@ +# mlrMBO algorithm: +opt.mlrmbo = function(task, budget, measure, train_set = NULL) { + subTask = task + if (!is.null(train_set)) subTask = subsetTask(task, train_set) + inner_loop = makeResampleInstance("CV", iters = getGconf()$NCVInnerIter, stratify = TRUE, subTask) + run = mlrMBO_func(subTask, instance = inner_loop, measure, budget) + mmodel = run$x + return(mmodel) +} + + +# Predict function: evaluate best model on test dataset +lock_eval.mlrmbo = function(task, measure, train_set, test_set, best_model){ + best_model$sigma = 2^(best_model$sigma) + best_model$C = 2^(best_model$C) + lrn = genLearner.mbo(task, best_model) + mod = train(lrn, task, subset = train_set) + pred = predict(mod, task, subset = test_set) + perf = performance(pred, measures = measure) + return(perf) +} + + +# hyper-parameter space +par.set = makeParamSet( + makeDiscreteParam('Pre', values = c("cpoScale()", "cpoScale(scale = FALSE)", "cpoScale(center = FALSE)", "cpoSpatialSign()", "no_operator")), + makeDiscreteParam('Filter', values = c("cpoFilterAnova(perc)", "cpoFilterKruskal(perc)", "cpoPca(center = FALSE, rank)", "cpoFilterUnivariate(perc)", "no_operator")), + makeNumericParam('perc', lower = .1, upper = 1, requires = quote(Filter %in% c("cpoFilterAnova(perc)", "cpoFilterKruskal(perc)", "cpoFilterUnivariate(perc)"))), + makeNumericParam('rank', lower = .1, upper = 1, requires = quote(Filter == "cpoPca(center = FALSE, rank)")), + makeDiscreteParam('Learner', values = c("kknn", "ksvm", "xgboost", "ranger", "naiveBayes")), + makeIntegerParam('k', lower = 1L, upper = 20L, requires = quote(Learner == "kknn")), + makeNumericParam("C", lower = -15, upper = 15, trafo = function(x) 2^x, requires = quote(Learner == 'ksvm')), + makeNumericParam("sigma", lower = -15, upper = 15, trafo = function(x) 2^x, requires = quote(Learner == 'ksvm')), + makeNumericParam("mtry", lower = 1/10, upper = 1/1.5, requires = quote(Learner == 'ranger')), + makeNumericParam("sample.fraction", lower = .1, upper = 1, requires = quote(Learner == 'ranger')), + makeNumericParam("eta", lower = .001, upper = .3, requires = quote(Learner == 'xgboost')), + makeIntegerParam("max_depth", lower = 1L, upper = 15L, requires = quote(Learner == 'xgboost')), + makeNumericParam("subsample", lower = .5, upper = 
1, requires = quote(Learner == 'xgboost')), + makeNumericParam("colsample_bytree", lower = .5, upper = 1, requires = quote(Learner == 'xgboost')), + makeNumericParam("min_child_weight", lower = 0, upper = 50, requires = quote(Learner == 'xgboost')), + makeNumericParam("laplace", lower = .01, upper = 100, requires = quote(Learner == 'naiveBayes')) +) + + +# generate learner for task and specific parameter set +genLearner.mbo <- function(task, param_set){ + p = getTaskNFeats(task) + lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('classif.%s', par.vals = ps.learner)", + param_set$Pre, param_set$Filter, param_set$Learner) + lrn = gsub(pattern = "perc", x = lrn, replacement = "perc = param_set$perc", fixed = TRUE) + lrn = gsub(pattern = "rank", x = lrn, replacement = "rank = as.integer(max(1, round(p*param_set$rank)))", fixed = TRUE) + lrn = gsub(pattern = "no_operator %>>%", x = lrn, replacement = "", fixed = TRUE) + ps.learner = param_set + ps.learner$perc = NULL + ps.learner$rank = NULL + ps.learner$Pre = NULL + ps.learner$Filter = NULL + ps.learner$Learner = NULL + ps.learner[is.na(ps.learner)] = NULL + if (param_set$Learner == "ranger") { + p1 = p + if (!is.na(param_set$perc)) {p1 = max(1, round(p*param_set$perc))} + if (!is.na(param_set$rank)) {p1 = max(1, round(p*param_set$rank))} + ps.learner$mtry = max(1, as.integer(p1*param_set$mtry)) + } + lrn = eval(parse(text = lrn)) + return(lrn) +} + + +# using mlrMBO to optimize pipeline +mlrMBO_func <- function(task, instance, measure, budget){ + objfun = makeSingleObjectiveFunction( + fn = function(x) { + lrn = genLearner.mbo(task, x) + perf = resample(lrn, task, resampling = instance, measures = measure, show.info = FALSE)$aggr + return(perf) + }, + par.set = par.set, + has.simple.signature = FALSE, + minimize = TRUE + ) + ctrl = setMBOControlTermination(makeMBOControl(), iters = budget-4*length(par.set$pars)) + run = mbo(objfun, control = ctrl, show.info = FALSE) + return(run) +} + + + + +# Test: +# measure = list(mmce) +# task = sonar.task +# inner_loop = makeResampleInstance("CV", iters = 3, stratify = TRUE, task) +# outer_loop_rins = makeResampleInstance("CV", iters = 5, stratify = TRUE, task) +# opt_set = outer_loop_rins$train.inds[[1]] +# lock_set = outer_loop_rins$test.inds[[1]] +# mmodel = opt.mlrmbo(task, 66, measure) +# perf = lock_eval.mlrmbo(task, measure, opt_set, lock_set, mmodel) + + + diff --git a/lte/obsolete.batchtools.conf.R b/lte/obsolete.batchtools.conf.R new file mode 100644 index 0000000..8abc74a --- /dev/null +++ b/lte/obsolete.batchtools.conf.R @@ -0,0 +1,5 @@ +sys = Sys.info() +if (as.list(sys)$user != "sunxd") { + source("lrz.batchtools.conf.R") +} + diff --git a/lte/plot2test.R b/lte/plot2test.R new file mode 100644 index 0000000..06bfe3e --- /dev/null +++ b/lte/plot2test.R @@ -0,0 +1,121 @@ +# Plot and test + +library(data.table) +library(ggplot2) +library(tidyr) +library(reshape2) +library(xtable) +library(knitr) + + +## preparation +dt_new = readRDS("reinbo_new_cut_episode.rds") +dt_new$algorithm = "reinbo" + +#dt_reinbo_old = read.csv("../Experiment_results/ML-ReinBo.csv") +#dt_reinbo_old$algo = "reinbo2" +dt_reinbo_old = NULL +temp = read.csv("../Experiment_results/Auto-sklearn.csv") +temp$algorithm = "Autosklearn" +dt_reinbo_old = rbind(dt_reinbo_old, temp) +# +#temp = read.csv("../Experiment_results/Auto-sklearn_light.csv") +#temp$algo = "Autosklearn-light" +#dt_reinbo_old = rbind(dt_reinbo_old, temp) + +temp = read.csv("../Experiment_results/TPE.csv") +temp$algorithm = "TPE" +dt_reinbo_old = 
rbind(dt_reinbo_old, temp)
+
+#
+temp = read.csv("../Experiment_results/TPOT.csv")
+temp$algorithm = "Tpot"
+dt_reinbo_old = rbind(dt_reinbo_old, temp)
+#
+temp = read.csv("../Experiment_results/TPOT_light.csv")
+temp$algorithm = "Tpot-light"
+dt_reinbo_old = rbind(dt_reinbo_old, temp)
+#
+temp = read.csv("../Experiment_results/Random_search.csv")
+temp$algorithm = "RandomSearch"
+dt_reinbo_old = rbind(dt_reinbo_old, temp)
+
+
+
+dt_reinbo_old$prob.name = dt_reinbo_old$name
+dt_res = rbind(dt_reinbo_old[, c("prob.name", "mmce", "algorithm")], dt_new[, c("prob.name", "mmce", "algorithm")])
+
+# check that all jobs finished
+dt_res[, .N, by = "prob.name"]
+
+## table
+dt_light = dt_res[, .(mmce = mean(mmce)), by = .(prob.name, algorithm)]
+dt_light
+
+dt_table = spread(dt_light, key = "algorithm", value = "mmce")
+dt_table
+cns = colnames(dt_table)
+cns[1] = "dataset name"
+colnames(dt_table) = cns
+xtable(dt_table)
+knitr::kable(dt_table)
+ltxtable = xtable(dt_table, align = rep("l", ncol(dt_table) + 1), digits = 4)
+print(ltxtable, floating = TRUE, hline.after = NULL, include.rownames = TRUE, include.colnames = TRUE) # tested
+
+# example of adding significance asterisks
+# pval <- rev(sort(c(outer(1:6, 10^-(1:3)))))
+# symp <- symnum(pval, corr = FALSE,
+#                cutpoints = c(0, .001,.01,.05, .1, 1),
+#                symbols = c("***","**","*","."," "))
+# noquote(cbind(P.val = format(pval), Signif = symp))
+
+## plot
+size = 10 # font size
+gp = ggplot() +
+  geom_boxplot(data = as.data.frame(dt_res), aes(x = algorithm, y = mmce, fill = algorithm)) +
+  theme_bw() +
+  theme(axis.text.x = element_text(angle = 90, size = size), axis.text.y = element_text(size = size), axis.title = element_text(size = size), strip.text = element_text(size = size), legend.text = element_text(size = size), legend.position = "bottom") +
+  facet_wrap("prob.name", scale = "free_y", ncol = 3)
+#ggsave(gp, file = "prob_algo_repli_compare.pdf", width=3, height=3, units="in", scale=5, device = pdf)
+ggsave(gp, file = "prob_algo_repli_compare.pdf", device = "pdf", scale = 0.9)
+
+## test
+fun_best_against_other = function(temp, candidate = NULL) {
+  light = temp[, .(mmce = mean(mmce)), by = "algorithm"] # take the mean over replications
+  ind = light[, .(which.min(mmce))]
+  prob_name = unique(temp$prob.name)
+  ref = light$algorithm[as.vector(as.matrix(ind))]
+  #cat(sprintf(" \nprob *** %s*** of best algorithm name ***%s***\n", prob_name, ref))
+  if(!is.null(candidate)) {
+    checkmate::assert_character(candidate)
+    ref = candidate
+  }
+  moptions = unique(temp$algorithm)
+  #moptions = setdiff(moptions, ref)
+  x = temp[algorithm == ref]$mmce
+  res = lapply(moptions, function(name) {
+    y = temp[algorithm == name]$mmce
+    if (length(x) != length(y)) return(100) # sentinel: unequal replication counts
+    worse_than_best = (wilcox.test(y, x, alternative = "greater", exact = FALSE)$p.value < 0.05)
+    better_than_best = (wilcox.test(x, y, alternative = "greater", exact = FALSE)$p.value < 0.05)
+    val = temp[algorithm == name, mean(mmce)]
+    strval = as.character(sprintf("%.4f", val))
+    if (is.null(candidate)) {
+      if (worse_than_best) return(strval) # plain: significantly worse than the best
+      if ((!better_than_best) & (name == ref)) return(paste0("\\underline{\\textbf{", strval, "}}")) # underlined bold: the best algorithm itself
+      else return(paste0("\\textbf{", strval, "}")) # bold: not significantly worse than the best
+    }
+    # candidate is not null: return win/lose coding instead of LaTeX strings
+    if (worse_than_best) return(-1) # -1: significantly worse
+    if (!better_than_best) return(0) # 0: tie
+    return(1) # 1: significantly better
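+    # (one-sided Mann-Whitney tests at alpha = 0.05; the length check above guards
+    # against incomplete replications)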
+ }) + names(res) = moptions + res$best_algo = ref + as.data.table(res) +} + +dt_winner = dt_res[, fun_best_against_other(.SD), by = .(prob.name), .SDcols = colnames(dt_res)] +res = knitr::kable(dt_winner, format = "latex", digits = 4, escape = F) +capture.output(print(res), file = "latex.txt") + +algos = as.character(unique(dt_res$algorithm)) +name = "reinbo" # suppose if reinbo is the best +dt_res[, fun_best_against_other(.SD, name), by = .(prob.name), .SDcols = colnames(dt_res)] diff --git a/lte/post_analysis.R b/lte/post_analysis.R new file mode 100644 index 0000000..2fde64d --- /dev/null +++ b/lte/post_analysis.R @@ -0,0 +1,7 @@ +result_list = reduceResultsList(ids = findDone(), fun = function(job, res) { + model = res$mmodel$Model + operators = strsplit(as.character(model), "\t")[[1]] + data.frame(pre = operators[1], filter = operators[2], classifier = operators[3]) +}) +result = rbindlist(result_list) +Freq = c(as.list(table(result$pre)), as.list(table(result$filter)), as.list(table(result$classifier))) diff --git a/lte/problem.R b/lte/problem.R new file mode 100644 index 0000000..6c1beb1 --- /dev/null +++ b/lte/problem.R @@ -0,0 +1,12 @@ +tasks = lapply(task_ids, getOMLTask) +tasks = lapply(tasks, convertOMLTaskToMlr) + +prob_fun = function(data, job) { + mlr_task_full = data %>>% cpoDummyEncode(reference.cat = FALSE) + outer_iters = getGconf()$NCVOuterIter + outer_loop_rins = makeResampleInstance("CV", iters = outer_iters, stratify = TRUE, mlr_task_full) + list(rins = outer_loop_rins, mlr_task_full = mlr_task_full) +} + +for (task in tasks) + addProblem(name = getTaskId(task$mlr.task), data = task$mlr.task, fun = prob_fun, seed = getGconf()$prob_seed) diff --git a/lte/python_smac_space.py b/lte/python_smac_space.py new file mode 100644 index 0000000..a92522f --- /dev/null +++ b/lte/python_smac_space.py @@ -0,0 +1,73 @@ +# Import ConfigSpace and different types of parameters +from smac.configspace import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter, \ + UniformFloatHyperparameter, UniformIntegerHyperparameter +from ConfigSpace.conditions import InCondition + +# Import SMAC-utilities +from smac.tae.execute_func import ExecuteTAFuncDict +from smac.scenario.scenario import Scenario +from smac.facade.smac_facade import SMAC + +cs = ConfigurationSpace() + +# We define a few possible types of SVM-kernels and add them as "kernel" to our cs +step1 = CategoricalHyperparameter("Preprocess", ['cpoScale()', 'cpoScale(scale = FALSE)', 'cpoScale(center = FALSE)', 'cpoSpatialSign()', 'NA'], default_value="NA") +cs.add_hyperparameter(step1) + +step2 = CategoricalHyperparameter("FeatureFilter", ['cpoFilterAnova(perc=perc_val)', 'cpoFilterKruskal(perc=perc_val)', 'cpoFilterUnivariate(perc=perc_val)', 'cpoPca(center = FALSE, rank = rank_val)', 'NA'], default_value = "NA") +cs.add_hyperparameter(step2) +anova_perc = UniformFloatHyperparameter("fe_anova_perc", 0.1, 1, default_value = 0.1) +kruskal_perc = UniformFloatHyperparameter("fe_kruskal_perc", 0.1, 1, default_value = 0.1) +univar_perc = UniformFloatHyperparameter("fe_univar_perc", 0.1, 1, default_value = 0.1) +pca_perc = UniformFloatHyperparameter("fe_pca_rank", 0, 0.9, default_value = 0.1) +cs.add_hyperparameters([anova_perc, kruskal_perc, univar_perc, pca_perc]) + +step2_child_anova = InCondition(child=anova_perc, parent=step2, values=["cpoFilterAnova(perc=perc_val)"]) +step2_child_kruskal = InCondition(child=kruskal_perc, parent=step2, values=["cpoFilterKruskal(perc=perc_val)"]) +step2_child_univar = 
+step2_child_anova = InCondition(child=anova_perc, parent=step2, values=["cpoFilterAnova(perc=perc_val)"])
+step2_child_kruskal = InCondition(child=kruskal_perc, parent=step2, values=["cpoFilterKruskal(perc=perc_val)"])
+step2_child_univar = InCondition(child=univar_perc, parent=step2, values=["cpoFilterUnivariate(perc=perc_val)"])
+step2_child_pca = InCondition(child=pca_rank, parent=step2, values=["cpoPca(center = FALSE, rank = rank_val)"])
+cs.add_conditions([step2_child_anova, step2_child_kruskal, step2_child_univar, step2_child_pca])
+
+# Step 3: classifiers with their conditional hyperparameters
+step3 = CategoricalHyperparameter("Model", ['kknn', 'ksvm', 'ranger', 'xgboost', 'naiveBayes'])
+cs.add_hyperparameter(step3)
+
+hyper_kknn = UniformIntegerHyperparameter("lrn_kknn_k", 1, 19, default_value=1)
+hyper_ksvm_C = UniformFloatHyperparameter("lrn_svm_C", 2**(-15), 2**(15), default_value=1)
+hyper_ksvm_sigma = UniformFloatHyperparameter("lrn_svm_sigma", 2**(-15), 2**(15), default_value=1)
+hyper_ranger_mtry = UniformFloatHyperparameter("lrn_ranger_mtry", 0.1, 0.66666, default_value=0.1)
+hyper_ranger_sample_fraction = UniformFloatHyperparameter("lrn_ranger_sample.fraction", 0.1, 1, default_value=0.1)
+hyper_xgboost_eta = UniformFloatHyperparameter('lrn_xgboost_eta', 0.001, 0.3, default_value=0.1)
+hyper_xgboost_max_depth = UniformIntegerHyperparameter('lrn_xgboost_max_depth', 1, 14, default_value=5)
+hyper_xgboost_subsample = UniformFloatHyperparameter('lrn_xgboost_subsample', 0.5, 1, default_value=0.5)
+hyper_xgboost_colsample_bytree = UniformFloatHyperparameter('lrn_xgboost_colsample_bytree', 0.5, 1, default_value=0.5)
+hyper_xgboost_min_child_weight = UniformFloatHyperparameter('lrn_xgboost_min_child_weight', 0, 50, default_value=0.5)
+hyper_naiveBayes = UniformFloatHyperparameter('lrn_naiveBayes_laplace', 0.01, 100, default_value=0.01)
+
+cs.add_hyperparameters([hyper_kknn, hyper_ksvm_C, hyper_ksvm_sigma, hyper_ranger_mtry, hyper_ranger_sample_fraction, hyper_xgboost_eta, hyper_xgboost_max_depth, hyper_xgboost_subsample, hyper_xgboost_colsample_bytree, hyper_xgboost_min_child_weight, hyper_naiveBayes])
+
+step3_child_kknn = InCondition(child=hyper_kknn, parent=step3, values=["kknn"])
+step3_child_ksvm_c = InCondition(child=hyper_ksvm_C, parent=step3, values=["ksvm"])
+step3_child_ksvm_sigma = InCondition(child=hyper_ksvm_sigma, parent=step3, values=["ksvm"])
+step3_child_ranger_mtry = InCondition(child=hyper_ranger_mtry, parent=step3, values=["ranger"])
+step3_child_ranger_sample_fraction = InCondition(child=hyper_ranger_sample_fraction, parent=step3, values=["ranger"])
+step3_child_xgboost_eta = InCondition(child=hyper_xgboost_eta, parent=step3, values=["xgboost"])
+step3_child_xgboost_max_depth = InCondition(child=hyper_xgboost_max_depth, parent=step3, values=["xgboost"])
+step3_child_xgboost_subsample = InCondition(child=hyper_xgboost_subsample, parent=step3, values=["xgboost"])
+step3_child_xgboost_colsample_bytree = InCondition(child=hyper_xgboost_colsample_bytree, parent=step3, values=["xgboost"])
+step3_child_xgboost_min_child_weight = InCondition(child=hyper_xgboost_min_child_weight, parent=step3, values=["xgboost"])
+step3_child_naiveBayes_laplace = InCondition(child=hyper_naiveBayes, parent=step3, values=["naiveBayes"])
+
+cs.add_conditions([step3_child_kknn, step3_child_ksvm_c, step3_child_ksvm_sigma, step3_child_ranger_mtry, step3_child_ranger_sample_fraction, step3_child_xgboost_eta, step3_child_xgboost_subsample, step3_child_xgboost_max_depth, step3_child_xgboost_colsample_bytree, step3_child_xgboost_min_child_weight, step3_child_naiveBayes_laplace])
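+# Sketch of how this space could be wired into SMAC (not executed here; assumes
+# smac_obj_from_cfg from smac_obj.py as the target algorithm runner):
+# scenario = Scenario({"run_obj": "quality", "runcount-limit": 100, "cs": cs, "deterministic": "true"})
+# smac = SMAC(scenario=scenario, tae_runner=smac_obj_from_cfg)
+# best_cfg = smac.optimize()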
+
+cfg = cs.sample_configuration()
+stub = {k: cfg[k] for k in cfg if cfg[k]}  # drop deactivated (None-valued) parameters
diff --git a/lte/random_search_func.R b/lte/random_search_func.R
new file mode 100644
index 0000000..6b81553
--- /dev/null
+++ b/lte/random_search_func.R
@@ -0,0 +1,54 @@
+# Random Search algorithm:
+opt.random.search = function(task, budget, measure, train_set = NULL) {
+  subTask = task
+  if (!is.null(train_set)) subTask = subsetTask(task, train_set)
+  inner_loop = makeResampleInstance("CV", iters = getGconf()$NCVInnerIter, stratify = TRUE, subTask)
+  ps <<- step1$sample(budget)  # sample budget many configurations from the parameter set tree
+  for (i in 1:budget) {
+    perf = mlr_fun(subTask, ps[i, ], measure, cv_instance = inner_loop)
+    ps[i, "perf"] = perf
+  }
+  mmodel = ps[ps$perf == min(ps$perf), ][1, ]
+  mmodel$perf = NULL
+  return(mmodel)
+}
+
+
+# Mlr function: evaluate sampled model
+mlr_fun = function(task, model, measure, cv_instance) {
+  lrn = genLearner(task, model, measure)
+  perf = resample(lrn, task, resampling = cv_instance, measures = measure, show.info = FALSE)$aggr
+  return(perf)
+}
+
+
+# Predict function: evaluate best model on test dataset
+lock_eval.random.search = function(task, measure, train_set, test_set, best_model) {
+  lrn = genLearner(task, best_model, measure)
+  mod = train(lrn, task, subset = train_set)
+  pred = predict(mod, task, subset = test_set)
+  perf = performance(pred, measures = measure)
+  return(perf)
+}
+
+# Generate mlr learner for configuration:
+genLearner = function(task, model, measure) {
+  p = getTaskNFeats(task)
+  lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('classif.%s', par.vals = ps.learner)",
+                model$Preprocess[1], model$Filter[1], model$Classify[1])
+  lrn = gsub(pattern = "perc", x = lrn, replacement = "perc = model$perc", fixed = TRUE)
+  lrn = gsub(pattern = "rank", x = lrn, replacement = "rank = as.integer(max(1, round(p*model$rank)))", fixed = TRUE)
+  lrn = gsub(pattern = "NA %>>%", x = lrn, replacement = "", fixed = TRUE)
+  ps.learner = model[, -(1:5)]  # drop the columns irrelevant to the classifier
+  ps.learner = as.list(ps.learner)
+  ps.learner[is.na(ps.learner)] = NULL
+  if (model$Classify[1] == "ranger") {
+    # mtry is sampled as a fraction; rescale by the number of features left after filtering
+    p1 = p
+    if (!is.na(model$perc)) {p1 = max(1, round(p*model$perc))}
+    if (!is.na(model$rank)) {p1 = max(1, round(p*model$rank))}
+    ps.learner$mtry = max(1, as.integer(p1*model$mtry))
+  }
+  lrn = eval(parse(text = lrn))
+  return(lrn)
+}
diff --git a/lte/random_search_space.R b/lte/random_search_space.R
new file mode 100644
index 0000000..d3631e5
--- /dev/null
+++ b/lte/random_search_space.R
@@ -0,0 +1,65 @@
+# Parameter set tree:
+step1 = ParamSetTree$new("pre",
+          ParamCategorical$new(id = "Preprocess",
+            values = c("cpoScale()", "cpoScale(scale = FALSE)", "cpoScale(center = FALSE)", "cpoSpatialSign()", "NA")))
+
+step2 = ParamSetTree$new("filter",
+          ParamCategorical$new(id = "Filter",
+            values = c("cpoFilterAnova(perc)", "cpoFilterKruskal(perc)", "cpoPca(center = FALSE, rank)", "cpoFilterUnivariate(perc)", "NA")),
+          addDep(ParamReal$new(id = "perc", lower = .1, upper = 1),
+                 did = "Filter", expr = quote(Filter %in% c("cpoFilterAnova(perc)", "cpoFilterKruskal(perc)", "cpoFilterUnivariate(perc)"))),
+          addDep(ParamReal$new(id = "rank", lower = .1, upper = 1),
+                 did = "Filter", expr = quote(Filter == "cpoPca(center = FALSE, rank)")))
+
+step3 = ParamSetTree$new("class",
+          ParamCategorical$new(id = "Classify",
+            values = c("kknn", "ksvm", "xgboost", "ranger", "naiveBayes")),
+
+          addDep(ParamInt$new(id = "k", lower = 1L, upper = 20L),
"kknn")), + + addDep(ParamReal$new(id = "C", lower = 2^(-15), upper = 2^(15)), + did = "Classify", expr = quote(Classify == "ksvm")), + + addDep(ParamReal$new(id = "sigma", lower = 2^(-15), upper = 2^(15)), + did = "Classify", expr = quote(Classify == "ksvm")), + + addDep(ParamReal$new(id = "mtry", lower = 1/10, upper = 1/1.5), + did = "Classify", expr = quote(Classify == "ranger")), + + addDep(ParamReal$new(id = "sample.fraction", lower = .1, upper = 1), + did = "Classify", expr = quote(Classify == "ranger")), + + addDep(ParamReal$new(id = "eta", lower = .001, upper = .3), + did = "Classify", expr = quote(Classify == "xgboost")), + + addDep(ParamInt$new(id = "max_depth", lower = 1L, upper = 15L), + did = "Classify", expr = quote(Classify == "xgboost")), + + addDep(ParamReal$new(id = "subsample", lower = .5, upper = 1), + did = "Classify", expr = quote(Classify == "xgboost")), + + addDep(ParamReal$new(id = "colsample_bytree", lower = .5, upper = 1), + did = "Classify", expr = quote(Classify == "xgboost")), + + addDep(ParamReal$new(id = "min_child_weight", lower = 0, upper = 50), + did = "Classify", expr = quote(Classify == "xgboost")), + + addDep(ParamReal$new(id = "laplace", lower = 0.01, upper = 100), + did = "Classify", expr = quote(Classify == "naiveBayes")) +) + +step2$setChild(step3) +step1$setChild(step2) + + + + + + + + + + + + diff --git a/lte/reinbo_table_conf.R b/lte/reinbo_table_conf.R new file mode 100644 index 0000000..2577c9a --- /dev/null +++ b/lte/reinbo_table_conf.R @@ -0,0 +1,33 @@ +source("reinbo_utils.R") +# A complete pipeline/model consists of 3 stages: +# Preprocessing --> Feature filtering --> Classification + +# Defult pipeline pool: +# Pre-processors: "Scale()", "cpoScale(scale = FALSE)", "cpoScale(center = FALSE)", "cpoSpatialSign()" +# Feature filters: "cpoFilterAnova()", "cpoFilterKruskal()", "cpoPca()", "cpoFilterUnivariate()" +# Classifiers: "classif.ksvm", "classif.ranger", "classif.kknn", "classif.xgboost", "classif.naiveBayes" + +# "NA" indicates that no operator would be carried out at this stage. 
+ + + +# Use defaul pipeline pool: +custom_operators = list() +# The pipeline search space could also be customized by setting, e.g., +custom_operators = list(preprocess = c("cpoScale()", "NA"), + filter = c("cpoPca(center = FALSE, rank)", "cpoFilterAnova(perc)", "NA"), + classifier = c("classif.kknn", "classif.naiveBayes")) + + + + +## Parameters for RL environment: +g_operators = g_getOperatorList(custom_operators) +g_max_depth = length(g_operators) # stages: Scaling --> Feature filtering --> Classification +g_act_cnt = max(sapply(g_operators, length)) # max number of available operators at each stage +g_state_names = g_genStateList(g_operators) +g_state_dim = length(g_state_names) + +## Parameters for BO_PROBE: +g_init_design = 4 # initial design size for MBO: g_init_design*sum(getParamLengths(par.set)) +g_mbo_iter = 2 # iterations of MBO in each episode: g_mbo_iter*sum(getParamLengths(ps)) diff --git a/lte/reinbo_table_demo.R b/lte/reinbo_table_demo.R new file mode 100644 index 0000000..bb004f8 --- /dev/null +++ b/lte/reinbo_table_demo.R @@ -0,0 +1,38 @@ +rm(list = ls()) +library(mlr) +library(mlrCPO) +library(reticulate) +library(BBmisc) +library(OpenML) +library(hash) +library(rlR) +library(mlrMBO) +library(phng) +library(R6) + +source("reinbo_table_hyperpara_space.R") +source("reinbo_table_utils.R") +source("reinbo_table_env.R") +source("reinbo_table_func.R") +source("system.R") +source("bt_conf.R") + +task = convertOMLTaskToMlr(getOMLTask(37))$mlr.task %>>% cpoDummyEncode(reference.cat = FALSE) +outer_loop = makeResampleInstance("CV", iters = 5, stratify = TRUE, task) +train_set = outer_loop$train.inds[[1]] +test_set = outer_loop$test.inds[[1]] + + +# The pipeline search space could also be customized by setting, e.g., +custom_operators = list(preprocess = c("cpoScale()", "NA"), + filter = c("cpoPca(center = FALSE, rank)", "cpoFilterAnova(perc)", "NA"), + classifier = c("classif.kknn", "classif.naiveBayes")) + + + + +best_model = reinbo(task, custom_operators = NULL, budget = 100, train_set = train_set) +# best_model = reinbo(task, custom_operators = custom_operators, budget = 100, train_set = train_set) +pred = lock_eval.reinbo.table(task, measure = list(mmce), train_set, test_set, best_model) +best_model$env$agent$q_tab +best_model$env$agent$act_names_per_state diff --git a/lte/reinbo_table_env.R b/lte/reinbo_table_env.R new file mode 100644 index 0000000..3206439 --- /dev/null +++ b/lte/reinbo_table_env.R @@ -0,0 +1,132 @@ +Q_table_Env = R6::R6Class( + "Q_table_Env", + inherit = rlR::Environment, + public = list( + step_cnt = NULL, + s_r_d_info = NULL, + task = NULL, + mbo_cache = NULL, # store pipeline, hyperparameter set and corresponding performance for MBO + model_best_perf = NULL, # best performance of sampled model until now + model_trained = NULL, # store all trained models (limited to budget) + budget = NULL, # maximun models to be evaluated + measure = NULL, + cv_instance = NULL, + ctrl = NULL, + initialize = function(task, budget, measure, cv_instance, ctrl){ + self$flag_continous = FALSE # non-continuous action + self$flag_tensor = FALSE # no use of cnn + self$ctrl = ctrl + self$act_cnt = self$ctrl$g_act_cnt # available operators/actions at each stage + self$state_dim = self$ctrl$g_state_dim + self$step_cnt = 0L + self$s_r_d_info = list( + state = "s", + reward = 0, + done = FALSE, + info = list()) + self$task = task + self$mbo_cache = hash() + self$model_trained = NULL + self$budget = budget + self$measure = measure + self$cv_instance = cv_instance + }, + + 
+    evaluateArm = function(vec_arm) {
+      return(vec_arm)
+    },
+
+    # This function will be called at each step of the learning
+    step = function(action) {
+      operators = self$ctrl$g_operators[[names(self$ctrl$g_operators)[self$step_cnt + 1]]]
+      # map the agent's action index onto the current stage's operator list, wrapping
+      # around since act_cnt is the maximum operator count over all stages:
+      mod = action %% length(operators)
+      if (mod == 0) {
+        operator = operators[length(operators)]
+      } else {
+        operator = operators[mod]
+      }
+      self$s_r_d_info[["state"]] = paste0(self$s_r_d_info[["state"]], "-[", operator, "]")
+      #print(self$s_r_d_info[["state"]])
+      self$s_r_d_info[["reward"]] = 0
+      self$step_cnt = self$step_cnt + 1L
+      if (self$step_cnt >= self$ctrl$g_max_depth) {
+        model = g_getRLPipeline(self$s_r_d_info[["state"]])
+        #print(paste(model, collapse = " --> "))
+        # stop the RL agent if there is not enough budget for this episode:
+        model_id = paste(model, collapse = "\t")
+        if (has.key(model_id, self$mbo_cache)) {
+          require_budget = self$ctrl$g_mbo_iter*sum(getParamLengths(g_getParamSetFun(model)))
+        } else {
+          require_budget = (self$ctrl$g_init_design + self$ctrl$g_mbo_iter)*sum(getParamLengths(g_getParamSetFun(model)))
+        }
+        if (self$budget < require_budget) stop("total budget too small for the ReinBo Q-table agent!")
+        if (self$budget - length(self$model_trained) < require_budget) {
+          self$agent$interact$idx_episode = self$agent$interact$maxiter
+          self$s_r_d_info[["done"]] = TRUE
+        } else {
+          # train model with hyperparameter tuning:
+          self$tuning(model)
+          self$s_r_d_info[["reward"]] = self$model_best_perf  # best performance of the model until now
+          self$s_r_d_info[["done"]] = TRUE
+          #print(paste("Best Performance:", self$model_best_perf))
+        }
+      }
+      return(self$s_r_d_info)
+    },
+
+
+    # This function will be called at the beginning of the learning and at the end of each episode
+    reset = function() {
+      self$step_cnt = 0
+      self$s_r_d_info[["state"]] = "s"
+      self$s_r_d_info[["done"]] = FALSE
+      self$s_r_d_info
+    },
+
+
+    # Hyperparameter tuning for the generated model; returns best performance as reward and updates mbo_cache
+    tuning = function(model) {
+      model_id = paste(model, collapse = "\t")  # model_id for lookup in mbo_cache
+      ps = g_getParamSetFun(model)  # generate parameter set
+
+      # check if we have already evaluated this model
+
+      # if already in mbo_cache:
+      if (has.key(model_id, self$mbo_cache)) {
+        previous_perf = max(self$mbo_cache[[model_id]][, "y"])  # best performance until now
+        epis_unimproved = self$mbo_cache[[model_id]][1, "epis_unimproved"]  # number of episodes without improvement
+        # if the performance of this model has not improved for more than 2 episodes,
+        # stop further hyperparameter tuning:
+        if (epis_unimproved > 2) {
+          self$model_best_perf = previous_perf
+        } else {
+          # else: use parameter sets and performances in memory as initial design
+          # (dropping the trailing epis_unimproved column):
+          design = self$mbo_cache[[model_id]][, -length(self$mbo_cache[[model_id]])]
+          # run several iterations of MBO:
+          run = mbo_fun(self$task, model, design, self$measure, self$cv_instance, self$ctrl)
+          # best accuracy:
+          self$model_best_perf = run$y
+          # update mbo_cache:
+          self$mbo_cache[[model_id]] = run$opt.path$env$path
+          # add result to self$model_trained:
+          new = run$opt.path$env$path$y[run$opt.path$env$dob != 0]
+          self$model_trained = c(self$model_trained, new)
+          # check if the performance of this model has been improved in this episode:
+          if (run$y <= previous_perf) {
+            self$mbo_cache[[model_id]]["epis_unimproved"] = epis_unimproved + 1
+          } else {
+            self$mbo_cache[[model_id]]["epis_unimproved"] = 0
+          }
+        }
+      } else {
+
+        # if not in mbo_cache:
+        design = generateDesign(n = self$ctrl$g_init_design*sum(getParamLengths(ps)), par.set = ps)
+        run = mbo_fun(self$task, model, design, self$measure, self$cv_instance, self$ctrl)  # potential warning: generateDesign could only produce 3 points instead of 1000, see mlrMBO issue 442
+        self$model_best_perf = run$y
+        self$mbo_cache[[model_id]] = run$opt.path$env$path
+        self$mbo_cache[[model_id]]["epis_unimproved"] = 0
+        new = run$opt.path$env$path$y
+        self$model_trained = c(self$model_trained, new)
+      }
+    }
+  )
+)
diff --git a/lte/reinbo_table_func.R b/lte/reinbo_table_func.R
new file mode 100644
index 0000000..eedfd4a
--- /dev/null
+++ b/lte/reinbo_table_func.R
@@ -0,0 +1,109 @@
+# ML_ReinBo algorithm:
+opt.reinbo.table = function(task, budget, measure, init_val, train_set = NULL, conf, ctrl) {
+  subTask = task
+  if (!is.null(train_set)) subTask = subsetTask(task, train_set)
+  inner_loop = makeResampleInstance("CV", iters = getGconf()$NCVInnerIter, stratify = TRUE, subTask)
+  env = runQTable(subTask, budget, measure, inner_loop, init_val, conf, ctrl)
+  mmodel = getBestModel(env$mbo_cache)
+  return(list(mmodel = mmodel, env = env))
+}
+
+# Predict function: evaluate best model on test dataset
+lock_eval.reinbo.table = function(task, measure, train_set, test_set, best_model) {
+  best_model = best_model$mmodel
+  lrn = genLearnerForBestModel(task, best_model, measure)
+  mod = train(lrn, task, subset = train_set)
+  pred = predict(mod, task, subset = test_set)
+  perf = performance(pred, measures = measure)
+  return(perf)
+}
+
+
+# Reinforcement learning part:
+#' @param ctrl pipeline configuration
+runQTable <- function(task, budget, measure, instance, init_val, conf, ctrl) {
+  env = Q_table_Env$new(task, budget, measure, instance, ctrl)
+  agent = initAgent(name = "AgentTable", env = env, conf = conf, q_init = init_val,
+                    state_names = ctrl$g_state_names,
+                    act_names_per_state = get_act_names_perf_state(ctrl$g_operators),
+                    vis_after_episode = TRUE)
+  agent$learn(getGconf()$RLMaxEpisode)
+  return(env)
+}
+
+# MBO function: hyperparameter tuning
+#' @param model character vector
+mbo_fun = function(task, model, design, measure, cv_instance, ctrl) {
+  ps = g_getParamSetFun(model)  # get parameter set from string representation of a model
+  object = makeSingleObjectiveFunction(
+    fn = function(x) {
+      # negate mmce so that MBO maximizes; the tiny jitter presumably keeps the
+      # surrogate from degenerating when many configurations yield identical values
+      -reinbo_mlr_fun(task, model, x, measure, cv_instance) + runif(1)/100000
+    },
+    par.set = ps,
+    has.simple.signature = FALSE,
+    minimize = FALSE
+  )
+  ctrlmbo = setMBOControlTermination(makeMBOControl(), iters = ctrl$g_mbo_iter * sum(getParamLengths(ps)))  # g_mbo_iter iterations per parameter dimension
+  run = mbo(object, design = design, control = ctrlmbo, show.info = FALSE)
+  ## Known (harmless) messages from the infill optimization:
+  ## "Stopped because hard maximum generation limit was hit." comes from rgenoud, which
+  ## combines evolutionary search with derivative-based (Newton or quasi-Newton) methods.
+  ## Occasionally also: "Warning in generateDesign(control$infill.opt.focussearch.points,
+  ## ps.local,): generateDesign could only produce 20 points instead of 1000!"
+  ## See https://github.com/mlr-org/mlrMBO/issues/442; a fix is being worked on in
+  ## https://github.com/mlr-org/mlrMBO/pull/444.
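+  ## Budget accounting sketch: with d = sum(getParamLengths(ps)), the first visit to a
+  ## pipeline costs g_init_design*d design evaluations plus g_mbo_iter*d MBO iterations;
+  ## each revisit costs only g_mbo_iter*d, because the cached opt.path is reused as design.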
+  return(run)
+}
+
+
+# Mlr function: calculate performance of generated model given a specific param_set
+reinbo_mlr_fun = function(task, model, param_set, measure, cv_instance) {
+  lrn = genLearner.reinbo(task, model, param_set, measure)
+  perf = resample(lrn, task, resampling = cv_instance, measures = measure, show.info = FALSE)$aggr
+  return(perf)
+}
+
+
+
+# To get best model from mbo_cache of environment:
+getBestModel = function(cache) {
+  models = keys(cache)
+  results = data.frame(model = 0, y = 0)
+  for (i in 1:length(models)) {
+    results[i, 1] = models[i]
+    results[i, 2] = max(cache[[models[i]]][, "y"])
+  }
+  key = results[results$y == max(results$y), "model"][1]
+  ps = cache[[key]]
+  ps = ps[(ps$y == max(ps$y)), (colnames(ps) != "epis_unimproved")][1, ]
+  return(data.frame(Model = key, ps))
+}
+
+genLearnerForBestModel = function(task, best_model, measure) {
+  model = strsplit(as.character(best_model$Model), "\t")[[1]]
+  param_set = as.list(best_model)
+  param_set$Model = NULL
+  param_set$y = NULL
+  if (!is.null(param_set$C)) { param_set$C = 2^param_set$C }
+  if (!is.null(param_set$sigma)) { param_set$sigma = 2^param_set$sigma }
+  lrn = genLearner.reinbo(task, model, param_set, measure)
+  return(lrn)
+}
+
+
+genLearner.reinbo = function(task, model, param_set, measure) {
+  p = getTaskNFeats(task)
+  lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('%s', par.vals = ps.learner)",
+                model[1], model[2], model[3])
+  lrn = gsub(pattern = "perc", x = lrn, replacement = "perc = param_set$perc", fixed = TRUE)
+  lrn = gsub(pattern = "rank", x = lrn, replacement = "rank = as.integer(max(1, round(p*param_set$rank)))", fixed = TRUE)
+  lrn = gsub(pattern = "NA %>>%", x = lrn, replacement = "", fixed = TRUE)
+  ps.learner = param_set
+  ps.learner$perc = NULL
+  ps.learner$rank = NULL
+  if (model[3] == "classif.ranger") {
+    # mtry is tuned as a fraction; rescale by the number of features left after filtering
+    p1 = p
+    if (!is.null(param_set$perc)) {p1 = max(1, round(p*param_set$perc))}
+    if (!is.null(param_set$rank)) {p1 = max(1, round(p*param_set$rank))}
+    ps.learner$mtry = max(1, as.integer(p1*param_set$mtry))
+  }
+  lrn = eval(parse(text = lrn))
+  return(lrn)
+}
diff --git a/lte/reinbo_table_hyperpara_space.R b/lte/reinbo_table_hyperpara_space.R
new file mode 100644
index 0000000..404f31d
--- /dev/null
+++ b/lte/reinbo_table_hyperpara_space.R
@@ -0,0 +1,42 @@
+##### Parameter set of operators for hyperparameter tuning:
+ps.ksvm = makeParamSet(
+  makeNumericParam("C", lower = -15, upper = 15, trafo = function(x) 2^x),
+  makeNumericParam("sigma", lower = -15, upper = 15, trafo = function(x) 2^x))
+
+ps.ranger = makeParamSet(
+  makeNumericParam("mtry", lower = 1/10, upper = 1/1.5),  ## range(p/10, p/1.5), p is the number of features
+  makeNumericParam("sample.fraction", lower = .1, upper = 1))
+
+ps.xgboost = makeParamSet(
+  makeNumericParam("eta", lower = .001, upper = .3),
+  makeIntegerParam("max_depth", lower = 1L, upper = 15L),
+  makeNumericParam("subsample", lower = 0.5, upper = 1),
+  makeNumericParam("colsample_bytree", lower = 0.5, upper = 1),
+  makeNumericParam("min_child_weight", lower = 0, upper = 50)
+)
+
+ps.kknn = makeParamSet(makeIntegerParam("k", lower = 1L, upper = 20L))
+
+ps.naiveBayes = makeParamSet(makeNumericParam("laplace", lower = 0.01, upper = 100))
+
+ps.filter = makeParamSet(makeNumericParam("perc", lower = .1, upper = 1))
+
+ps.pca = makeParamSet(makeNumericParam("rank", lower = .1, upper = 1))  ## range(p/10, p), p is the number of features
+
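+# A minimal sketch of how these sets behave (assuming ParamHelpers is attached via mlr):
+# ps.ksvm stores C and sigma on a log2 scale and only applies the 2^x trafo on sampling.
+if (FALSE) {
+  x = sampleValue(ps.ksvm, trafo = TRUE)
+  # e.g. list(C = 1724.8, sigma = 0.00024)
+}
+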
+##### Get parameter set for generated model:
+g_getParamSetFun = function(model) {
+  ps.classif = sub(pattern = "classif", model[3], replacement = "ps")  # e.g. "classif.ksvm" --> "ps.ksvm"
+  ps.classif = eval(parse(text = ps.classif))  # hyperparameter set for the classifier
+  if (model[2] == "NA") {
+    return(ps.classif)
+  } else if (length(grep(pattern = "perc", x = model)) > 0) {
+    return(c(ps.classif, ps.filter))
+  } else {
+    return(c(ps.classif, ps.pca))
+  }
+}
diff --git a/lte/reinbo_table_reinbo.R b/lte/reinbo_table_reinbo.R
new file mode 100644
index 0000000..66adc9d
--- /dev/null
+++ b/lte/reinbo_table_reinbo.R
@@ -0,0 +1,19 @@
+#### function reinbo
+#' @param task an mlr task
+reinbo = function(task, custom_operators, budget, train_set, conf = NULL) {
+  ## Parameters for RL environment:
+  ctrl = list()
+  ctrl$g_operators = g_getOperatorList(custom_operators)
+  ctrl$g_max_depth = length(ctrl$g_operators)  # stages: Scaling --> Feature filtering --> Classification
+  ctrl$g_act_cnt = max(sapply(ctrl$g_operators, length))  # max number of available operators at each stage
+  ctrl$g_state_names = g_genStateList(ctrl$g_operators)
+  ctrl$g_state_dim = length(ctrl$g_state_names)
+  ## Parameters for BO_PROBE:
+  ctrl$g_init_design = 4  # initial design size for MBO: g_init_design*sum(getParamLengths(par.set))
+  ctrl$g_mbo_iter = 2  # iterations of MBO in each episode: g_mbo_iter*sum(getParamLengths(ps))
+
+  if (is.null(conf)) {
+    conf = rlR::getDefaultConf("AgentTable")
+    conf$set(policy.maxEpsilon = 1, policy.minEpsilon = 0.01, policy.aneal.steps = 60)
+  }
+  best_model = opt.reinbo.table(task, budget = budget, measure = list(mmce), train_set = train_set, init_val = -1, conf = conf, ctrl = ctrl)
+  best_model
+}
diff --git a/lte/reinbo_table_test.R b/lte/reinbo_table_test.R
new file mode 100644
index 0000000..fbe3b0c
--- /dev/null
+++ b/lte/reinbo_table_test.R
@@ -0,0 +1,31 @@
+rm(list = ls())
+library(mlr)
+library(mlrCPO)
+library(reticulate)
+library(BBmisc)
+library(OpenML)
+library(hash)
+library(rlR)
+library(mlrMBO)
+library(phng)
+library(R6)
+
+source("reinbo_table_hyperpara_space.R")
+source("reinbo_table_utils.R")
+source("reinbo_table_env.R")
+source("reinbo_table_func.R")
+source("system.R")
+source("bt_conf.R")
+
+task = convertOMLTaskToMlr(getOMLTask(37))$mlr.task %>>% cpoDummyEncode(reference.cat = FALSE)
+outer_loop = makeResampleInstance("CV", iters = 5, stratify = TRUE, task)
+
+train_set = outer_loop$train.inds[[1]]
+test_set = outer_loop$test.inds[[1]]
+
+conf = rlR::getDefaultConf("AgentTable")
+conf$set(policy.maxEpsilon = 1, policy.minEpsilon = 0.01, policy.aneal.steps = 60)
+best_model = reinbo(task, custom_operators = NULL, budget = 100L, train_set = train_set, conf = conf)
+pred = lock_eval.reinbo.table(task, measure = list(mmce), train_set, test_set, best_model)
+best_model$env$agent$q_tab
+best_model$env$agent$act_names_per_state
diff --git a/lte/reinbo_table_utils.R b/lte/reinbo_table_utils.R
new file mode 100644
index 0000000..af90408
--- /dev/null
+++ b/lte/reinbo_table_utils.R
@@ -0,0 +1,57 @@
+source("reinbo_table_reinbo.R")
+# Get list of operators:
+#' @example g_getOperatorList(NULL)
+g_getOperatorList = function(custom_operators) {
+  default_operators = list(
+    preprocess = c("cpoScale()", "cpoScale(scale = FALSE)", "cpoScale(center = FALSE)", "cpoSpatialSign()", "NA"),
+    filter = c("cpoFilterAnova(perc)", "cpoFilterKruskal(perc)", "cpoPca(center = FALSE, rank)", "cpoFilterUnivariate(perc)", "NA"),
+    classifier = c("classif.ksvm", "classif.ranger", "classif.kknn", "classif.xgboost", "classif.naiveBayes"))
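+  # custom entries override the matching default stage, e.g.
+  # g_getOperatorList(list(classifier = "classif.kknn")) keeps the default
+  # pre-processors and filters but searches only over kknn: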
+  for (stage in names(default_operators)) {
+    if (!is.null(custom_operators[[stage]])) {
+      default_operators[stage] = custom_operators[stage]
+    }
+  }
+  return(default_operators)
+}
+
+# Generate list of all potential states in Q table:
+g_genStateList = function(operators) {
+  state_list = c("s")
+  last_stage = state_list
+  for (stage in c("preprocess", "filter")) {
+    current_stage = c()
+    for (i in last_stage) {
+      for (j in operators[[stage]]) {
+        current_stage = c(current_stage, paste0(i, "-[", j, "]"))
+      }
+    }
+    state_list = c(state_list, current_stage)
+    last_stage = current_stage
+  }
+  return(state_list)
+}
+
+
+# Get list of all potential actions at each state:
+get_act_names_perf_state = function(g_operators) {
+  act_list = list("s" = g_operators$preprocess)
+  step1_states = sprintf("s-[%s]", g_operators$preprocess)
+  for (i in step1_states) {
+    act_list[[i]] = g_operators$filter
+    for (j in sprintf("%s-[%s]", i, g_operators$filter)) {
+      act_list[[j]] = g_operators$classifier
+    }
+  }
+  return(act_list)
+}
+
+# Get model at end of each episode:
+g_getRLPipeline = function(last_state) {
+  # the state string looks like "s-[op1]-[op2]-[op3]"; strip the leading "s" and the brackets
+  model = unlist(lapply(strsplit(last_state, "-")[[1]][-1],
+                        function(x) {
+                          x = gsub("[", x, replacement = "", fixed = TRUE)
+                          gsub("]", x, replacement = "", fixed = TRUE)}))
+  return(model)
+}
diff --git a/lte/smac_obj.R b/lte/smac_obj.R
new file mode 100644
index 0000000..b62ff2f
--- /dev/null
+++ b/lte/smac_obj.R
@@ -0,0 +1,17 @@
+# Objective to optimize:
+toy_smac_obj = function(cfg) {
+  print(cfg)
+  runif(1)
+}
+smac_objective = function(cfg) {
+  # some variables are defined in the scope where this function is called
+  model_index <<- model_index + 1
+  model_list[[model_index]] <<- cfg
+  lrn = gen_mlrCPOPipe_from_smac_cfg(cfg)
+  perf = resample(lrn, subTask, resampling = inner_loop, measures = measure, show.info = FALSE)$aggr
+  perf_list <<- c(perf_list, as.numeric(perf))
+  return(perf)
+}
diff --git a/lte/smac_obj.py b/lte/smac_obj.py
new file mode 100644
index 0000000..d6ec23a
--- /dev/null
+++ b/lte/smac_obj.py
@@ -0,0 +1,30 @@
+import rpy2
+import rpy2.robjects as robjects
+import rpy2.robjects.numpy2ri
+rpy2.robjects.numpy2ri.activate()
+robjects.conversion.py2ri = rpy2.robjects.numpy2ri
+from rpy2.robjects.packages import STAP
+# if rpy2 < 2.6.1 do:
+# from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
+# STAP = SignatureTranslatedAnonymousPackage
+with open('smac_obj.R', 'r') as f:
+    string = f.read()
+myfunc = STAP(string, "toy_smac_obj")
+def smac_obj_from_cfg(cfg):
+    """ Evaluates a pipeline configuration by forwarding it to the R objective
+    defined in smac_obj.R via rpy2.
+
+    Parameters:
+    -----------
+    cfg: Configuration (ConfigSpace.ConfigurationSpace.Configuration)
+        Configuration containing the parameters.
+        Configurations are indexable!
+
+    Returns:
+    --------
+    The objective value returned by the R function.
+    """
+    # For deactivated parameters, the configuration stores None-values.
+    # These are not accepted on the R side, so we remove them.
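+    # A Configuration behaves like a mapping; deactivated children (e.g. 'lrn_svm_C'
+    # when Model != 'ksvm') show up with value None and are dropped by the filter below.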
+    cfg = {k: cfg[k] for k in cfg if cfg[k]}
+    return myfunc.toy_smac_obj(cfg)
diff --git a/lte/system.R b/lte/system.R
new file mode 100644
index 0000000..e28875f
--- /dev/null
+++ b/lte/system.R
@@ -0,0 +1,12 @@
+sys = Sys.info()
+flag_local = as.list(sys)$user == "JialiLin"
+mconf.file = NULL
+
+if (flag_local) {
+  reticulate::use_python("/usr/local/bin/python3")
+  mconf.file = NA
+} else {
+  reticulate::use_condaenv("w_env")
+  mconf.file = "lrz.batchtools.conf.R"
+}
diff --git a/lte/tpe_func.R b/lte/tpe_func.R
new file mode 100644
index 0000000..75e8fb9
--- /dev/null
+++ b/lte/tpe_func.R
@@ -0,0 +1,62 @@
+# TPE algorithm:
+opt.tpe = function(task, budget, measure, train_set = NULL) {
+  # promoted to globals because objective() is called by hyperopt without access to this scope
+  subTask <<- task
+  if (!is.null(train_set)) subTask <<- subsetTask(task, train_set)
+  inner_loop <<- makeResampleInstance("CV", iters = getGconf()$NCVInnerIter, stratify = TRUE, subTask)
+  source_python("tpe_space.py")
+  hp = import("hyperopt")
+  model_index <<- 0
+  model_list <<- list()
+  perf_list <<- NULL
+  measure <<- measure
+  best = hp$fmin(objective, space = space, algo = hp$tpe$suggest, max_evals = budget)
+  best_model_index = which(perf_list == min(perf_list))[1]
+  mmodel = model_list[[best_model_index]]
+  return(mmodel)
+}
+
+
+# Predict function: evaluate best model on test dataset
+lock_eval.tpe = function(task, measure, train_set, test_set, best_model) {
+  lrn = genLearner.tpe(best_model)
+  mod = train(lrn, task, subset = train_set)
+  pred = predict(mod, task, subset = test_set)
+  mpred = performance(pred, measures = measure)
+  return(mpred)
+}
+
+
+# Objective to optimize:
+objective = function(args) {
+  model_index <<- model_index + 1
+  model_list[[model_index]] <<- args
+  lrn = genLearner.tpe(args)
+  perf = resample(lrn, subTask, resampling = inner_loop, measures = measure, show.info = FALSE)$aggr
+  perf_list <<- c(perf_list, as.numeric(perf))
+  return(perf)
+}
+# one sample of args: args = hp$pyll$stochastic$sample(py$space)
+
+
+# Generate mlr learner for configuration:
+genLearner.tpe = function(args) {
+  model = args$Classifier$model
+  args$Classifier$model = NULL
+  ps.learner = args$Classifier  # list in R, which can be evaluated by eval
+  filter = args$FeatureFilter$filter
+  lrn = sprintf("%s %%>>%% %s %%>>%% makeLearner('classif.%s', par.vals = ps.learner)",
+                args$Preprocess, filter, model)
+  lrn = gsub(pattern = "NA %>>%", x = lrn, replacement = "", fixed = TRUE)
+  lrn = gsub(pattern = "perc", x = lrn, replacement = "perc = args$FeatureFilter$perc", fixed = TRUE)
+  p = getTaskNFeats(subTask)
+  lrn = gsub(pattern = "rank", x = lrn, replacement = "rank = as.integer(max(1, round(p*args$FeatureFilter$rank)))", fixed = TRUE)
+  if (model == "ranger") {
+    p1 = p
+    if (!is.null(args$FeatureFilter$perc)) {p1 = max(1, round(p*args$FeatureFilter$perc))}
+    if (!is.null(args$FeatureFilter$rank)) {p1 = max(1, round(p*args$FeatureFilter$rank))}
+    ps.learner$mtry = max(1, as.integer(p1*args$FeatureFilter$mtry))
+  }
+  lrn = eval(parse(text = lrn))
+  return(lrn)
+}
diff --git a/lte/tpe_space.py b/lte/tpe_space.py
new file mode 100644
index 0000000..2a93357
--- /dev/null
+++ b/lte/tpe_space.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on 15.2.2019
+
+@author: Jiali Lin
+"""
+
+from hyperopt import hp
+import hyperopt.pyll.stochastic
+
+# Define the search space
+space = {
+    # Step 1:
+    'Preprocess': hp.choice('pre',
+                            ['cpoScale()',
+                             'cpoScale(scale = FALSE)',
+                             'cpoScale(center = FALSE)',
+                             'cpoSpatialSign()',
+                             'NA']),
+
+    # Step 2:
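+    # Each hp.choice branch below bundles an operator with its own hyperparameters,
+    # mirroring the conditional (parent/child) structure of the SMAC space: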
+    'FeatureFilter': hp.choice('feature', [
+        {'filter': 'cpoFilterAnova(perc)',
+         'perc': hp.uniform('ano_per', 0.1, 1)},
+
+        {'filter': 'cpoFilterKruskal(perc)',
+         'perc': hp.uniform('kru_per', 0.1, 1)},
+
+        {'filter': 'cpoFilterUnivariate(perc)',
+         'perc': hp.uniform('uni_per', 0.1, 1)},
+
+        {'filter': 'cpoPca(center = FALSE, rank)',
+         'rank': hp.uniform('pca_rank', 0.1, 1)},
+
+        {'filter': 'NA'}]),
+
+    # Step 3:
+    'Classifier': hp.choice('classify_model', [
+        {'model': 'kknn',
+         'k': 1 + hp.randint('kknn_k', 19)},
+
+        {'model': 'ksvm',
+         'C': hp.uniform('ksvm_C', 2**(-15), 2**(15)),
+         'sigma': hp.uniform('ksvm_sigma', 2**(-15), 2**(15))},
+
+        {'model': 'ranger',
+         'mtry': hp.uniform('ranger_mtry', 0.1, 0.66666),
+         'sample.fraction': hp.uniform('ranger_fra', 0.1, 1)},
+
+        {'model': 'xgboost',
+         'eta': hp.uniform('xgboost_eta', 0.001, 0.3),
+         'max_depth': 1 + hp.randint('xgboost_depth', 14),
+         'subsample': hp.uniform('xgboost_sub', 0.5, 1),
+         'colsample_bytree': hp.uniform('xgboost_col', 0.5, 1),
+         'min_child_weight': hp.uniform('xgboost_min', 0, 50)},
+
+        {'model': 'naiveBayes',
+         'laplace': hp.uniform('bay_laplace', 0.01, 100)}
+
+    ])}
+
+
+# Sample one configuration:
+# print(hyperopt.pyll.stochastic.sample(space))
+# e.g. {'Classifier': {'model': 'ranger', 'mtry': 0.5745, 'sample.fraction': 0.8657},
+#       'FeatureFilter': {'filter': 'cpoFilterAnova(perc)', 'perc': 0.3727}, 'Preprocess': 'NA'}
diff --git a/lte/tpot_func.R b/lte/tpot_func.R
new file mode 100644
index 0000000..1b63286
--- /dev/null
+++ b/lte/tpot_func.R
@@ -0,0 +1,17 @@
+source("system.R")
+tpot = import("tpot")
+opt.tpot = function(task, budget, measure, train_set) {
+  conf_tpot = getGconf()$conf_tpot
+  pipeline_optimizer = tpot$TPOTClassifier(generations = conf_tpot$generations, population_size = conf_tpot$population_size,
+                                           offspring_size = conf_tpot$offspring_size, cv = getGconf()$NCVInnerIter,
+                                           config_dict = conf_tpot$config_dict)
+  train_data = getTaskData(task, train_set, target.extra = TRUE)
+  pipeline_optimizer$fit(train_data$data, as.numeric(train_data$target))
+  return(pipeline_optimizer)
+}
+
+lock_eval.tpot = function(task, measure = NULL, train_set = NULL, test_set, best_model) {
+  test_data = getTaskData(task, test_set, target.extra = TRUE)
+  mpred = 1 - best_model$score(test_data$data, as.numeric(test_data$target))  # TPOT reports accuracy; convert to mmce
+  return(mpred)
+}
\ No newline at end of file
diff --git a/lte/tpot_test.R b/lte/tpot_test.R
new file mode 100644
index 0000000..a542881
--- /dev/null
+++ b/lte/tpot_test.R
@@ -0,0 +1,27 @@
+rm(list = ls())
+library(mlr)
+library(mlrCPO)
+library(reticulate)
+library(BBmisc)
+library(OpenML)
+source("system.R")
+
+set.seed(1)
+task = convertOMLTaskToMlr(getOMLTask(37))$mlr.task %>>% cpoDummyEncode(reference.cat = FALSE)
+outer_loop_CV5 = makeResampleInstance("CV", iters = 5, task = task)
+train_set = outer_loop_CV5$train.inds[[1]]
+test_set = outer_loop_CV5$test.inds[[1]]
+train_data = getTaskData(task, train_set, target.extra = TRUE)
+test_data = getTaskData(task, test_set, target.extra = TRUE)
+
+tpot = import("tpot")
+pipeline_optimizer = tpot$TPOTClassifier(generations = 2L, population_size = 3L,
+                                         offspring_size = 3L, cv = 5L,
+                                         config_dict = 'TPOT light')
+
+pipeline_optimizer$fit(train_data$data, as.numeric(train_data$target))
+pred = 1 - pipeline_optimizer$score(test_data$data, as.numeric(test_data$target))
diff --git a/lte/utility.R b/lte/utility.R
new file mode 100644
index 0000000..f4f09d9
--- /dev/null
+++ b/lte/utility.R
@@ -0,0 +1,14 @@
+resample_opt_lock = function(mlr_task_full, outer_loop_rins, func_opt, func_eval, args_opt = list(), args_eval = list()) {
+  outer_iters = getGconf()$NCVOuterIter
+  measure = getGconf()$measures
+  list_lock = foreach(outer_iter = 1:outer_iters) %do% {
+    opt_set = outer_loop_rins$train.inds[[outer_iter]]
+    lock_set = outer_loop_rins$test.inds[[outer_iter]]
+    mmodel = do.call(func_opt, args = c(list(task = mlr_task_full, train_set = opt_set, measure = measure), args_opt))
+    mpred = do.call(func_eval, args = c(list(task = mlr_task_full, train_set = opt_set, test_set = lock_set, measure = measure, best_model = mmodel), args_eval))
+    return(list(mmodel = mmodel, mpred = mpred))
+  }
+  list_mmodel = rlist::list.map(list_lock, mmodel)
+  vec_mpred = unlist(rlist::list.map(list_lock, mpred))
+  return(list(list_mmodel = list_mmodel, vec_mpred = vec_mpred))
+}
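+
+# Usage sketch (hypothetical wiring, assuming the random-search functions are sourced
+# and foreach is attached):
+# res = resample_opt_lock(mlr_task_full, outer_loop_rins,
+#                         func_opt = opt.random.search, func_eval = lock_eval.random.search,
+#                         args_opt = list(budget = getGconf()$budget))
+# mean(res$vec_mpred)  # nested-CV estimate of mmce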