diff --git a/examples/EDMF_data/emulator-rank-test.jl b/examples/EDMF_data/emulator-rank-test.jl index 16728b35..f26a07ef 100644 --- a/examples/EDMF_data/emulator-rank-test.jl +++ b/examples/EDMF_data/emulator-rank-test.jl @@ -51,7 +51,7 @@ function main() n_iteration = 10 - + # Output figure save directory figure_save_directory = joinpath(@__DIR__, "output", exp_name, string(Dates.today())) data_save_directory = joinpath(@__DIR__, "output", exp_name, string(Dates.today())) @@ -136,43 +136,43 @@ function main() truth_cov = truth_cov[good_datadim, good_datadim] # split out a training set: - n_test = Int(floor(0.2*length(good_particles))) - n_train = Int(ceil(0.8*length(good_particles))) - + n_test = Int(floor(0.2 * length(good_particles))) + n_train = Int(ceil(0.8 * length(good_particles))) + rng_seed = 14225 rng = Random.MersenneTwister(rng_seed) # random train-set # train_idx = shuffle(rng, collect(1:n_test+n_train))[1:n_train] # final train earlier iteration, test final iteration - train_idx = collect(1:n_test+n_train)[1:n_train] - test_idx = setdiff(collect(1:n_test+n_train),train_idx) - - train_inputs = inputs[:,train_idx] - train_outputs = outputs[:,train_idx] - - test_inputs = inputs[:,test_idx] - test_outputs = outputs[:,test_idx] - + train_idx = collect(1:(n_test + n_train))[1:n_train] + test_idx = setdiff(collect(1:(n_test + n_train)), train_idx) + + train_inputs = inputs[:, train_idx] + train_outputs = outputs[:, train_idx] + + test_inputs = inputs[:, test_idx] + test_outputs = outputs[:, test_idx] + train_pairs = DataContainers.PairedDataContainer(train_inputs, train_outputs) end - + @info "Completed data loading stage" println(" ") ############################################## # [3. ] Build Emulator from calibration data # ############################################## - opt_diagnostics = zeros(length(rank_test),n_repeats,n_iteration) - train_err = zeros(length(rank_test),n_repeats) - test_err = zeros(length(rank_test),n_repeats) - ttt = zeros(length(rank_test),n_repeats) + opt_diagnostics = zeros(length(rank_test), n_repeats, n_iteration) + train_err = zeros(length(rank_test), n_repeats) + test_err = zeros(length(rank_test), n_repeats) + ttt = zeros(length(rank_test), n_repeats) @info "Begin Emulation stage" # Create GP object train_frac = 0.9 n_cross_val_sets = 2 - max_feature_size=size(get_outputs(train_pairs), 2) * size(get_outputs(train_pairs),1) * (1-train_frac) + max_feature_size = size(get_outputs(train_pairs), 2) * size(get_outputs(train_pairs), 1) * (1 - train_frac) for (rank_id, rank_val) in enumerate(rank_test) #test over different ranks for svd-nonsep rng_seed = 99330 rng = Random.MersenneTwister(rng_seed) @@ -186,10 +186,10 @@ function main() "scheduler" => DataMisfitController(terminate_at = 1000), "cov_sample_multiplier" => 0.2, "n_iteration" => n_iteration, - "n_features_opt" => Int(floor((max_feature_size/5))),# here: /5 with rank <= 3 works + "n_features_opt" => Int(floor((max_feature_size / 5))),# here: /5 with rank <= 3 works "localization" => SEC(0.05), "n_ensemble" => 400, - "n_cross_val_sets" => n_cross_val_sets, + "n_cross_val_sets" => n_cross_val_sets, ) if case == "RF-prior" overrides = Dict("verbose" => true, "cov_sample_multiplier" => 0.01, "n_iteration" => 0) @@ -200,7 +200,7 @@ function main() decorrelate = true if case == "GP" - rank_val=0 + rank_val = 0 gppackage = Emulators.SKLJL() pred_type = Emulators.YType() mlt = GaussianProcess( @@ -210,9 +210,9 @@ function main() noise_learn = false, ) elseif case ∈ ["RF-vector-svd-sep"] - 
kernel_structure = SeparableKernel(LowRankFactor(5, nugget),LowRankFactor(rank_val,nugget)) + kernel_structure = SeparableKernel(LowRankFactor(5, nugget), LowRankFactor(rank_val, nugget)) n_features = 500 - + mlt = VectorRandomFeatureInterface( n_features, input_dim, @@ -221,11 +221,11 @@ function main() kernel_structure = kernel_structure, optimizer_options = overrides, ) - + elseif case ∈ ["RF-vector-svd-nonsep", "RF-prior"] kernel_structure = NonseparableKernel(LowRankFactor(rank_val, nugget)) n_features = 500 - + mlt = VectorRandomFeatureInterface( n_features, input_dim, @@ -237,7 +237,7 @@ function main() elseif case ∈ ["RF-vector-nosvd-nonsep"] kernel_structure = NonseparableKernel(LowRankFactor(rank_val, nugget)) n_features = 500 - + mlt = VectorRandomFeatureInterface( n_features, input_dim, @@ -248,11 +248,11 @@ function main() ) decorrelate = false end - + # Fit an emulator to the data normalized = true - - ttt[rank_id,rep_idx] = @elapsed begin + + ttt[rank_id, rep_idx] = @elapsed begin emulator = Emulator( mlt, train_pairs; @@ -260,7 +260,7 @@ function main() normalize_inputs = normalized, decorrelate = decorrelate, ) - + # Optimize the GP hyperparameters for better fit optimize_hyperparameters!(emulator) end @@ -271,7 +271,7 @@ function main() train_mean, _ = Emulators.predict(emulator, train_inputs[:, i:i], transform_to_real = true) # 3x1 train_err_tmp[1] += norm(train_mean - train_outputs[:, i]) end - train_err[rank_id,rep_idx] = 1 / size(train_inputs, 2) * train_err_tmp[1] + train_err[rank_id, rep_idx] = 1 / size(train_inputs, 2) * train_err_tmp[1] # error of emulator on test points: test_err_tmp = [0.0] @@ -279,32 +279,36 @@ function main() test_mean, _ = Emulators.predict(emulator, test_inputs[:, i:i], transform_to_real = true) # 3x1 test_err_tmp[1] += norm(test_mean - test_outputs[:, i]) end - test_err[rank_id,rep_idx] = 1 / size(test_inputs, 2) * test_err_tmp[1] + test_err[rank_id, rep_idx] = 1 / size(test_inputs, 2) * test_err_tmp[1] @info "train error ($(size(train_inputs,2)) pts): $(train_err[rank_id,rep_idx])" @info "test error ($(size(test_inputs, 2)) pts): $(test_err[rank_id,rep_idx])" if !(case ∈ ["GP", "RF-prior"]) - opt_diagnostics[rank_id,rep_idx,:] = get_optimizer(mlt)[1] #length-1 vec of vec -> vec + opt_diagnostics[rank_id, rep_idx, :] = get_optimizer(mlt)[1] #length-1 vec of vec -> vec eki_conv_filepath = joinpath(data_save_directory, "$(case)_$(rank_val)_eki-conv.jld2") save(eki_conv_filepath, "opt_diagnostics", opt_diagnostics) end -# emulator_filepath = joinpath(data_save_directory, "$(case)_$(rank_val)_emulator.jld2") -# save(emulator_filepath, "emulator", emulator) + # emulator_filepath = joinpath(data_save_directory, "$(case)_$(rank_val)_emulator.jld2") + # save(emulator_filepath, "emulator", emulator) JLD2.save( - joinpath(data_save_directory, case * "_edmf_rank_test_results.jld2"), - "rank_test", collect(rank_test), - "timings", ttt, - "train_err", train_err, - "test_err", test_err, - ) + joinpath(data_save_directory, case * "_edmf_rank_test_results.jld2"), + "rank_test", + collect(rank_test), + "timings", + ttt, + "train_err", + train_err, + "test_err", + test_err, + ) end end - - - + + + @info "Finished Emulation stage" end diff --git a/examples/EDMF_data/plot_posterior.jl b/examples/EDMF_data/plot_posterior.jl index ac44b902..0133c00c 100644 --- a/examples/EDMF_data/plot_posterior.jl +++ b/examples/EDMF_data/plot_posterior.jl @@ -7,176 +7,177 @@ using Dates using CalibrateEmulateSample.ParameterDistributions function main() -##### -# Creates 1 
plots: One for a specific case, One with 2 cases, and One with all cases (final case being the prior).
-
-
-# date = Date(year,month,day)
-
-# 2-parameter calibration exp
-#exp_name = "ent-det-calibration"
-#date_of_run = Date(2023, 10, 5)
-
-# 5-parameter calibration exp
-exp_name = "ent-det-tked-tkee-stab-calibration"
-date_of_run = Date(2024, 11, 03)
-@info "plotting results found in $(date_of_run)"
-# Output figure read/write directory
-figure_save_directory = joinpath(@__DIR__, "output", exp_name, string(date_of_run))
-data_save_directory = joinpath(@__DIR__, "output", exp_name, string(date_of_run))
-
-#case:
-cases = [
-    "GP", # diagonalize, train scalar GP, assume diag inputs
-    "RF-prior",
-    "RF-vector-svd-nonsep",
-    "RF-vector-svd-sep",
-    "RF-vector-nosvd-nonsep",
-]
-case_rf = cases[3]
-kernel_rank = 3
-prior_kernel_rank = 3
-# load
-posterior_filepath = joinpath(data_save_directory, "$(case_rf)_$(kernel_rank)_posterior.jld2")
-if !isfile(posterior_filepath)
-    throw(ArgumentError(posterior_filepath * " not found. Please check experiment name and date"))
-else
-    @info "Loading posterior distribution from: " * posterior_filepath
-    posterior = load(posterior_filepath)["posterior"]
-end
-# get samples explicitly (may be easier to work with)
-posterior_samples = vcat([get_distribution(posterior)[name] for name in get_name(posterior)]...) #samples are columns
-transformed_posterior_samples =
-    mapslices(x -> transform_unconstrained_to_constrained(posterior, x), posterior_samples, dims = 1)
-
-# histograms
-nparam_plots = sum(get_dimensions(posterior)) - 1
-density_filepath = joinpath(figure_save_directory, "$(case_rf)_$(kernel_rank)_posterior_dist_comp.png")
-transformed_density_filepath = joinpath(figure_save_directory, "$(case_rf)_$(kernel_rank)_posterior_dist_phys.png")
-labels = get_name(posterior)
-
-burnin = 50_000
-
-
-data_rf = (; [(Symbol(labels[i]), posterior_samples[i, burnin:end]) for i in 1:length(labels)]...)
-transformed_data_rf =
-    (; [(Symbol(labels[i]), transformed_posterior_samples[i, burnin:end]) for i in 1:length(labels)]...)
-
-
-
-# p = pairplot(data_rf => (PairPlots.Scatter(),))
-
-p = pairplot(data_rf => (PairPlots.Contourf(sigmas = 1:1:3),))
-trans_p = pairplot(transformed_data_rf => (PairPlots.Contourf(sigmas = 1:1:3),))
-
-save(density_filepath, p)
-save(transformed_density_filepath, trans_p)
-@info "saved RF contour plot"
-
-#
-#
-#
+    #####
+    # Creates 3 plots: one for a specific case, one with 2 cases, and one with all cases (the final case being the prior).
+
+
+    # date = Date(year,month,day)
+
+    # 2-parameter calibration exp
+    #exp_name = "ent-det-calibration"
+    #date_of_run = Date(2023, 10, 5)
+
+    # 5-parameter calibration exp
+    exp_name = "ent-det-tked-tkee-stab-calibration"
+    date_of_run = Date(2024, 11, 03)
+    @info "plotting results found in $(date_of_run)"
+    # Output figure read/write directory
+    figure_save_directory = joinpath(@__DIR__, "output", exp_name, string(date_of_run))
+    data_save_directory = joinpath(@__DIR__, "output", exp_name, string(date_of_run))
+
+    #case:
+    cases = [
+        "GP", # diagonalize, train scalar GP, assume diag inputs
+        "RF-prior",
+        "RF-vector-svd-nonsep",
+        "RF-vector-svd-sep",
+        "RF-vector-nosvd-nonsep",
+    ]
+    case_rf = cases[3]
+    kernel_rank = 3
+    prior_kernel_rank = 3
+    # load
+    posterior_filepath = joinpath(data_save_directory, "$(case_rf)_$(kernel_rank)_posterior.jld2")
+    if !isfile(posterior_filepath)
+        throw(ArgumentError(posterior_filepath * " not found. 
Please check experiment name and date")) + else + @info "Loading posterior distribution from: " * posterior_filepath + posterior = load(posterior_filepath)["posterior"] + end + # get samples explicitly (may be easier to work with) + posterior_samples = vcat([get_distribution(posterior)[name] for name in get_name(posterior)]...) #samples are columns + transformed_posterior_samples = + mapslices(x -> transform_unconstrained_to_constrained(posterior, x), posterior_samples, dims = 1) + + # histograms + nparam_plots = sum(get_dimensions(posterior)) - 1 + density_filepath = joinpath(figure_save_directory, "$(case_rf)_$(kernel_rank)_posterior_dist_comp.png") + transformed_density_filepath = joinpath(figure_save_directory, "$(case_rf)_$(kernel_rank)_posterior_dist_phys.png") + labels = get_name(posterior) + + burnin = 50_000 + + + data_rf = (; [(Symbol(labels[i]), posterior_samples[i, burnin:end]) for i in 1:length(labels)]...) + transformed_data_rf = + (; [(Symbol(labels[i]), transformed_posterior_samples[i, burnin:end]) for i in 1:length(labels)]...) + + + + # p = pairplot(data_rf => (PairPlots.Scatter(),)) + + p = pairplot(data_rf => (PairPlots.Contourf(sigmas = 1:1:3),)) + trans_p = pairplot(transformed_data_rf => (PairPlots.Contourf(sigmas = 1:1:3),)) -case_gp = cases[1] -# load -posterior_filepath = joinpath(data_save_directory, "$(case_gp)_0_posterior.jld2") -if !isfile(posterior_filepath) - throw(ArgumentError(posterior_filepath * " not found. Please check experiment name and date")) -else - @info "Loading posterior distribution from: " * posterior_filepath - posterior = load(posterior_filepath)["posterior"] -end -# get samples explicitly (may be easier to work with) -posterior_samples = vcat([get_distribution(posterior)[name] for name in get_name(posterior)]...) #samples are columns -transformed_posterior_samples = - mapslices(x -> transform_unconstrained_to_constrained(posterior, x), posterior_samples, dims = 1) - -# histograms -nparam_plots = sum(get_dimensions(posterior)) - 1 -density_filepath = joinpath(figure_save_directory, "$(case_rf)_$(case_gp)_$(kernel_rank)_posterior_dist_comp.png") -transformed_density_filepath = joinpath(figure_save_directory, "$(case_rf)_$(case_gp)_$(kernel_rank)_posterior_dist_phys.png") -labels = get_name(posterior) -data_gp = (; [(Symbol(labels[i]), posterior_samples[i, burnin:end]) for i in 1:length(labels)]...) -transformed_data_gp = - (; [(Symbol(labels[i]), transformed_posterior_samples[i, burnin:end]) for i in 1:length(labels)]...) -# -# -# -gp_smoothing = 1 # >1 = smoothing KDE in plotting - -@warn "Contourf in PairPlots.jl for multiple histograms works in v2.8.0. Check versions if not seen correctly." -p = pairplot( - data_rf => (PairPlots.Contourf(sigmas = 1:1:3),), - data_gp => (PairPlots.Contourf(sigmas = 1:1:3, bandwidth = gp_smoothing),), -) + save(density_filepath, p) + save(transformed_density_filepath, trans_p) + @info "saved RF contour plot" + + # + # + # + + case_gp = cases[1] + # load + posterior_filepath = joinpath(data_save_directory, "$(case_gp)_0_posterior.jld2") + if !isfile(posterior_filepath) + throw(ArgumentError(posterior_filepath * " not found. Please check experiment name and date")) + else + @info "Loading posterior distribution from: " * posterior_filepath + posterior = load(posterior_filepath)["posterior"] + end + # get samples explicitly (may be easier to work with) + posterior_samples = vcat([get_distribution(posterior)[name] for name in get_name(posterior)]...) 
#samples are columns + transformed_posterior_samples = + mapslices(x -> transform_unconstrained_to_constrained(posterior, x), posterior_samples, dims = 1) + + # histograms + nparam_plots = sum(get_dimensions(posterior)) - 1 + density_filepath = joinpath(figure_save_directory, "$(case_rf)_$(case_gp)_$(kernel_rank)_posterior_dist_comp.png") + transformed_density_filepath = + joinpath(figure_save_directory, "$(case_rf)_$(case_gp)_$(kernel_rank)_posterior_dist_phys.png") + labels = get_name(posterior) + data_gp = (; [(Symbol(labels[i]), posterior_samples[i, burnin:end]) for i in 1:length(labels)]...) + transformed_data_gp = + (; [(Symbol(labels[i]), transformed_posterior_samples[i, burnin:end]) for i in 1:length(labels)]...) + # + # + # + gp_smoothing = 1 # >1 = smoothing KDE in plotting + + @warn "Contourf in PairPlots.jl for multiple histograms works in v2.8.0. Check versions if not seen correctly." + p = pairplot( + data_rf => (PairPlots.Contourf(sigmas = 1:1:3),), + data_gp => (PairPlots.Contourf(sigmas = 1:1:3, bandwidth = gp_smoothing),), + ) trans_p = pairplot( - transformed_data_rf => (PairPlots.Contourf(sigmas = 1:1:3),), - transformed_data_gp => (PairPlots.Contourf(sigmas = 1:1:3, bandwidth = gp_smoothing),), -) -#= - trans_p = pairplot( - Series((transformed_data_rf, color=Makie.wong_colors(0.5)[1]) => (PairPlots.Contourf(sigmas = 1:1:3)),), - Series((transformed_data_gp, color=Makie.wong_colors(0.5)[2]) => (PairPlots.Contourf(sigmas = 1:1:3)),), + transformed_data_rf => (PairPlots.Contourf(sigmas = 1:1:3),), + transformed_data_gp => (PairPlots.Contourf(sigmas = 1:1:3, bandwidth = gp_smoothing),), ) -=# + #= + trans_p = pairplot( + Series((transformed_data_rf, color=Makie.wong_colors(0.5)[1]) => (PairPlots.Contourf(sigmas = 1:1:3)),), + Series((transformed_data_gp, color=Makie.wong_colors(0.5)[2]) => (PairPlots.Contourf(sigmas = 1:1:3)),), + ) + =# save(density_filepath, p) -save(transformed_density_filepath, trans_p) + save(transformed_density_filepath, trans_p) + + @info "saved RF/GP contour plot" + + + # Finally include the prior too + case_prior = cases[2] + # load + posterior_filepath = joinpath(data_save_directory, "$(case_prior)_$(prior_kernel_rank)_posterior.jld2") + if !isfile(posterior_filepath) + throw(ArgumentError(posterior_filepath * " not found. Please check experiment name and date")) + else + @info "Loading posterior distribution from: " * posterior_filepath + posterior = load(posterior_filepath)["posterior"] + end + # get samples explicitly (may be easier to work with) + posterior_samples = vcat([get_distribution(posterior)[name] for name in get_name(posterior)]...) #samples are columns + transformed_posterior_samples = + mapslices(x -> transform_unconstrained_to_constrained(posterior, x), posterior_samples, dims = 1) + + # histograms + nparam_plots = sum(get_dimensions(posterior)) - 1 + density_filepath = joinpath(figure_save_directory, "all_$(kernel_rank)_posterior_dist_comp.png") + transformed_density_filepath = joinpath(figure_save_directory, "all_$(kernel_rank)_posterior_dist_phys.png") + labels = get_name(posterior) + + prior_burnin = 240000 + + data_prior = (; [(Symbol(labels[i]), posterior_samples[i, prior_burnin:end]) for i in 1:length(labels)]...) + transformed_data_prior = + (; [(Symbol(labels[i]), transformed_posterior_samples[i, prior_burnin:end]) for i in 1:length(labels)]...) 
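For reference, the `(; ...)` splat idiom used throughout this script turns an MCMC sample matrix (parameters by steps) into a NamedTuple keyed by parameter name, after dropping burn-in, which is the shape `pairplot` consumes. A minimal standalone sketch of that pattern; the labels, sample matrix, and burn-in length below are synthetic stand-ins, not values from the experiment:

using Random

labels = ["param_1", "param_2", "param_3"]                # hypothetical parameter names
samples = randn(MersenneTwister(42), length(labels), 1_000) # n_params x n_mcmc_steps
burnin = 200                                              # discard early, unconverged steps

# one (Symbol => sample vector) pair per parameter, keeping only post-burn-in columns
data_nt = (; [(Symbol(labels[i]), samples[i, burnin:end]) for i in 1:length(labels)]...)

# data_nt can then be passed to PairPlots as in the script, e.g.
# using CairoMakie, PairPlots
# p = pairplot(data_nt => (PairPlots.Contourf(sigmas = 1:1:3),))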
+ # + # + # + + p = pairplot( + data_rf => (PairPlots.Contourf(sigmas = 1:1:3),), + data_gp => (PairPlots.Contourf(sigmas = 1:1:3, bandwidth = gp_smoothing),), + data_prior => (PairPlots.Scatter(),), + ) + trans_p = pairplot( + transformed_data_rf => (PairPlots.Contourf(sigmas = 1:1:3),), + transformed_data_gp => (PairPlots.Contourf(sigmas = 1:1:3, bandwidth = gp_smoothing),), + transformed_data_prior => (PairPlots.Scatter(),), + ) -@info "saved RF/GP contour plot" + save(density_filepath, p) + save(transformed_density_filepath, trans_p) + density_filepath = joinpath(figure_save_directory, "all_$(kernel_rank)_posterior_dist_comp.pdf") + transformed_density_filepath = joinpath(figure_save_directory, "all_$(kernel_rank)_posterior_dist_phys.pdf") + save(density_filepath, p) + save(transformed_density_filepath, trans_p) -# Finally include the prior too -case_prior = cases[2] -# load -posterior_filepath = joinpath(data_save_directory, "$(case_prior)_$(prior_kernel_rank)_posterior.jld2") -if !isfile(posterior_filepath) - throw(ArgumentError(posterior_filepath * " not found. Please check experiment name and date")) -else - @info "Loading posterior distribution from: " * posterior_filepath - posterior = load(posterior_filepath)["posterior"] -end -# get samples explicitly (may be easier to work with) -posterior_samples = vcat([get_distribution(posterior)[name] for name in get_name(posterior)]...) #samples are columns -transformed_posterior_samples = - mapslices(x -> transform_unconstrained_to_constrained(posterior, x), posterior_samples, dims = 1) - -# histograms -nparam_plots = sum(get_dimensions(posterior)) - 1 -density_filepath = joinpath(figure_save_directory, "all_$(kernel_rank)_posterior_dist_comp.png") -transformed_density_filepath = joinpath(figure_save_directory, "all_$(kernel_rank)_posterior_dist_phys.png") -labels = get_name(posterior) - -prior_burnin = 240000 - -data_prior = (; [(Symbol(labels[i]), posterior_samples[i, prior_burnin:end]) for i in 1:length(labels)]...) -transformed_data_prior = - (; [(Symbol(labels[i]), transformed_posterior_samples[i, prior_burnin:end]) for i in 1:length(labels)]...) 
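A short aside on the `mapslices(..., dims = 1)` call that precedes each plotting block above: it applies the unconstrained-to-constrained map one sample (one column) at a time. A self-contained sketch of the same columnwise pattern, with a plain `exp` standing in for `transform_unconstrained_to_constrained`, which needs a real `posterior` object:

# Columnwise-transform sketch; `exp` is only a stand-in for the actual
# ParameterDistributions.transform_unconstrained_to_constrained(posterior, x).
unconstrained = randn(5, 100)                      # n_params x n_samples
to_constrained(x) = exp.(x)                        # stand-in map: real line -> (0, Inf)
constrained = mapslices(to_constrained, unconstrained, dims = 1)  # applied per column
@assert size(constrained) == size(unconstrained)
@assert all(constrained .> 0)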
-# -# -# - -p = pairplot( - data_rf => (PairPlots.Contourf(sigmas = 1:1:3),), - data_gp => (PairPlots.Contourf(sigmas = 1:1:3, bandwidth = gp_smoothing),), - data_prior => (PairPlots.Scatter(),), -) -trans_p = pairplot( - transformed_data_rf => (PairPlots.Contourf(sigmas = 1:1:3),), - transformed_data_gp => (PairPlots.Contourf(sigmas = 1:1:3, bandwidth = gp_smoothing),), - transformed_data_prior => (PairPlots.Scatter(),), -) - -save(density_filepath, p) -save(transformed_density_filepath, trans_p) - -density_filepath = joinpath(figure_save_directory, "all_$(kernel_rank)_posterior_dist_comp.pdf") -transformed_density_filepath = joinpath(figure_save_directory, "all_$(kernel_rank)_posterior_dist_phys.pdf") -save(density_filepath, p) -save(transformed_density_filepath, trans_p) - -@info "saved RF/Prior/GP contour plot" + @info "saved RF/Prior/GP contour plot" end main() diff --git a/examples/EDMF_data/uq_for_edmf.jl b/examples/EDMF_data/uq_for_edmf.jl index 80516cbc..5deb7811 100644 --- a/examples/EDMF_data/uq_for_edmf.jl +++ b/examples/EDMF_data/uq_for_edmf.jl @@ -202,17 +202,18 @@ function main() kernel_rank = 3 # svd-sep should be kr - 5 (as input rank is set to 5) n_cross_val_sets = 2 @info "Kernel rank: $(kernel_rank)" - max_feature_size=size(get_outputs(input_output_pairs), 2) * size(get_outputs(input_output_pairs),1) * (1-train_frac) + max_feature_size = + size(get_outputs(input_output_pairs), 2) * size(get_outputs(input_output_pairs), 1) * (1 - train_frac) overrides = Dict( "verbose" => true, "train_fraction" => train_frac, "scheduler" => DataMisfitController(terminate_at = 1000), "cov_sample_multiplier" => 0.2, "n_iteration" => 15, - "n_features_opt" => Int(floor((max_feature_size/5))),# here: /5 with rank <= 3 works - "localization" => SEC(0.05), + "n_features_opt" => Int(floor((max_feature_size / 5))),# here: /5 with rank <= 3 works + "localization" => SEC(0.05), "n_ensemble" => 400, - "n_cross_val_sets" => n_cross_val_sets, + "n_cross_val_sets" => n_cross_val_sets, ) if case == "RF-prior" overrides = Dict("verbose" => true, "cov_sample_multiplier" => 0.01, "n_iteration" => 0) @@ -225,7 +226,7 @@ function main() decorrelate = true opt_diagnostics = [] if case == "GP" - kernel_rank=0 + kernel_rank = 0 gppackage = Emulators.SKLJL() pred_type = Emulators.YType() mlt = GaussianProcess( @@ -235,7 +236,7 @@ function main() noise_learn = false, ) elseif case ∈ ["RF-vector-svd-sep"] - kernel_structure = SeparableKernel(LowRankFactor(5, nugget),LowRankFactor(kernel_rank,nugget)) + kernel_structure = SeparableKernel(LowRankFactor(5, nugget), LowRankFactor(kernel_rank, nugget)) n_features = 500 mlt = VectorRandomFeatureInterface( @@ -246,7 +247,7 @@ function main() kernel_structure = kernel_structure, optimizer_options = overrides, ) - + elseif case ∈ ["RF-vector-svd-nonsep", "RF-prior"] kernel_structure = NonseparableKernel(LowRankFactor(kernel_rank, nugget)) n_features = 500 @@ -320,7 +321,7 @@ function main() emulator_filepath = joinpath(data_save_directory, "$(case)_$(kernel_rank)_emulator.jld2") save(emulator_filepath, "emulator", emulator) - if length(opt_diagnostics)>0 + if length(opt_diagnostics) > 0 eki_conv_filepath = joinpath(data_save_directory, "$(case)_$(kernel_rank)_eki-conv.jld2") save(eki_conv_filepath, "opt_diagnostics", opt_diagnostics) end diff --git a/examples/Emulator/G-function/emulate-test-n-features.jl b/examples/Emulator/G-function/emulate-test-n-features.jl index 2a3651b9..042694fb 100644 --- a/examples/Emulator/G-function/emulate-test-n-features.jl +++ 
b/examples/Emulator/G-function/emulate-test-n-features.jl @@ -99,16 +99,16 @@ function main() nugget = Float64(1e-12) n_features_vec = [25, 50, 100, 200, 400, 800] # < data points ideally as output dim = 1 - ttt = zeros(length(n_features_vec),n_repeats) - train_err = zeros(length(n_features_vec),n_repeats) - test_err = zeros(length(n_features_vec),n_repeats) + ttt = zeros(length(n_features_vec), n_repeats) + train_err = zeros(length(n_features_vec), n_repeats) + test_err = zeros(length(n_features_vec), n_repeats) n_cross_val_sets = 2 - + for (f_idx, n_features_opt) in enumerate(n_features_vec) y_preds = [] result_preds = [] - + overrides = Dict( #"verbose" => true, "scheduler" => DataMisfitController(terminate_at = 1e3), @@ -125,7 +125,7 @@ function main() overrides["n_iteration"] = 0 overrides["cov_sample_multiplier"] = 0.1 end - + for rep_idx in 1:n_repeats @info "Testing #features = $(n_features_opt) \n repeat $(rep_idx) of $(n_repeats)" # Build ML tools @@ -149,163 +149,170 @@ function main() optimizer_options = deepcopy(overrides), ) end - + # Emulate ttt[f_idx, rep_idx] = @elapsed begin emulator = Emulator(mlt, iopairs; obs_noise_cov = Γ * I, decorrelate = decorrelate) optimize_hyperparameters!(emulator) end - + # errors: # training error y_pred, y_var = predict(emulator, get_inputs(iopairs), transform_to_real = true) - train_err[f_idx,rep_idx] = sqrt(sum((y_pred - get_outputs(iopairs)).^2))/n_train_pts + train_err[f_idx, rep_idx] = sqrt(sum((y_pred - get_outputs(iopairs)) .^ 2)) / n_train_pts # predict on all test points with emulator (example) y_pred, y_var = predict(emulator, samples', transform_to_real = true) #predict on all points - ind_test = ind_total[n_train_pts+1:end] - test_err[f_idx,rep_idx] = sqrt(sum((y_pred[ind_test] - y[ind_test]).^2))/length(ind_test) + ind_test = ind_total[(n_train_pts + 1):end] + test_err[f_idx, rep_idx] = sqrt(sum((y_pred[ind_test] - y[ind_test]) .^ 2)) / length(ind_test) JLD2.save( - joinpath(output_directory, "diff_n_features_GFunction_$(case)_$(n_dimensions)_ntest-$(Int(n_train_pts/5))_cv-$(n_cross_val_sets).jld2"), - "n_features_vec", n_features_vec, - "timings", ttt, - "train_err", train_err, - "test_err", test_err, + joinpath( + output_directory, + "diff_n_features_GFunction_$(case)_$(n_dimensions)_ntest-$(Int(n_train_pts/5))_cv-$(n_cross_val_sets).jld2", + ), + "n_features_vec", + n_features_vec, + "timings", + ttt, + "train_err", + train_err, + "test_err", + test_err, ) #save every iteration for safety - + # obtain emulated Sobol indices result_pred = analyze(data, y_pred') println("First order: ", result_pred[:firstorder]) println("Total order: ", result_pred[:totalorder]) - + push!(y_preds, y_pred) push!(result_preds, result_pred) GC.gc() #collect garbage - - # PLotting: -#= if rep_idx == 1 - f3, ax3, plt3 = scatter( - 1:n_dimensions, - result_preds[1][:firstorder]; - color = :red, - markersize = 8, - marker = :cross, - label = "V-emulate", - title = "input dimension: $(n_dimensions)", - ) - scatter!(ax3, result[:firstorder], color = :red, markersize = 8, label = "V-approx") - scatter!(ax3, V, color = :red, markersize = 12, marker = :xcross, label = "V-true") - scatter!( - ax3, - 1:n_dimensions, - result_preds[1][:totalorder]; - color = :blue, - label = "TV-emulate", - markersize = 8, - marker = :cross, - ) - scatter!(ax3, result[:totalorder], color = :blue, markersize = 8, label = "TV-approx") - scatter!(ax3, TV, color = :blue, markersize = 12, marker = :xcross, label = "TV-true") - axislegend(ax3) - - CairoMakie.save( - 
joinpath(output_directory, "GFunction_sens_$(case)_$(n_dimensions).png"), - f3, - px_per_unit = 3, - ) - CairoMakie.save( - joinpath(output_directory, "GFunction_sens_$(case)_$(n_dimensions).pdf"), - f3, - px_per_unit = 3, - ) - else - # get percentiles: - fo_mat = zeros(n_dimensions, rep_idx) - to_mat = zeros(n_dimensions, rep_idx) - - for (idx, rp) in enumerate(result_preds) - fo_mat[:, idx] = rp[:firstorder] - to_mat[:, idx] = rp[:totalorder] - end - - firstorder_med = percentile.(eachrow(fo_mat), 50) - firstorder_low = percentile.(eachrow(fo_mat), 5) - firstorder_up = percentile.(eachrow(fo_mat), 95) - - totalorder_med = percentile.(eachrow(to_mat), 50) - totalorder_low = percentile.(eachrow(to_mat), 5) - totalorder_up = percentile.(eachrow(to_mat), 95) - - println("(50%) firstorder: ", firstorder_med) - println("(5%) firstorder: ", firstorder_low) - println("(95%) firstorder: ", firstorder_up) - - println("(50%) totalorder: ", totalorder_med) - println("(5%) totalorder: ", totalorder_low) - println("(95%) totalorder: ", totalorder_up) - # - f3, ax3, plt3 = errorbars( - 1:n_dimensions, - firstorder_med, - firstorder_med - firstorder_low, - firstorder_up - firstorder_med; - whiskerwidth = 10, - color = :red, - label = "V-emulate", - title = "input dimension: $(n_dimensions)", - ) - scatter!(ax3, result[:firstorder], color = :red, markersize = 8, label = "V-approx") - scatter!(ax3, V, color = :red, markersize = 12, marker = :xcross, label = "V-true") - errorbars!( - ax3, - 1:n_dimensions, - totalorder_med, - totalorder_med - totalorder_low, - totalorder_up - totalorder_med; - whiskerwidth = 10, - color = :blue, - label = "TV-emulate", - ) - scatter!(ax3, result[:totalorder], color = :blue, markersize = 8, label = "TV-approx") - scatter!(ax3, TV, color = :blue, markersize = 12, marker = :xcross, label = "TV-true") - axislegend(ax3) - - CairoMakie.save( - joinpath(output_directory, "GFunction_sens_$(case)_$(n_dimensions).png"), - f3, - px_per_unit = 3, - ) - CairoMakie.save( - joinpath(output_directory, "GFunction_sens_$(case)_$(n_dimensions).pdf"), - f3, - px_per_unit = 3, - ) - end - # plots - first 3 dimensions - if rep_idx == 1 - f2 = Figure(resolution = (1.618 * plot_dim * 300, 300), markersize = 4) - for i in 1:plot_dim - ax2 = Axis(f2[1, i], xlabel = "x" * string(i), ylabel = "f") - scatter!(ax2, samples[:, i], y_preds[1][:], color = :blue) - scatter!(ax2, samples[ind, i], y[ind] + noise, color = :red, markersize = 8) - end - CairoMakie.save( - joinpath(output_directory, "GFunction_slices_$(case)_$(n_dimensions).png"), - f2, - px_per_unit = 3, - ) - CairoMakie.save( - joinpath(output_directory, "GFunction_slices_$(case)_$(n_dimensions).pdf"), - f2, - px_per_unit = 3, - ) - end - - end - =# + # PLotting: + #= if rep_idx == 1 + f3, ax3, plt3 = scatter( + 1:n_dimensions, + result_preds[1][:firstorder]; + color = :red, + markersize = 8, + marker = :cross, + label = "V-emulate", + title = "input dimension: $(n_dimensions)", + ) + scatter!(ax3, result[:firstorder], color = :red, markersize = 8, label = "V-approx") + scatter!(ax3, V, color = :red, markersize = 12, marker = :xcross, label = "V-true") + scatter!( + ax3, + 1:n_dimensions, + result_preds[1][:totalorder]; + color = :blue, + label = "TV-emulate", + markersize = 8, + marker = :cross, + ) + scatter!(ax3, result[:totalorder], color = :blue, markersize = 8, label = "TV-approx") + scatter!(ax3, TV, color = :blue, markersize = 12, marker = :xcross, label = "TV-true") + axislegend(ax3) + + CairoMakie.save( + joinpath(output_directory, 
"GFunction_sens_$(case)_$(n_dimensions).png"), + f3, + px_per_unit = 3, + ) + CairoMakie.save( + joinpath(output_directory, "GFunction_sens_$(case)_$(n_dimensions).pdf"), + f3, + px_per_unit = 3, + ) + else + # get percentiles: + fo_mat = zeros(n_dimensions, rep_idx) + to_mat = zeros(n_dimensions, rep_idx) + + for (idx, rp) in enumerate(result_preds) + fo_mat[:, idx] = rp[:firstorder] + to_mat[:, idx] = rp[:totalorder] + end + + firstorder_med = percentile.(eachrow(fo_mat), 50) + firstorder_low = percentile.(eachrow(fo_mat), 5) + firstorder_up = percentile.(eachrow(fo_mat), 95) + + totalorder_med = percentile.(eachrow(to_mat), 50) + totalorder_low = percentile.(eachrow(to_mat), 5) + totalorder_up = percentile.(eachrow(to_mat), 95) + + println("(50%) firstorder: ", firstorder_med) + println("(5%) firstorder: ", firstorder_low) + println("(95%) firstorder: ", firstorder_up) + + println("(50%) totalorder: ", totalorder_med) + println("(5%) totalorder: ", totalorder_low) + println("(95%) totalorder: ", totalorder_up) + # + f3, ax3, plt3 = errorbars( + 1:n_dimensions, + firstorder_med, + firstorder_med - firstorder_low, + firstorder_up - firstorder_med; + whiskerwidth = 10, + color = :red, + label = "V-emulate", + title = "input dimension: $(n_dimensions)", + ) + scatter!(ax3, result[:firstorder], color = :red, markersize = 8, label = "V-approx") + scatter!(ax3, V, color = :red, markersize = 12, marker = :xcross, label = "V-true") + errorbars!( + ax3, + 1:n_dimensions, + totalorder_med, + totalorder_med - totalorder_low, + totalorder_up - totalorder_med; + whiskerwidth = 10, + color = :blue, + label = "TV-emulate", + ) + scatter!(ax3, result[:totalorder], color = :blue, markersize = 8, label = "TV-approx") + scatter!(ax3, TV, color = :blue, markersize = 12, marker = :xcross, label = "TV-true") + axislegend(ax3) + + CairoMakie.save( + joinpath(output_directory, "GFunction_sens_$(case)_$(n_dimensions).png"), + f3, + px_per_unit = 3, + ) + CairoMakie.save( + joinpath(output_directory, "GFunction_sens_$(case)_$(n_dimensions).pdf"), + f3, + px_per_unit = 3, + ) + + end + # plots - first 3 dimensions + if rep_idx == 1 + f2 = Figure(resolution = (1.618 * plot_dim * 300, 300), markersize = 4) + for i in 1:plot_dim + ax2 = Axis(f2[1, i], xlabel = "x" * string(i), ylabel = "f") + scatter!(ax2, samples[:, i], y_preds[1][:], color = :blue) + scatter!(ax2, samples[ind, i], y[ind] + noise, color = :red, markersize = 8) + end + CairoMakie.save( + joinpath(output_directory, "GFunction_slices_$(case)_$(n_dimensions).png"), + f2, + px_per_unit = 3, + ) + CairoMakie.save( + joinpath(output_directory, "GFunction_slices_$(case)_$(n_dimensions).pdf"), + f2, + px_per_unit = 3, + ) + end + + end + =# end end diff --git a/examples/Emulator/G-function/emulate.jl b/examples/Emulator/G-function/emulate.jl index aa1c117d..23007f5d 100644 --- a/examples/Emulator/G-function/emulate.jl +++ b/examples/Emulator/G-function/emulate.jl @@ -1,110 +1,110 @@ - using GlobalSensitivityAnalysis - const GSA = GlobalSensitivityAnalysis - using Distributions - using DataStructures - using Random - using LinearAlgebra - import StatsBase: percentile - using JLD2 - - using CalibrateEmulateSample.EnsembleKalmanProcesses - using CalibrateEmulateSample.Emulators - using CalibrateEmulateSample.DataContainers - using CalibrateEmulateSample.EnsembleKalmanProcesses.Localizers - - using CairoMakie, ColorSchemes #for plots - seed = 2589436 - - output_directory = joinpath(@__DIR__, "output") - if !isdir(output_directory) - mkdir(output_directory) - end 
- - - inner_func(x::AV, a::AV) where {AV <: AbstractVector} = prod((abs.(4 * x .- 2) + a) ./ (1 .+ a)) - - "G-Function taken from https://www.sfu.ca/~ssurjano/gfunc.html" - function GFunction(x::AM, a::AV) where {AM <: AbstractMatrix, AV <: AbstractVector} - @assert size(x, 1) == length(a) - return mapslices(y -> inner_func(y, a), x; dims = 1) #applys the map to columns - end - - function GFunction(x::AM) where {AM <: AbstractMatrix} - a = [(i - 1.0) / 2.0 for i in 1:size(x, 1)] - return GFunction(x, a) - end - - function main() - - rng = MersenneTwister(seed) - - n_repeats = 20# repeat exp with same data. - n_dimensions = 20 - # To create the sampling - n_data_gen = 800 - - data = - SobolData(params = OrderedDict([Pair(Symbol("x", i), Uniform(0, 1)) for i in 1:n_dimensions]), N = n_data_gen) - - # To perform global analysis, - # one must generate samples using Sobol sequence (i.e. creates more than N points) - samples = GSA.sample(data) - n_data = size(samples, 1) # [n_samples x n_dim] - println("number of sobol points: ", n_data) - # run model (example) - y = GFunction(samples')' # G is applied to columns - # perform Sobol Analysis - result = analyze(data, y) - - # plot the first 3 dimensions - plot_dim = n_dimensions >= 3 ? 3 : n_dimensions - f1 = Figure(resolution = (1.618 * plot_dim * 300, 300), markersize = 4) - for i in 1:plot_dim - ax = Axis(f1[1, i], xlabel = "x" * string(i), ylabel = "f") - scatter!(ax, samples[:, i], y[:], color = :orange) - end - - CairoMakie.save(joinpath(output_directory, "GFunction_slices_truth_$(n_dimensions).png"), f1, px_per_unit = 3) - CairoMakie.save(joinpath(output_directory, "GFunction_slices_truth_$(n_dimensions).pdf"), f1, px_per_unit = 3) - - n_train_pts = n_dimensions * 250 - ind = shuffle!(rng, Vector(1:n_data))[1:n_train_pts] - # now subsample the samples data - n_tp = length(ind) - input = zeros(n_dimensions, n_tp) - output = zeros(1, n_tp) - Γ = 1e-3 - noise = rand(rng, Normal(0, Γ), n_tp) - for i in 1:n_tp - input[:, i] = samples[ind[i], :] - output[i] = y[ind[i]] + noise[i] - end - iopairs = PairedDataContainer(input, output) - - # analytic sobol indices taken from - # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8989694/pdf/main.pdf - a = [(i - 1.0) / 2.0 for i in 1:n_dimensions] # a_i < a_j => a_i more sensitive - prod_tmp = prod(1 .+ 1 ./ (3 .* (1 .+ a) .^ 2)) - 1 - V = [(1 / (3 * (1 + ai)^2)) / prod_tmp for ai in a] - prod_tmp2 = [prod(1 .+ 1 ./ (3 .* (1 .+ a[1:end .!== j]) .^ 2)) for j in 1:n_dimensions] - TV = [(1 / (3 * (1 + ai)^2)) * prod_tmp2[i] / prod_tmp for (i, ai) in enumerate(a)] - - - - cases = ["Prior", "GP", "RF-scalar"] - case = cases[3] - decorrelate = true - nugget = Float64(1e-12) - - overrides = Dict( - "verbose" => true, - "scheduler" => DataMisfitController(terminate_at = 1e2), - "n_features_opt" => 150, - "n_iteration" => 10, - "cov_sample_multiplier" => 3.0, - #"localization" => SEC(0.1),#,Doesnt help much tbh - #"accelerator" => NesterovAccelerator(), +using GlobalSensitivityAnalysis +const GSA = GlobalSensitivityAnalysis +using Distributions +using DataStructures +using Random +using LinearAlgebra +import StatsBase: percentile +using JLD2 + +using CalibrateEmulateSample.EnsembleKalmanProcesses +using CalibrateEmulateSample.Emulators +using CalibrateEmulateSample.DataContainers +using CalibrateEmulateSample.EnsembleKalmanProcesses.Localizers + +using CairoMakie, ColorSchemes #for plots +seed = 2589436 + +output_directory = joinpath(@__DIR__, "output") +if !isdir(output_directory) + mkdir(output_directory) +end + + 
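For context on the analytic indices this script compares against: for the G-function with coefficients a_i, the first-order (V) and total-order (TV) Sobol indices have closed forms, and the sketch below mirrors the V / TV expressions used later in main(), with only the dimension shrunk for brevity:

# Closed-form Sobol indices for the G-function (smaller a_i => more sensitive x_i),
# mirroring the V / TV computation in main().
n_dim = 5
a = [(i - 1.0) / 2.0 for i in 1:n_dim]
total_var = prod(1 .+ 1 ./ (3 .* (1 .+ a) .^ 2)) - 1               # normalizing variance
V = [(1 / (3 * (1 + ai)^2)) / total_var for ai in a]               # first-order indices
partials = [prod(1 .+ 1 ./ (3 .* (1 .+ a[1:end .!== j]) .^ 2)) for j in 1:n_dim]
TV = [(1 / (3 * (1 + ai)^2)) * partials[i] / total_var for (i, ai) in enumerate(a)]
println("first order: ", round.(V, digits = 3))
println("total order: ", round.(TV, digits = 3))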
+inner_func(x::AV, a::AV) where {AV <: AbstractVector} = prod((abs.(4 * x .- 2) + a) ./ (1 .+ a)) + +"G-Function taken from https://www.sfu.ca/~ssurjano/gfunc.html" +function GFunction(x::AM, a::AV) where {AM <: AbstractMatrix, AV <: AbstractVector} + @assert size(x, 1) == length(a) + return mapslices(y -> inner_func(y, a), x; dims = 1) #applys the map to columns +end + +function GFunction(x::AM) where {AM <: AbstractMatrix} + a = [(i - 1.0) / 2.0 for i in 1:size(x, 1)] + return GFunction(x, a) +end + +function main() + + rng = MersenneTwister(seed) + + n_repeats = 20# repeat exp with same data. + n_dimensions = 20 + # To create the sampling + n_data_gen = 800 + + data = + SobolData(params = OrderedDict([Pair(Symbol("x", i), Uniform(0, 1)) for i in 1:n_dimensions]), N = n_data_gen) + + # To perform global analysis, + # one must generate samples using Sobol sequence (i.e. creates more than N points) + samples = GSA.sample(data) + n_data = size(samples, 1) # [n_samples x n_dim] + println("number of sobol points: ", n_data) + # run model (example) + y = GFunction(samples')' # G is applied to columns + # perform Sobol Analysis + result = analyze(data, y) + + # plot the first 3 dimensions + plot_dim = n_dimensions >= 3 ? 3 : n_dimensions + f1 = Figure(resolution = (1.618 * plot_dim * 300, 300), markersize = 4) + for i in 1:plot_dim + ax = Axis(f1[1, i], xlabel = "x" * string(i), ylabel = "f") + scatter!(ax, samples[:, i], y[:], color = :orange) + end + + CairoMakie.save(joinpath(output_directory, "GFunction_slices_truth_$(n_dimensions).png"), f1, px_per_unit = 3) + CairoMakie.save(joinpath(output_directory, "GFunction_slices_truth_$(n_dimensions).pdf"), f1, px_per_unit = 3) + + n_train_pts = n_dimensions * 250 + ind = shuffle!(rng, Vector(1:n_data))[1:n_train_pts] + # now subsample the samples data + n_tp = length(ind) + input = zeros(n_dimensions, n_tp) + output = zeros(1, n_tp) + Γ = 1e-3 + noise = rand(rng, Normal(0, Γ), n_tp) + for i in 1:n_tp + input[:, i] = samples[ind[i], :] + output[i] = y[ind[i]] + noise[i] + end + iopairs = PairedDataContainer(input, output) + + # analytic sobol indices taken from + # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8989694/pdf/main.pdf + a = [(i - 1.0) / 2.0 for i in 1:n_dimensions] # a_i < a_j => a_i more sensitive + prod_tmp = prod(1 .+ 1 ./ (3 .* (1 .+ a) .^ 2)) - 1 + V = [(1 / (3 * (1 + ai)^2)) / prod_tmp for ai in a] + prod_tmp2 = [prod(1 .+ 1 ./ (3 .* (1 .+ a[1:end .!== j]) .^ 2)) for j in 1:n_dimensions] + TV = [(1 / (3 * (1 + ai)^2)) * prod_tmp2[i] / prod_tmp for (i, ai) in enumerate(a)] + + + + cases = ["Prior", "GP", "RF-scalar"] + case = cases[3] + decorrelate = true + nugget = Float64(1e-12) + + overrides = Dict( + "verbose" => true, + "scheduler" => DataMisfitController(terminate_at = 1e2), + "n_features_opt" => 150, + "n_iteration" => 10, + "cov_sample_multiplier" => 3.0, + #"localization" => SEC(0.1),#,Doesnt help much tbh + #"accelerator" => NesterovAccelerator(), "n_ensemble" => 100, #40*n_dimensions, "n_cross_val_sets" => 4, ) @@ -147,13 +147,13 @@ emulator = Emulator(mlt, iopairs; obs_noise_cov = Γ * I, decorrelate = decorrelate) optimize_hyperparameters!(emulator) end - + if case == "RF-scalar" diag_tmp = reduce(hcat, get_optimizer(mlt)) # (n_iteration, dim_output=1) convergence for each scalar mode as cols push!(opt_diagnostics, diag_tmp) end - - @info "statistics of training time for case $(case): \n mean(s): $(mean(times[1:rep_idx])) \n var(s) : $(var(times[1:rep_idx]))" + + @info "statistics of training time for case $(case): \n 
mean(s): $(mean(times[1:rep_idx])) \n var(s) : $(var(times[1:rep_idx]))" # predict on all Sobol points with emulator (example) y_pred, y_var = predict(emulator, samples', transform_to_real = true) @@ -169,8 +169,8 @@ # PLotting: fontsize = 24 if rep_idx == 1 - f3 = Figure(markersize = 8,fontsize = fontsize) - ax3 = Axis(f3[1,1]) + f3 = Figure(markersize = 8, fontsize = fontsize) + ax3 = Axis(f3[1, 1]) scatter!( ax3, 1:n_dimensions, @@ -230,8 +230,8 @@ println("(5%) totalorder: ", totalorder_low) println("(95%) totalorder: ", totalorder_up) # - f3 = Figure(markersize = 8,fontsize = fontsize) - ax3 = Axis(f3[1,1]) + f3 = Figure(markersize = 8, fontsize = fontsize) + ax3 = Axis(f3[1, 1]) errorbars!( ax3, 1:n_dimensions, @@ -316,7 +316,7 @@ end - + println(" ") println("True Sobol Indices") println("******************") @@ -331,8 +331,8 @@ println("Sampled Emulated Sobol Indices (# obs $n_train_pts, noise var $Γ)") println("***************************************************************") - - + + jldsave( joinpath(output_directory, "GFunction_$(case)_$(n_dimensions).jld2"); sobol_pts = samples, diff --git a/examples/Emulator/G-function/plot_result.jl b/examples/Emulator/G-function/plot_result.jl index d66e560b..03455b87 100644 --- a/examples/Emulator/G-function/plot_result.jl +++ b/examples/Emulator/G-function/plot_result.jl @@ -10,7 +10,7 @@ function main() n_dimensions = 3 filename = joinpath(output_directory, "Gfunction_$(case)_$(n_dimensions).jld2") legend = true - + ( sobol_pts, train_idx, @@ -39,8 +39,8 @@ function main() n_repeats = length(mlt_sobol) fontsize = 24 if n_repeats == 1 - f3 = Figure(markersize = 8,fontsize = fontsize) - ax3 = Axis(f3[1,1], xticks = 1:2:n_dimensions, ylabel = "Sobol Index", xlabel="i") + f3 = Figure(markersize = 8, fontsize = fontsize) + ax3 = Axis(f3[1, 1], xticks = 1:2:n_dimensions, ylabel = "Sobol Index", xlabel = "i") scatter!( ax3, 1:n_dimensions, @@ -71,7 +71,7 @@ function main() CairoMakie.save(png_out, f3, px_per_unit = 3) CairoMakie.save(pdf_out, f3, px_per_unit = 3) @info "Plotted sensitivities, case dim = $n_dimensions, \n storing plots in: \n $png_out \n $pdf_out" - + else # get percentiles: fo_mat = zeros(n_dimensions, n_repeats) @@ -98,8 +98,8 @@ function main() println("(5%) totalorder: ", totalorder_low) println("(95%) totalorder: ", totalorder_up) # - f3 = Figure(markersize = 8,fontsize = fontsize) - ax3 = Axis(f3[1,1], xticks = 1:2:n_dimensions, ylabel = "Sobol Index", xlabel = "i") + f3 = Figure(markersize = 8, fontsize = fontsize) + ax3 = Axis(f3[1, 1], xticks = 1:2:n_dimensions, ylabel = "Sobol Index", xlabel = "i") errorbars!( ax3, 1:n_dimensions, @@ -132,11 +132,11 @@ function main() CairoMakie.save(png_out, f3, px_per_unit = 3) CairoMakie.save(pdf_out, f3, px_per_unit = 3) @info "Plotted sensitivities, case dim = $n_dimensions, \n storing plots in: \n $png_out \n $pdf_out" - + end # plots - first 3 dimensions plot_dim = n_dimensions >= 3 ? 
3 : n_dimensions - f2 = Figure(resolution = (1.618 * plot_dim * 300, 300), markersize = 4, fontsize=fontsize) + f2 = Figure(resolution = (1.618 * plot_dim * 300, 300), markersize = 4, fontsize = fontsize) for i in 1:plot_dim ax2 = Axis(f2[1, i], xlabel = "x" * string(i), ylabel = "f") scatter!(ax2, sobol_pts[:, i], mlt_pred_y[1][:], color = :blue) diff --git a/examples/Emulator/Ishigami/emulate.jl b/examples/Emulator/Ishigami/emulate.jl index a02443bb..45e04acd 100644 --- a/examples/Emulator/Ishigami/emulate.jl +++ b/examples/Emulator/Ishigami/emulate.jl @@ -77,7 +77,7 @@ function main() noise = rand(rng, Normal(0, Γ), n_tp) for i in 1:n_tp input[:, i] = samples[ind[i], :] - output[i] = y[ind[i]] + noise[i] + output[i] = y[ind[i]] + noise[i] end iopairs = PairedDataContainer(input, output) @@ -198,7 +198,7 @@ function main() # plots - f2 = Figure(resolution = (1.618 * 900, 300), markersize = 4, fontsize=28) + f2 = Figure(resolution = (1.618 * 900, 300), markersize = 4, fontsize = 28) axx_em = Axis(f2[1, 1], xlabel = "x1", ylabel = "f") axy_em = Axis(f2[1, 2], xlabel = "x2", ylabel = "f") axz_em = Axis(f2[1, 3], xlabel = "x3", ylabel = "f") diff --git a/examples/Emulator/Ishigami/plot_result.jl b/examples/Emulator/Ishigami/plot_result.jl index f6fa2257..18345977 100644 --- a/examples/Emulator/Ishigami/plot_result.jl +++ b/examples/Emulator/Ishigami/plot_result.jl @@ -4,33 +4,34 @@ using JLD2 function main() - - + + output_directory = "output" cases = ["Prior", "GP", "RF-scalar"] case = cases[2] filename = joinpath(output_directory, "results_$case.jld2") - (sobol_pts, train_idx, mlt_pred_y, mlt_sobol, analytic_sobol, true_y, noise_sample, noise_cov, estimated_sobol) = load( - filename, - "sobol_pts", - "train_idx", - "mlt_pred_y", - "mlt_sobol", - "analytic_sobol", - "true_y", - "noise_sample", - "noise_cov", - "estimated_sobol", - ) + (sobol_pts, train_idx, mlt_pred_y, mlt_sobol, analytic_sobol, true_y, noise_sample, noise_cov, estimated_sobol) = + load( + filename, + "sobol_pts", + "train_idx", + "mlt_pred_y", + "mlt_sobol", + "analytic_sobol", + "true_y", + "noise_sample", + "noise_cov", + "estimated_sobol", + ) n_data = size(sobol_pts, 1) n_train_pts = length(train_idx) n_repeats = length(mlt_sobol) (V, V1, V2, V3, VT1, VT2, VT3) = analytic_sobol - fontsize=28 + fontsize = 28 f1 = Figure(resolution = (1.618 * 900, 300), markersize = 4, fontsize = fontsize) axx = Axis(f1[1, 1], xlabel = "x1", ylabel = "f") axy = Axis(f1[1, 2], xlabel = "x2", ylabel = "f") diff --git a/examples/Emulator/L63/emulate.jl b/examples/Emulator/L63/emulate.jl index b63e3a9b..64ee4b6c 100644 --- a/examples/Emulator/L63/emulate.jl +++ b/examples/Emulator/L63/emulate.jl @@ -31,7 +31,7 @@ function main() #for later plots fontsize = 20 - wideticks= WilkinsonTicks(3, k_min=3, k_max=4) # prefer few ticks + wideticks = WilkinsonTicks(3, k_min = 3, k_max = 4) # prefer few ticks # Run L63 from 0 -> tmax @@ -96,7 +96,16 @@ function main() # Emulate - cases = ["GP", "RF-prior", "RF-scalar", "RF-scalar-diagin", "RF-svd-nonsep", "RF-nosvd-nonsep", "RF-nosvd-sep","RF-svd-sep"] + cases = [ + "GP", + "RF-prior", + "RF-scalar", + "RF-scalar-diagin", + "RF-svd-nonsep", + "RF-nosvd-nonsep", + "RF-nosvd-sep", + "RF-svd-sep", + ] case = cases[7] @@ -212,7 +221,7 @@ function main() optimize_hyperparameters!(emulator) # diagnostics - if case ∈ ["RF-svd-nonsep", "RF-nosvd-nonsep", "RF-svd-sep"]] + if case ∈ ["RF-svd-nonsep", "RF-nosvd-nonsep", "RF-svd-sep"] push!(opt_diagnostics, get_optimizer(mlt)[1]) #length-1 vec of vec -> vec end @@ 
-249,10 +258,10 @@ function main() if rep_idx == 1 # plotting trace - f = Figure(size = (900, 450), fontsize=fontsize) + f = Figure(size = (900, 450), fontsize = fontsize) axx = Axis(f[1, 1], ylabel = "x", yticks = wideticks) axy = Axis(f[2, 1], ylabel = "y", yticks = wideticks) - axz = Axis(f[3, 1], xlabel = "time", ylabel = "z", yticks = [10,30,50]) + axz = Axis(f[3, 1], xlabel = "time", ylabel = "z", yticks = [10, 30, 50]) xx = 0:dt:tmax_test lines!(axx, xx, u_test_tmp[1, :], color = :blue) @@ -269,7 +278,7 @@ function main() save(joinpath(output_directory, case * "_l63_test.pdf"), f, pt_per_unit = 3) # plot attractor - f3 = Figure(fontsize=fontsize) + f3 = Figure(fontsize = fontsize) lines(f3[1, 1], u_test_tmp[1, :], u_test_tmp[3, :], color = :blue) lines(f3[2, 1], solplot[1, :], solplot[3, :], color = :orange) @@ -278,11 +287,11 @@ function main() save(joinpath(output_directory, case * "_l63_attr.pdf"), f3, pt_per_unit = 3) # plotting histograms - f2 = Figure(fontsize=1.25*fontsize) - axx = Axis(f2[1, 1], xlabel = "x", ylabel = "pdf", xticks=wideticks, yticklabelsvisible = false) - axy = Axis(f2[1, 2], xlabel = "y", xticks=wideticks, yticklabelsvisible = false) - axz = Axis(f2[1, 3], xlabel = "z", xticks= [10,30,50], yticklabelsvisible = false) - + f2 = Figure(fontsize = 1.25 * fontsize) + axx = Axis(f2[1, 1], xlabel = "x", ylabel = "pdf", xticks = wideticks, yticklabelsvisible = false) + axy = Axis(f2[1, 2], xlabel = "y", xticks = wideticks, yticklabelsvisible = false) + axz = Axis(f2[1, 3], xlabel = "z", xticks = [10, 30, 50], yticklabelsvisible = false) + hist!(axx, u_hist_tmp[1, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) hist!(axy, u_hist_tmp[2, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) hist!(axz, u_hist_tmp[3, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) @@ -336,10 +345,10 @@ function main() push!(u_cdf, u_cdf_tmp) end - f4 = Figure(size = (900, Int(floor(900 / 1.618))), fontsize = 1.5*fontsize) - axx = Axis(f4[1, 1], xlabel = "x", ylabel = "cdf", xticks=wideticks) - axy = Axis(f4[1, 2], xlabel = "y", xticks=wideticks,yticklabelsvisible = false) - axz = Axis(f4[1, 3], xlabel = "z", xticks= [10,30,50],yticklabelsvisible = false) + f4 = Figure(size = (900, Int(floor(900 / 1.618))), fontsize = 1.5 * fontsize) + axx = Axis(f4[1, 1], xlabel = "x", ylabel = "cdf", xticks = wideticks) + axy = Axis(f4[1, 2], xlabel = "y", xticks = wideticks, yticklabelsvisible = false) + axz = Axis(f4[1, 3], xlabel = "z", xticks = [10, 30, 50], yticklabelsvisible = false) unif_samples = (1:size(sol_cdf, 2)) / size(sol_cdf, 2) diff --git a/examples/Emulator/L63/emulate_diff-rank-test.jl b/examples/Emulator/L63/emulate_diff-rank-test.jl index 2d8f8b3a..222147bf 100644 --- a/examples/Emulator/L63/emulate_diff-rank-test.jl +++ b/examples/Emulator/L63/emulate_diff-rank-test.jl @@ -26,14 +26,14 @@ function main() # rng rng = MersenneTwister(1232435) - n_repeats = 5 # repeat exp with same data. + n_repeats = 1# 5 # repeat exp with same data. 
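Later hunks in this file unpack the single sweep index rank_val in 1:9 into an (input-rank, output-rank) pair for the separable kernel. The arithmetic is easy to misread, so here is a tiny standalone table of that encoding (rank_out steps through 1 1 1 2 2 2 3 3 3 while rank_in cycles 1 2 3):

# Tabulate the rank_val -> (rank_in, rank_out) encoding used by the
# "RF-svd-sep" / "RF-nosvd-sep" branches (rank_test = 1:9).
for rank_val in 1:9
    rank_out = Int(ceil(rank_val / 3))        # 1 1 1 2 2 2 3 3 3
    rank_in = rank_val - 3 * (rank_out - 1)   # 1 2 3 1 2 3 1 2 3
    println("rank_val = $rank_val  =>  (rank_in, rank_out) = ($rank_in, $rank_out)")
end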
println("run experiment $n_repeats times") rank_test = 1:9 # must be 1:k for now n_iteration = 20 #for later plots fontsize = 20 - wideticks= WilkinsonTicks(3, k_min=3, k_max=4) # prefer few ticks + wideticks = WilkinsonTicks(3, k_min = 3, k_max = 4) # prefer few ticks # Run L63 from 0 -> tmax @@ -98,18 +98,27 @@ function main() # Emulate - cases = ["GP", "RF-prior", "RF-scalar", "RF-scalar-diagin", "RF-svd-nonsep", "RF-nosvd-nonsep", "RF-nosvd-sep", "RF-svd-sep"] - - case = cases[8] #5 + cases = [ + "GP", + "RF-prior", + "RF-scalar", + "RF-scalar-diagin", + "RF-svd-nonsep", + "RF-nosvd-nonsep", + "RF-nosvd-sep", + "RF-svd-sep", + ] + + case = cases[1] #5 nugget = Float64(1e-8) u_test = [] u_hist = [] - opt_diagnostics = zeros(length(rank_test),n_repeats,n_iteration) - train_err = zeros(length(rank_test),n_repeats) - test_err = zeros(length(rank_test),n_repeats) + opt_diagnostics = zeros(length(rank_test), n_repeats, n_iteration) + train_err = zeros(length(rank_test), n_repeats) + test_err = zeros(length(rank_test), n_repeats) - ttt = zeros(length(rank_test),n_repeats) + ttt = zeros(length(rank_test), n_repeats) for (rank_id, rank_val) in enumerate(rank_test) #test over different ranks for svd-nonsep @info "Test rank: $(rank_val)" for rep_idx in 1:n_repeats @@ -135,7 +144,7 @@ function main() elseif case ∈ ["RF-svd-nonsep"] kernel_structure = NonseparableKernel(LowRankFactor(rank_val, nugget)) n_features = 500 - + mlt = VectorRandomFeatureInterface( n_features, 3, @@ -143,10 +152,10 @@ function main() rng = rng, kernel_structure = kernel_structure, optimizer_options = rf_optimizer_overrides, - ) + ) elseif case ∈ ["RF-svd-sep", "RF-nosvd-sep"] - rank_out = Int(ceil(rank_val/3)) # 1 1 1 2 2 2 - rank_in = rank_val - 3*(rank_out-1) # 1 2 3 1 2 3 + rank_out = Int(ceil(rank_val / 3)) # 1 1 1 2 2 2 + rank_in = rank_val - 3 * (rank_out - 1) # 1 2 3 1 2 3 @info "Test rank in: $(rank_in) out: $(rank_out)" kernel_structure = SeparableKernel(LowRankFactor(rank_in, nugget), LowRankFactor(rank_out, nugget)) n_features = 500 @@ -160,7 +169,7 @@ function main() ) end - + #save config for RF if !(case == "GP") && (rep_idx == 1) JLD2.save( @@ -184,12 +193,12 @@ function main() emulator = Emulator(mlt, iopairs; obs_noise_cov = Γy, decorrelate = decorrelate) optimize_hyperparameters!(emulator) end - + # diagnostics if case != "GP" - opt_diagnostics[rank_id,rep_idx,:] = get_optimizer(mlt)[1] #length-1 vec of vec -> vec + opt_diagnostics[rank_id, rep_idx, :] = get_optimizer(mlt)[1] #length-1 vec of vec -> vec end - + # Predict with emulator u_test_tmp = zeros(3, length(xspan_test)) u_test_tmp[:, 1] = sol_test.u[1] @@ -206,38 +215,43 @@ function main() train_mean, _ = predict(emulator, input[:, i:i], transform_to_real = true) # 3x1 train_err_tmp[1] += norm(train_mean - output[:, i]) end - train_err[rank_id,rep_idx] = 1 / size(input, 2) * train_err_tmp[1] + train_err[rank_id, rep_idx] = 1 / size(input, 2) * train_err_tmp[1] # test error i -> o test_err_tmp = [0.0] for i in 1:(length(xspan_test) - 1) - test_mean, _ = predict(emulator, reshape(sol_test.u[i],:,1), transform_to_real = true) # 3x1 matrix - test_err_tmp[1] += norm(test_mean[:] - sol_test.u[i+1]) + test_mean, _ = predict(emulator, reshape(sol_test.u[i], :, 1), transform_to_real = true) # 3x1 matrix + test_err_tmp[1] += norm(test_mean[:] - sol_test.u[i + 1]) end - test_err[rank_id,rep_idx] = 1/(length(xspan_test) - 1) * test_err_tmp[1] + test_err[rank_id, rep_idx] = 1 / (length(xspan_test) - 1) * test_err_tmp[1] println("normalized L^2 error on training 
data:", 1 / size(input, 2) * train_err_tmp[1]) - + println("normalized L^2 error on test data:", 1 / (length(xspan_test) - 1) * test_err_tmp[1]) + u_hist_tmp = zeros(3, length(xspan_hist)) u_hist_tmp[:, 1] = sol_hist.u[1] # start at end of previous sim - + for i in 1:(length(xspan_hist) - 1) rf_mean, _ = predict(emulator, u_hist_tmp[:, i:i], transform_to_real = true) # 3x1 matrix u_hist_tmp[:, i + 1] = rf_mean end - + push!(u_test, u_test_tmp) push!(u_hist, u_hist_tmp) - + JLD2.save( - joinpath(output_directory, case * "_l63_rank_test_results.jld2"), - "rank_test", collect(rank_test), - "timings", ttt, - "train_err", train_err, - "test_err", test_err, - ) + joinpath(output_directory, case * "_l63_rank_test_results.jld2"), + "rank_test", + collect(rank_test), + "timings", + ttt, + "train_err", + train_err, + "test_err", + test_err, + ) end - + end # save data @@ -247,18 +261,18 @@ function main() # plot eki convergence plot if length(opt_diagnostics) > 0 - - err_cols = mean(opt_diagnostics, dims=2)[:,1,:]' # average error over repeats for each rank as columns + + err_cols = mean(opt_diagnostics, dims = 2)[:, 1, :]' # average error over repeats for each rank as columns err_normalized = (err_cols' ./ err_cols[1, :])' # divide each series by the max, so all errors start at 1 - plot_x = collect(1:size(err_cols,1)) + plot_x = collect(1:size(err_cols, 1)) #save error_filepath = joinpath(output_directory, "eki_diff-rank-test_conv_error.jld2") save(error_filepath, "error", err_cols, "trajectory_error", train_err) f5 = Figure(resolution = (1.618 * 300, 300), markersize = 4) ax_conv = Axis(f5[1, 1], xlabel = "Iteration", ylabel = "max-normalized error", yscale = log10) - for (rank_id,rank_val) in enumerate(rank_test) - lines!(ax_conv, plot_x, err_normalized[:,rank_id], label = "rank $(rank_val)") + for (rank_id, rank_val) in enumerate(rank_test) + lines!(ax_conv, plot_x, err_normalized[:, rank_id], label = "rank $(rank_val)") end axislegend(ax_conv) save(joinpath(output_directory, "l63_diff-rank-test_eki-conv_$(case).png"), f5, px_per_unit = 3) @@ -266,9 +280,9 @@ function main() end - - - + + + end main() diff --git a/examples/Emulator/L63/plot_results-diff-rank-test.jl b/examples/Emulator/L63/plot_results-diff-rank-test.jl index 23027940..dfe7d0f6 100644 --- a/examples/Emulator/L63/plot_results-diff-rank-test.jl +++ b/examples/Emulator/L63/plot_results-diff-rank-test.jl @@ -6,75 +6,100 @@ using JLD2 function main() # filepaths output_directory = joinpath(@__DIR__, "output") -# case = "RF-svd-nonsep" + # case = "RF-svd-nonsep" case = "RF-svd-sep" rank_test = 1:9 # must be 1:k for now @info "plotting case $case for rank test $(rank_test)" #for later plots fontsize = 18 - wideticks= WilkinsonTicks(3, k_min=3, k_max=4) # prefer few ticks + wideticks = WilkinsonTicks(3, k_min = 3, k_max = 4) # prefer few ticks error_filepath = joinpath(output_directory, "eki_diff-rank-test_conv_error.jld2") error_data = JLD2.load(error_filepath) - err_cols = error_data["error"] # exps are columns - - rank_test_filepath = joinpath(output_directory, case*"_l63_rank_test_results.jld2") + err_cols = error_data["error"] # exps are columns + + rank_test_filepath = joinpath(output_directory, case * "_l63_rank_test_results.jld2") rank_test_data = JLD2.load(rank_test_filepath) train_err = rank_test_data["train_err"] test_err = rank_test_data["test_err"] # Plot convergences err_normalized = (err_cols' ./ err_cols[1, :])' # divide each series by the max, so all errors start at 1 - n_iterations = size(err_cols,1) + n_iterations = 
size(err_cols, 1) plot_x = collect(1:n_iterations) - - f5 = Figure(resolution = (1.618 * 300, 300), markersize = 4, fontsize=fontsize) - ax_conv = Axis(f5[1, 1], xlabel = "Iteration", ylabel = "max-normalized error", yscale = log10, xticks = 2:2:n_iterations, yticks= ([10^-4,10^-2,10^0, 10^2], [L"10^{-4}",L"10^{-2}", L"10^0",L"10^2"])) + + f5 = Figure(resolution = (1.618 * 300, 300), markersize = 4, fontsize = fontsize) + ax_conv = Axis( + f5[1, 1], + xlabel = "Iteration", + ylabel = "max-normalized error", + yscale = log10, + xticks = 2:2:n_iterations, + yticks = ([10^-4, 10^-2, 10^0, 10^2], [L"10^{-4}", L"10^{-2}", L"10^0", L"10^2"]), + ) plot_to = 10 - for (rank_id,rank_val) in enumerate(rank_test) - lines!(ax_conv, plot_x[1:plot_to], err_normalized[1:plot_to,rank_id], label = "rank $(rank_val)", color=(:blue, 0.5*rank_val/10)) + for (rank_id, rank_val) in enumerate(rank_test) + lines!( + ax_conv, + plot_x[1:plot_to], + err_normalized[1:plot_to, rank_id], + label = "rank $(rank_val)", + color = (:blue, 0.5 * rank_val / 10), + ) end - + #axislegend(ax_conv) save(joinpath(output_directory, "l63_diff-rank-test_eki-conv_$(case).png"), f5, px_per_unit = 3) - save(joinpath(output_directory, "l63_diff-rank-test_eki-conv_$(case).pdf"), f5, px_per_unit = 3) + save(joinpath(output_directory, "l63_diff-rank-test_eki-conv_$(case).pdf"), f5, px_per_unit = 3) # if including GP - either load or just put in average here: - gp_train_test_err=[0.0154,0.00292] - - f6 = Figure(resolution = (1.618 * 300, 300), markersize = 4, fontsize=fontsize) + gp_train_test_err = [0.0154, 0.00292] + + f6 = Figure(resolution = (1.618 * 300, 300), markersize = 4, fontsize = fontsize) if case == "RF-svd-nonsep" - ax_error = Axis(f6[1, 1], xlabel = "rank", xticks=collect(rank_test), ylabel = "L²-error in trajectory", yscale=log10 ) - mean_train_err = mean(train_err[1:length(rank_test),:],dims=2) - mean_test_err = mean(test_err[1:length(rank_test),:],dims=2) - lines!(ax_error, rank_test, mean_train_err[:], label="RF-train", color=:blue, linestyle=:dash) - lines!(ax_error, rank_test, mean_test_err[:], label="RF-test", color=:blue) - hlines!(ax_error, [gp_train_test_err[1]], label="GP-train", color = :orange, linestyle=:dash) - hlines!(ax_error, [gp_train_test_err[2]], label="GP-test", color = :orange, linestyle=:solid) - + ax_error = Axis( + f6[1, 1], + xlabel = "rank", + xticks = collect(rank_test), + ylabel = "L²-error in trajectory", + yscale = log10, + ) + mean_train_err = mean(train_err[1:length(rank_test), :], dims = 2) + mean_test_err = mean(test_err[1:length(rank_test), :], dims = 2) + lines!(ax_error, rank_test, mean_train_err[:], label = "RF-train", color = :blue, linestyle = :dash) + lines!(ax_error, rank_test, mean_test_err[:], label = "RF-test", color = :blue) + hlines!(ax_error, [gp_train_test_err[1]], label = "GP-train", color = :orange, linestyle = :dash) + hlines!(ax_error, [gp_train_test_err[2]], label = "GP-test", color = :orange, linestyle = :solid) + axislegend(ax_error) elseif case == "RF-svd-sep" rank_labels = [] for rank_val in collect(rank_test) - rank_out = Int(ceil(rank_val/3)) # 1 1 1 2 2 2 - rank_in = rank_val - 3*(rank_out-1) # 1 2 3 1 2 3 + rank_out = Int(ceil(rank_val / 3)) # 1 1 1 2 2 2 + rank_in = rank_val - 3 * (rank_out - 1) # 1 2 3 1 2 3 push!(rank_labels, (rank_in, rank_out)) end - idx = [1,4,7,2,5,8,3,6,9] - - ax_error = Axis(f6[1, 1], xlabel = "rank (in,out)", xticks=(rank_test,["$(rank_labels[i])" for i in idx]), ylabel = "L²-error in trajectory", yscale=log10 ) - mean_train_err = 
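For reference, the idx permutation above regroups the sweep (which varies rank_in fastest) so the x-axis is grouped by rank_in instead; a standalone check:

rank_labels = [(rank_val - 3 * (Int(ceil(rank_val / 3)) - 1), Int(ceil(rank_val / 3))) for rank_val in 1:9]
# [(1,1), (2,1), (3,1), (1,2), (2,2), (3,2), (1,3), (2,3), (3,3)]
idx = [1, 4, 7, 2, 5, 8, 3, 6, 9]
rank_labels[idx]
# [(1,1), (1,2), (1,3), (2,1), (2,2), (2,3), (3,1), (3,2), (3,3)]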
mean(train_err[idx,:],dims=2) - mean_test_err = mean(test_err[idx,:],dims=2) - lines!(ax_error, rank_test, mean_train_err[:], label="RF-train", color=:blue, linestyle=:dash) - lines!(ax_error, rank_test, mean_test_err[:], label="RF-test", color=:blue) - hlines!(ax_error, [gp_train_test_err[1]], label="GP-train", color = :orange, linestyle=:dash) - hlines!(ax_error, [gp_train_test_err[2]], label="GP-test", color = :orange, linestyle=:solid) + idx = [1, 4, 7, 2, 5, 8, 3, 6, 9] + + ax_error = Axis( + f6[1, 1], + xlabel = "rank (in,out)", + xticks = (rank_test, ["$(rank_labels[i])" for i in idx]), + ylabel = "L²-error in trajectory", + yscale = log10, + ) + mean_train_err = mean(train_err[idx, :], dims = 2) + mean_test_err = mean(test_err[idx, :], dims = 2) + lines!(ax_error, rank_test, mean_train_err[:], label = "RF-train", color = :blue, linestyle = :dash) + lines!(ax_error, rank_test, mean_test_err[:], label = "RF-test", color = :blue) + hlines!(ax_error, [gp_train_test_err[1]], label = "GP-train", color = :orange, linestyle = :dash) + hlines!(ax_error, [gp_train_test_err[2]], label = "GP-test", color = :orange, linestyle = :solid) axislegend(ax_error) else throw(ArgumentError("case $(case) not found, check for typos")) end - + save(joinpath(output_directory, "l63_diff-rank-test_train_err_$(case).png"), f6, px_per_unit = 3) save(joinpath(output_directory, "l63_diff-rank-test_train_err_$(case).pdf"), f6, px_per_unit = 3) end diff --git a/examples/Emulator/L63/plot_results.jl b/examples/Emulator/L63/plot_results.jl index 49253b5c..923eccf6 100644 --- a/examples/Emulator/L63/plot_results.jl +++ b/examples/Emulator/L63/plot_results.jl @@ -11,8 +11,8 @@ function main() @info "plotting case $case" #for later plots fontsize = 20 - wideticks= WilkinsonTicks(3, k_min=3, k_max=4) # prefer few ticks - + wideticks = WilkinsonTicks(3, k_min = 3, k_max = 4) # prefer few ticks + # Load from saved files #= config_file = JLD2.load(joinpath(output_directory, case * "_l63_config.jld2")) @@ -37,11 +37,11 @@ function main() tmax_test = 20 #100 xx = 0.0:dt:tmax_test - f = Figure(size = (900, 450), fontsize=fontsize) + f = Figure(size = (900, 450), fontsize = fontsize) axx = Axis(f[1, 1], ylabel = "x", yticks = wideticks) axy = Axis(f[2, 1], ylabel = "y", yticks = wideticks) - axz = Axis(f[3, 1], xlabel = "time", ylabel = "z", yticks = [10,30,50]) - + axz = Axis(f[3, 1], xlabel = "time", ylabel = "z", yticks = [10, 30, 50]) + tt = 1:length(xx) lines!(axx, xx, uplot_tmp[1, tt], color = :blue) lines!(axy, xx, uplot_tmp[2, tt], color = :blue) @@ -56,9 +56,9 @@ function main() save(joinpath(output_directory, case * "_l63_test.png"), f, px_per_unit = 3) save(joinpath(output_directory, case * "_l63_test.pdf"), f, pt_per_unit = 3) @info "plotted trajectory in \n $(case)_l63_test.png, and $(case)_l63_test.pdf" - + # plot attractor - f3 = Figure(fontsize=fontsize) + f3 = Figure(fontsize = fontsize) lines(f3[1, 1], uplot_tmp[1, :], uplot_tmp[3, :], color = :blue) lines(f3[2, 1], solplot[1, :], solplot[3, :], color = :orange) @@ -66,13 +66,13 @@ function main() save(joinpath(output_directory, case * "_l63_attr.png"), f3, px_per_unit = 3) save(joinpath(output_directory, case * "_l63_attr.pdf"), f3, pt_per_unit = 3) @info "plotted attractor in \n $(case)_l63_attr.png, and $(case)_l63_attr.pdf" - + # plotting histograms - f2 = Figure(fontsize=1.25*fontsize) - axx = Axis(f2[1, 1], xlabel = "x", ylabel = "pdf", xticks=wideticks, yticklabelsvisible = false) - axy = Axis(f2[1, 2], xlabel = "y", xticks=wideticks, yticklabelsvisible 
= false) - axz = Axis(f2[1, 3], xlabel = "z", xticks= [10,30,50], yticklabelsvisible = false) - + f2 = Figure(fontsize = 1.25 * fontsize) + axx = Axis(f2[1, 1], xlabel = "x", ylabel = "pdf", xticks = wideticks, yticklabelsvisible = false) + axy = Axis(f2[1, 2], xlabel = "y", xticks = wideticks, yticklabelsvisible = false) + axz = Axis(f2[1, 3], xlabel = "z", xticks = [10, 30, 50], yticklabelsvisible = false) + hist!(axx, uhist_tmp[1, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) hist!(axy, uhist_tmp[2, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) hist!(axz, uhist_tmp[3, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) @@ -95,15 +95,15 @@ function main() ucdf_tmp = sort(u, dims = 2) push!(ucdf, ucdf_tmp) end - - f4 = Figure(size = (900, Int(floor(900 / 1.618))), fontsize = 1.5*fontsize) - axx = Axis(f4[1, 1], xlabel = "x", ylabel = "cdf", xticks=wideticks) - axy = Axis(f4[1, 2], xlabel = "y", xticks=wideticks,yticklabelsvisible = false) - axz = Axis(f4[1, 3], xlabel = "z", xticks= [10,30,50],yticklabelsvisible = false) - + + f4 = Figure(size = (900, Int(floor(900 / 1.618))), fontsize = 1.5 * fontsize) + axx = Axis(f4[1, 1], xlabel = "x", ylabel = "cdf", xticks = wideticks) + axy = Axis(f4[1, 2], xlabel = "y", xticks = wideticks, yticklabelsvisible = false) + axz = Axis(f4[1, 3], xlabel = "z", xticks = [10, 30, 50], yticklabelsvisible = false) + unif_samples = (1:size(solcdf, 2)) / size(solcdf, 2) - + for u in ucdf lines!(axx, u[1, :], unif_samples, color = (:blue, 0.2), linewidth = 4) lines!(axy, u[2, :], unif_samples, color = (:blue, 0.2), linewidth = 4) @@ -113,7 +113,7 @@ function main() lines!(axx, solcdf[1, :], unif_samples, color = (:orange, 1.0), linewidth = 4) lines!(axy, solcdf[2, :], unif_samples, color = (:orange, 1.0), linewidth = 4) lines!(axz, solcdf[3, :], unif_samples, color = (:orange, 1.0), linewidth = 4) - + # save save(joinpath(output_directory, case * "_l63_cdfs.png"), f4, px_per_unit = 3) save(joinpath(output_directory, case * "_l63_cdfs.pdf"), f4, pt_per_unit = 3) diff --git a/examples/Emulator/Regression_2d_2d/compare_regression.jl b/examples/Emulator/Regression_2d_2d/compare_regression.jl index c02b0f17..df9282d3 100644 --- a/examples/Emulator/Regression_2d_2d/compare_regression.jl +++ b/examples/Emulator/Regression_2d_2d/compare_regression.jl @@ -57,8 +57,8 @@ function main() d = 2 # output dim X = 2.0 * π * rand(p, n) # G(x1, x2) - g1(x) = sin.(x[1, :]) .+ 2*cos.(2*x[2, :]) - g2(x) = 3*sin.(3*x[1, :]) .- 4*cos.(4*x[2, :]) + g1(x) = sin.(x[1, :]) .+ 2 * cos.(2 * x[2, :]) + g2(x) = 3 * sin.(3 * x[1, :]) .- 4 * cos.(4 * x[2, :]) g1x = g1(X) g2x = g2(X) gx = zeros(2, n) @@ -147,7 +147,8 @@ function main() # common random feature setup n_features = 300 - optimizer_options = Dict("n_iteration" => 20, "n_features_opt" => 100, "n_ensemble" => 80, "cov_sample_multiplier"=>1.0) + optimizer_options = + Dict("n_iteration" => 20, "n_features_opt" => 100, "n_ensemble" => 80, "cov_sample_multiplier" => 1.0) nugget = 1e-12 diff --git a/src/RandomFeature.jl b/src/RandomFeature.jl index fc65746e..a9e22970 100644 --- a/src/RandomFeature.jl +++ b/src/RandomFeature.jl @@ -461,7 +461,7 @@ function calculate_mean_cov_and_coeffs( input_dim = size(itrain, 1) output_dim = size(otrain, 1) n_test = size(itest, 2) - + # build and fit the RF rfm = RFM_from_hyperparameters( rfi, @@ -625,7 +625,7 @@ function estimate_mean_and_coeffnorm_covariance( output_dim = size(get_outputs(io_pairs), 1) n_test = length(test_idx) - + means = zeros(output_dim, n_samples, 
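The CDF panels above use the standard sorted-sample construction: sort each coordinate's history and pair it with evenly spaced levels. In miniature, with hypothetical data:

u = randn(3, 1000)                                # hypothetical 3 x n_samples history
ucdf = sort(u, dims = 2)                          # sort each coordinate's samples
unif_samples = (1:size(ucdf, 2)) / size(ucdf, 2)  # levels 1/n, 2/n, ..., 1
# lines!(axx, ucdf[1, :], unif_samples) then traces the empirical CDF of x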
n_test) mean_of_covs = zeros(output_dim, output_dim, n_test) moc_tmp = similar(mean_of_covs) @@ -733,7 +733,7 @@ function calculate_ensemble_mean_and_coeffnorm( N_ens = size(lmat, 2) output_dim = size(get_outputs(io_pairs), 1) n_test = length(test_idx) - + means = zeros(output_dim, N_ens, n_test) mean_of_covs = zeros(output_dim, output_dim, n_test) buffer = zeros(n_test, output_dim, n_features) diff --git a/src/RandomFeature_crossval.jl b/src/RandomFeature_crossval.jl deleted file mode 100644 index fc65746e..00000000 --- a/src/RandomFeature_crossval.jl +++ /dev/null @@ -1,1024 +0,0 @@ - -using RandomFeatures -const RF = RandomFeatures -using EnsembleKalmanProcesses -const EKP = EnsembleKalmanProcesses -using EnsembleKalmanProcesses.Localizers -using ..ParameterDistributions -using ..Utilities -using StableRNGs -using Random - -export calculate_n_hyperparameters, hyperparameters_from_flat, build_default_prior, cov_structure_from_string -export CovarianceStructureType, OneDimFactor, DiagonalFactor, CholeskyFactor, LowRankFactor, HierarchicalLowRankFactor -export SeparableKernel, NonseparableKernel -export get_input_cov_structure, get_output_cov_structure, get_cov_structure -export get_eps -export rank -export shrinkage_cov, nice_cov - - -abstract type RandomFeatureInterface <: MachineLearningTool end - -# Different types of threading -abstract type MultithreadType end -struct TullioThreading <: MultithreadType end -struct EnsembleThreading <: MultithreadType end - - -# Different types of covariance representation in the prior - -abstract type CovarianceStructureType end - -""" -$(DocStringExtensions.TYPEDEF) - -covariance structure for a one-dimensional space -""" -struct OneDimFactor <: CovarianceStructureType end - -""" -$(DocStringExtensions.TYPEDEF) - -builds a diagonal covariance structure -""" -struct DiagonalFactor{FT <: AbstractFloat} <: CovarianceStructureType - "nugget term" - eps::FT -end -DiagonalFactor() = DiagonalFactor(Float64(1.0)) -get_eps(df::DiagonalFactor) = df.eps - -""" -$(DocStringExtensions.TYPEDEF) - -builds a general positive-definite covariance structure -""" -struct CholeskyFactor{FT <: AbstractFloat} <: CovarianceStructureType - "nugget term" - eps::FT -end -CholeskyFactor() = CholeskyFactor(Float64(1.0)) -get_eps(cf::CholeskyFactor) = cf.eps - -""" -$(DocStringExtensions.TYPEDEF) - -builds a covariance structure that deviates from the identity with a low-rank perturbation. 
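For orientation while reading these covariance-structure types: every factor carries a nugget eps, the one-argument constructors default it to 1.0, and get_eps / rank are the accessors. Usage as implied by the definitions:

df = DiagonalFactor(1e-4)     # diagonal structure with nugget 1e-4
lrf = LowRankFactor(3, 1e-8)  # rank-3 perturbation of the identity
rank(lrf)                     # 3
get_eps(lrf)                  # 1.0e-8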
This perturbation is diagonalized in the low-rank space -""" -struct LowRankFactor{FT <: AbstractFloat} <: CovarianceStructureType - "rank of perturbation" - rank::Int - "nugget term" - eps::FT -end -LowRankFactor(r::Int) = LowRankFactor(r, Float64(1.0)) -get_eps(lrf::LowRankFactor) = lrf.eps - - -""" -$(DocStringExtensions.TYPEDEF) - -builds a covariance structure that deviates from the identity with a more general low-rank perturbation -""" -struct HierarchicalLowRankFactor{FT <: AbstractFloat} <: CovarianceStructureType - "rank of perturbation" - rank::Int - "nugget term" - eps::FT -end - -HierarchicalLowRankFactor(r::Int) = HierarchicalLowRankFactor(r, Float64(1.0)) -get_eps(hlrf::HierarchicalLowRankFactor) = hlrf.eps - -import LinearAlgebra: rank -rank(lrf::LowRankFactor) = lrf.rank -rank(hlrf::HierarchicalLowRankFactor) = hlrf.rank - -# To add a new cov type `T` one must define these 3 methods: -# hyperparameters_from_flat(x::V, d::Int, t::T) where {V <: AbstractVector} -# calculate_n_hyperparameters(d::Int, t::T) -# build_default_prior(name::SS, d::Int, t::T) where {SS <: AbstractString} -# and add a string id to cov_structure_from_string - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -creates some simple covariance structures from strings: ["onedim", "diagonal", "cholesky", "lowrank", hierlowrank"]. See the covariance Structures for more details. -""" -function cov_structure_from_string(s::S, d::Int) where {S <: AbstractString} - if s == "onedim" - return OneDimFactor() - elseif s == "diagonal" - return DiagonalFactor() - elseif s == "cholesky" - return CholeskyFactor() - elseif s == "lowrank" - return LowRankFactor(Int(ceil(sqrt(d)))) - elseif s == "hierlowrank" - return HierarchicalLowRankFactor(Int(ceil(sqrt(d)))) - else - throw( - ArgumentError( - "Recieved unknown `input_cov_structure` keyword $(s), \n please choose from [\"onedim\",\"diagonal\",\"cholesky\", \"lowrank\"] or provide a CovarianceStructureType", - ), - ) - end -end -cov_structure_from_string(cst::CST, args...; kwargs...) where {CST <: CovarianceStructureType} = cst - - -# Different types of kernel -abstract type KernelStructureType end - -""" -$(DocStringExtensions.TYPEDEF) - -Builds a separable kernel, i.e. one that accounts for input and output covariance structure separately -""" -struct SeparableKernel{CST1 <: CovarianceStructureType, CST2 <: CovarianceStructureType} <: KernelStructureType - input_cov_structure::CST1 - output_cov_structure::CST2 -end -get_input_cov_structure(kernel_structure::SeparableKernel) = kernel_structure.input_cov_structure -get_output_cov_structure(kernel_structure::SeparableKernel) = kernel_structure.output_cov_structure - - -""" -$(DocStringExtensions.TYPEDEF) - -Builds a nonseparable kernel, i.e. 
one that accounts for a joint input and output covariance structure -""" -struct NonseparableKernel{CST <: CovarianceStructureType} <: KernelStructureType - cov_structure::CST -end -get_cov_structure(kernel_structure::NonseparableKernel) = kernel_structure.cov_structure - - -# calculate_n_hyperparameters - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -calculates the number of hyperparameters generated by the choice of covariance structure -""" -calculate_n_hyperparameters(d::Int, odf::OneDimFactor) = 0 -calculate_n_hyperparameters(d::Int, df::DiagonalFactor) = d -calculate_n_hyperparameters(d::Int, cf::CholeskyFactor) = Int(d * (d + 1) / 2) -calculate_n_hyperparameters(d::Int, lrf::LowRankFactor) = Int(rank(lrf) + d * rank(lrf)) -calculate_n_hyperparameters(d::Int, hlrf::HierarchicalLowRankFactor) = - Int(rank(hlrf) * (rank(hlrf) + 1) / 2 + d * rank(hlrf)) - -# build from flat - - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -reshapes a list of hyperparameters into a covariance matrix based on the selected structure -""" -hyperparameters_from_flat(x::V, odf::OneDimFactor) where {V <: AbstractVector} = nothing - -function hyperparameters_from_flat(x::V, df::DiagonalFactor) where {V <: AbstractVector} - return convert(Matrix, Diagonal(x) + get_eps(df) * I) -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Converts a flat array into a cholesky factor. -""" -function flat_to_chol(x::V) where {V <: AbstractVector} - choldim = Int(floor(sqrt(2 * length(x)))) - cholmat = zeros(choldim, choldim) - for i in 1:choldim - for j in 1:i - cholmat[i, j] = x[sum(0:(i - 1)) + j] - end - end - return cholmat -end - -function hyperparameters_from_flat(x::V, cf::CholeskyFactor) where {V <: AbstractVector} - chol = flat_to_chol(x) - return chol * permutedims(chol, (2, 1)) + get_eps(cf) * I - -end - -function hyperparameters_from_flat(x::V, lrf::LowRankFactor) where {V <: AbstractVector} - - r = rank(lrf) - d = Int(length(x) / r - 1) # l = r + d*r - - D = Diagonal(x[1:r]) # D acts like sing.vals.^2 - U = reshape(x[(r + 1):end], d, r) - qrU = qr(U) # qrU.Q is semi-orthogonal in first r columns, i.e. Q^T.Q = I - return qrU.Q[:, 1:r] * D * qrU.Q[:, 1:r]' + get_eps(lrf) * I - -end - -function hyperparameters_from_flat(x::V, hlrf::HierarchicalLowRankFactor) where {V <: AbstractVector} - r = rank(hlrf) - d = Int(length(x) / r - (r + 1) / 2) # l = r^2 + d*r - - # Ambikasaran, O'Neil, Singh 2016 - L = flat_to_chol(x[1:Int(r * (r + 1) / 2)]) - U = reshape(x[Int(r * (r + 1) / 2 + 1):end], d, r) - K = L * permutedims(L, (2, 1)) + get_eps(hlrf) * I - - qrU = qr(U) - svdT = svd(I + qrU.R * K * qrU.R') # square so T = Q S Q^t - M = svdT.V * Diagonal(sqrt.(svdT.S)) # M M^t = Q sqrt(S) (Q sqrt(S))^t = Q sqrt(S) sqrt(S) Q^t = T - X = M - I - update_factor = I + qrU.Q * X * qrU.Q' - - return update_factor * update_factor' -end - - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -builds a prior distribution for the kernel hyperparameters to initialize optimization. 
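The hyperparameter counting rules (per-factor above, kernel combinations just below) are easy to sanity-check by hand: a dimension-d block gives d for diagonal, d(d+1)/2 for Cholesky, and r + d*r for rank-r low-rank; a separable kernel adds its input and output counts plus one scaling, while a nonseparable kernel counts on the joint input_dim * output_dim space. A sketch, sized like the L63 problem (3 inputs, 3 outputs):

d = 6
calculate_n_hyperparameters(d, DiagonalFactor())   # 6
calculate_n_hyperparameters(d, CholeskyFactor())   # 6 * 7 / 2 = 21
calculate_n_hyperparameters(d, LowRankFactor(2))   # 2 + 6 * 2 = 14

nugget = 1e-8
sep = SeparableKernel(LowRankFactor(1, nugget), LowRankFactor(1, nugget))
calculate_n_hyperparameters(3, 3, sep)             # (1 + 3) + (1 + 3) + 1 = 9
nonsep = NonseparableKernel(LowRankFactor(2, nugget))
calculate_n_hyperparameters(3, 3, nonsep)          # (2 + 9 * 2) + 1 = 21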
-""" -build_default_prior(name::SS, n_hp::Int, odf::OneDimFactor) where {SS <: AbstractString} = nothing - -function build_default_prior(name::SS, n_hp::Int, df::DiagonalFactor) where {SS <: AbstractString} - return ParameterDistribution( - Dict( - "name" => "$(name)_diagonal", - "distribution" => VectorOfParameterized(repeat([Normal(0, 3)], n_hp)), - "constraint" => repeat([bounded_below(0.0)], n_hp), - ), - ) -end - -function build_default_prior(name::SS, n_hp::Int, df::CholeskyFactor) where {SS <: AbstractString} - return constrained_gaussian("$(name)_cholesky", 0.0, 5.0, -Inf, Inf, repeats = n_hp) -end - -function build_default_prior(name::SS, n_hp::Int, lrf::LowRankFactor) where {SS <: AbstractString} - r = rank(lrf) - d = Int(n_hp / r - 1) - D = ParameterDistribution( - Dict( - "name" => "$(name)_lowrank_diagonal", - "distribution" => VectorOfParameterized(repeat([Normal(0, 3)], r)), - "constraint" => repeat([bounded_below(0.0)], r), - ), - ) - U = constrained_gaussian("$(name)_lowrank_U", 0.0, 100.0, -Inf, Inf, repeats = Int(d * r)) - return combine_distributions([D, U]) -end - -function build_default_prior(name::SS, n_hp::Int, hlrf::HierarchicalLowRankFactor) where {SS <: AbstractString} - r = rank(hlrf) - d = Int(n_hp / r - (r + 1) / 2) - L = constrained_gaussian("$(name)_lowrank_Kchol", 0.0, 5.0, -Inf, Inf, repeats = Int(r * (r + 1) / 2)) - U = constrained_gaussian("$(name)_lowrank_U", 0.0, 100.0, -Inf, Inf, repeats = Int(d * r)) - return combine_distributions([L, U]) -end - -# combining input and output spaces: -function calculate_n_hyperparameters( - input_dim::Int, - output_dim::Int, - kernel_structure::SK, -) where {SK <: SeparableKernel} - - input_cov_structure = get_input_cov_structure(kernel_structure) - n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) - output_cov_structure = get_output_cov_structure(kernel_structure) - n_hp_out = calculate_n_hyperparameters(output_dim, output_cov_structure) - n_scalings = 1 - - # if both in and output dim are "OneDimFactors" we still learn 1 hp - if n_hp_in + n_hp_out == 0 - return 1 + n_scalings - end - - return n_hp_in + n_hp_out + n_scalings -end - -function calculate_n_hyperparameters(input_dim::Int, kernel_structure::KST) where {KST <: KernelStructureType} - output_dim = 1 - return calculate_n_hyperparameters(input_dim, output_dim, kernel_structure) -end - -function calculate_n_hyperparameters( - input_dim::Int, - output_dim::Int, - kernel_structure::NK, -) where {NK <: NonseparableKernel} - cov_structure = get_cov_structure(kernel_structure) - n_hp = calculate_n_hyperparameters(input_dim * output_dim, cov_structure) - n_scalings = 1 - # if both in and output dim are "OneDimFactors" we still learn 1 hp - if n_hp == 0 - return 1 + n_scalings - end - - return n_hp + n_scalings -end - - - - -function build_default_prior(input_dim::Int, output_dim::Int, kernel_structure::SK) where {SK <: SeparableKernel} - input_cov_structure = get_input_cov_structure(kernel_structure) - output_cov_structure = get_output_cov_structure(kernel_structure) - - n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) - input_prior = build_default_prior("input", n_hp_in, input_cov_structure) - n_hp_out = calculate_n_hyperparameters(output_dim, output_cov_structure) - output_prior = build_default_prior("output", n_hp_out, output_cov_structure) - - scaling_kern = constrained_gaussian("sigma", 1, 5, 0, Inf) - - # We only use OneDimFactor values, default to input if both in and output dimension are one dimensional.Otherwise they 
are ignored - if isnothing(input_prior) && isnothing(output_prior) - onedim_prior = constrained_gaussian("onedim", 1.0, 1.0, 0.0, Inf) - return combine_distributions([onedim_prior, scaling_kern]) - elseif isnothing(input_prior) - return combine_distributions([output_prior, scaling_kern]) - elseif isnothing(output_prior) - return combine_distributions([input_prior, scaling_kern]) - else - return combine_distributions([input_prior, output_prior, scaling_kern]) - end -end - -function build_default_prior(input_dim::Int, kernel_structure::KST) where {KST <: KernelStructureType} - output_dim = 1 - return build_default_prior(input_dim, output_dim, kernel_structure) -end - -function build_default_prior(input_dim::Int, output_dim::Int, kernel_structure::NK) where {NK <: NonseparableKernel} - cov_structure = get_cov_structure(kernel_structure) - n_hp = calculate_n_hyperparameters(input_dim * output_dim, cov_structure) - if n_hp == 0 - pd_kern = constrained_gaussian("onedim", 1.0, 1.0, 0.0, Inf) - else - pd_kern = build_default_prior("full", n_hp, cov_structure) - end - - scaling_kern = constrained_gaussian("sigma", 1, 5, 0, Inf) - return combine_distributions([pd_kern, scaling_kern]) -end - -function hyperparameters_from_flat( - x::VV, - input_dim::Int, - output_dim::Int, - kernel_structure::SK, -) where {VV <: AbstractVector, SK <: SeparableKernel} - input_cov_structure = get_input_cov_structure(kernel_structure) - output_cov_structure = get_output_cov_structure(kernel_structure) - - n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) - U = hyperparameters_from_flat(x[1:n_hp_in], input_cov_structure) - V = hyperparameters_from_flat(x[(n_hp_in + 1):end], output_cov_structure) - - if isnothing(U) && isnothing(V) #i.e. both are from OneDimFactor cov structures - return x[1] * ones(1, 1), V - elseif isnothing(U) - return U, 0.5 * (V + permutedims(V, (2, 1))) - elseif isnothing(V) - return 0.5 * (U + permutedims(U, (2, 1))), V - else - # symmetrize (sometimes there are numerical artifacts) - U = 0.5 * (U + permutedims(U, (2, 1))) - V = 0.5 * (V + permutedims(V, (2, 1))) - return U, V - end -end - -function hyperparameters_from_flat( - x::VV, - input_dim::Int, - kernel_structure::SK, -) where {VV <: AbstractVector, SK <: SeparableKernel} - output_dim = 1 - U, V = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) - - return U -end - -function hyperparameters_from_flat( - x::VV, - input_dim::Int, - output_dim::Int, - kernel_structure::NK, -) where {VV <: AbstractVector, NK <: NonseparableKernel} - cov_structure = get_cov_structure(kernel_structure) - C = hyperparameters_from_flat(x, cov_structure) - if isnothing(C) - return x[1] * ones(1, 1) - else - return 0.5 * (C + permutedims(C, (2, 1))) - end - -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Calculates key quantities (mean, covariance, and coefficients) of the objective function to be optimized for hyperparameter training. Buffers: mean_store, cov_store, buffer are provided to aid memory management. 
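For reference: the "complexity" returned by calculate_mean_cov_and_coeffs below is a log-determinant of the fitted feature factorization. For a Cholesky factorization A = L L', log det A = 2 * sum(log.(diag(L))); the SVD branch sums log singular values instead. A standalone check:

using LinearAlgebra
A = [4.0 1.0; 1.0 3.0]
L = cholesky(A).L
2 * sum(log(L[i, i]) for i in 1:size(L, 1)) ≈ log(det(A))   # true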
-""" -function calculate_mean_cov_and_coeffs( - rfi::RFI, - rng::RNG, - l::ForVM, - regularization::MorUSorD, - n_features::Int, - train_idx::VV, - test_idx::VV, - batch_sizes::Union{Dict{S, Int}, Nothing}, - io_pairs::PairedDataContainer, - decomp_type::S, - mean_store::M, - cov_store::A, - buffer::A, - multithread_type::MT, -) where { - RFI <: RandomFeatureInterface, - RNG <: AbstractRNG, - ForVM <: Union{AbstractFloat, AbstractVecOrMat}, - VV <: AbstractVector, - S <: AbstractString, - MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, - M <: AbstractMatrix{<:AbstractFloat}, - A <: AbstractArray{<:AbstractFloat, 3}, - MT <: MultithreadType, -} - - # split data into train/test - itrain = get_inputs(io_pairs)[:, train_idx] - otrain = get_outputs(io_pairs)[:, train_idx] - io_train_cost = PairedDataContainer(itrain, otrain) - itest = get_inputs(io_pairs)[:, test_idx] - otest = get_outputs(io_pairs)[:, test_idx] - input_dim = size(itrain, 1) - output_dim = size(otrain, 1) - n_test = size(itest, 2) - - # build and fit the RF - rfm = RFM_from_hyperparameters( - rfi, - rng, - l, - regularization, - n_features, - batch_sizes, - input_dim, - output_dim, - multithread_type, - ) - fitted_features = RF.Methods.fit(rfm, io_train_cost, decomposition_type = decomp_type) - - #we want to calc 1/var(y-mean)^2 + lambda/m * coeffs^2 in the end - thread_opt = isa(multithread_type, TullioThreading) - RF.Methods.predict!( - rfm, - fitted_features, - DataContainer(itest), - mean_store, - cov_store, - buffer, - tullio_threading = thread_opt, - ) - - # sizes (output_dim x n_test), (output_dim x output_dim x n_test) - - ## TODO - the theory states that the following should be set: - scaled_coeffs = sqrt(1 / (n_features)) * RF.Methods.get_coeffs(fitted_features) - - #scaled_coeffs = 1e-3 * rand(n_features)#overwrite with noise... - # However the convergence is much improved with setting this to zero: - #scaled_coeffs = 0 - if decomp_type == "cholesky" - chol_fac = RF.Methods.get_decomposition(RF.Methods.get_feature_factors(fitted_features)).L - - complexity = 2 * sum(log(chol_fac[i, i]) for i in 1:size(chol_fac, 1)) - else - svd_singval = RF.Methods.get_decomposition(RF.Methods.get_feature_factors(fitted_features)).S - complexity = sum(log, svd_singval) # note this is log(abs(det)) - end - complexity = sqrt(complexity) - return scaled_coeffs, complexity - -end - - - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Calculate the empirical covariance, additionally applying a shrinkage operator (here the Ledoit Wolf 2004 shrinkage operation). 
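The shrinkage estimator whose docstring continues below blends the sample covariance with a scaled identity, γμI + (1-γ)Γ: γ = 0 keeps the Monte-Carlo estimate, γ = 1 reverts to μI with μ the average variance. A hypothetical standalone version of the final blend (inside shrinkage_cov itself, γ is estimated from the samples):

using LinearAlgebra, Statistics
samples = randn(5, 20)                     # hypothetical: 5 outputs, 20 samples
Γ = cov(samples, dims = 2)
μ_shrink = tr(Γ) / 5                       # average variance
γ = 0.3                                    # illustrative; estimated in practice
Γ_shrunk = (1 - γ) * Γ + γ * μ_shrink * I  # blend toward the scaled identity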
Known to have better stability properties than Monte-Carlo for low sample sizes -""" -function shrinkage_cov(sample_mat::AA) where {AA <: AbstractMatrix} - n_out, n_sample = size(sample_mat) - - # de-mean (as we will use the samples directly for calculation of β) - sample_mat_zeromean = sample_mat .- mean(sample_mat, dims = 2) - # Ledoit Wolf shrinkage to I - - # get sample covariance - Γ = cov(sample_mat_zeromean, dims = 2) - # estimate opt shrinkage - μ_shrink = 1 / n_out * tr(Γ) - δ_shrink = norm(Γ - μ_shrink * I)^2 / n_out # (scaled) frob norm of Γ_m - #once de-meaning, we need to correct the sample covariance with an n_sample -> n_sample-1 - β_shrink = sum([norm(c * c' - -Γ)^2 / n_out for c in eachcol(sample_mat_zeromean)]) / (n_sample - 1)^2 - - γ_shrink = min(β_shrink / δ_shrink, 1) # clipping is typically rare - # γμI + (1-γ)Γ - Γ .*= (1 - γ_shrink) - for i in 1:n_out - Γ[i, i] += γ_shrink * μ_shrink - end - - @info "Shrinkage scale: $(γ_shrink), (0 = none, 1 = revert to scaled Identity)\n shrinkage covariance condition number: $(cond(Γ))" - return Γ -end - - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Calculate the empirical covariance, additionally applying the Noise Informed Covariance Estimator (NICE) Vishnay et al. 2024. -""" -function nice_cov(sample_mat::AA, n_samples = 400, δ::FT = 1.0) where {AA <: AbstractMatrix, FT <: Real} - - n_sample_cov = size(sample_mat, 2) - Γ = cov(sample_mat, dims = 2) - - bd_tol = 1e8 * eps() - - v = sqrt.(diag(Γ)) - V = Diagonal(v) #stds - V_inv = inv(V) - corr = clamp.(V_inv * Γ * V_inv, -1 + bd_tol, 1 - bd_tol) # full corr - - # parameter sweep over the exponents - max_exponent = 2 * 5 # must be even - interp_steps = 100 - # use find the variability in the corr coeff matrix entries - std_corrs = approximate_corr_std.(corr, n_sample_cov, n_samples) # Found in EKP.Localizers !! slowest part of code -> could speed up by precomputing an interpolation of [-1,1] - - std_tol = sqrt(sum(std_corrs .^ 2)) - α_min_exceeded = [max_exponent] - for α in 2:2:max_exponent # even exponents give a PSD - corr_psd = corr .^ (α + 1) # abs not needed as α even - # find the first exponent that exceeds the noise tolerance in norm - if norm(corr_psd - corr) > δ * std_tol - α_min_exceeded[1] = α - break - end - end - corr_psd = corr .^ α_min_exceeded[1] - corr_psd_prev = corr .^ (α_min_exceeded[1] - 2) # previous PSD correction - - for α in LinRange(1.0, 0.0, interp_steps) - corr_interp = ((1 - α) * (corr_psd_prev) + α * corr_psd) .* corr - if norm(corr_interp - corr) < δ * std_tol - corr[:, :] = corr_interp #update the correlation matrix block - break - end - end - out = posdef_correct(V * corr * V) # rebuild the cov matrix - @info "NICE-adjusted covariance condition number: $(cond(out))" - return out - -end - - - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Estimation of objective function (marginal log-likelihood) variability for optimization of hyperparameters. Calculated empirically by calling many samples of the objective at a "reasonable" value. 
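The damping mechanism inside nice_cov above, in miniature: elementwise powers of the correlation matrix keep the unit diagonal and the signs, but shrink weak (noise-dominated) off-diagonal entries toward zero; the exponent is then chosen against the estimated sampling noise in the correlation entries.

corr = [1.0 0.2; 0.2 1.0]
corr .^ 3   # [1.0 0.008; 0.008 1.0] -- diagonal intact, weak entry damped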
`multithread_type` is used as dispatch, either `ensemble` (thread over the empirical samples) or `tullio` (serial samples, and thread the linear algebra within each sample) -""" -function estimate_mean_and_coeffnorm_covariance( - rfi::RFI, - rng::RNG, - l::ForVM, - regularization::MorUSorD, - n_features::Int, - train_idx::VV, - test_idx::VV, - batch_sizes::Union{Dict{S, Int}, Nothing}, - io_pairs::PairedDataContainer, - n_samples::Int, - decomp_type::S, - multithread_type::TullioThreading; - repeats::Int = 1, - cov_correction = "shrinkage", -) where { - RFI <: RandomFeatureInterface, - RNG <: AbstractRNG, - ForVM <: Union{AbstractFloat, AbstractVecOrMat}, - VV <: AbstractVector, - S <: AbstractString, - MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, -} - - output_dim = size(get_outputs(io_pairs), 1) - n_test = length(test_idx) - - means = zeros(output_dim, n_samples, n_test) - mean_of_covs = zeros(output_dim, output_dim, n_test) - moc_tmp = similar(mean_of_covs) - mtmp = zeros(output_dim, n_test) - buffer = zeros(n_test, output_dim, n_features) - complexity = zeros(1, n_samples) - coeffl2norm = zeros(1, n_samples) - println("estimate cov with " * string(n_samples) * " iterations...") - - for i in ProgressBar(1:n_samples) - for j in 1:repeats - c, cplxty = calculate_mean_cov_and_coeffs( - rfi, - rng, - l, - regularization, - n_features, - train_idx, - test_idx, - batch_sizes, - io_pairs, - decomp_type, - mtmp, - moc_tmp, - buffer, - multithread_type, - ) - - # m output_dim x n_test - # v output_dim x output_dim x n_test - # c n_features - # cplxty 1 - - # update vbles needed for cov - means[:, i, :] .+= mtmp ./ repeats - coeffl2norm[1, i] += sqrt(sum(abs2, c)) / repeats - complexity[1, i] += cplxty / repeats - - # update vbles needed for mean - @. mean_of_covs += moc_tmp / (repeats * n_samples) - - end - end - means = permutedims(means, (3, 2, 1)) - mean_of_covs = permutedims(mean_of_covs, (3, 1, 2)) - - approx_σ2 = zeros(n_test * output_dim, n_test * output_dim) - blockmeans = zeros(n_test * output_dim, n_samples) - for i in 1:n_test - id = ((i - 1) * output_dim + 1):(i * output_dim) - approx_σ2[id, id] = mean_of_covs[i, :, :] # this ordering, so we can take a mean/cov in dims = 2. - blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) - end - - sample_mat = vcat(blockmeans, coeffl2norm, complexity) - - if cov_correction == "shrinkage" - Γ = shrinkage_cov(sample_mat) - elseif cov_correction == "nice" - Γ = nice_cov(sample_mat) - else - Γ = cov(sample_mat, dims = 2) - end - - if !isposdef(approx_σ2) - println("approx_σ2 not posdef") - approx_σ2 = posdef_correct(approx_σ2) - end - - return Γ, approx_σ2 - -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Evaluate an objective function (marginal log-likelihood) over an ensemble of hyperparameter values (used for EKI). 
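The objective vector assembled above has a fixed block layout: each test point owns an output_dim-sized slice, with its covariance block placed on the diagonal of approx_σ2. The indexing in isolation, with illustrative sizes:

output_dim, n_test = 2, 3
mean_of_covs = rand(n_test, output_dim, output_dim)   # ordering after permutedims
approx_σ2 = zeros(n_test * output_dim, n_test * output_dim)
for i in 1:n_test
    id = ((i - 1) * output_dim + 1):(i * output_dim)
    approx_σ2[id, id] = mean_of_covs[i, :, :]
end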
`multithread_type` is used as dispatch, either `ensemble` (thread over the ensemble members) or `tullio` (serial samples, and thread the linear algebra within each sample) -""" -function calculate_ensemble_mean_and_coeffnorm( - rfi::RFI, - rng::RNG, - lvecormat::VorM, - regularization::MorUSorD, - n_features::Int, - train_idx::VV, - test_idx::VV, - batch_sizes::Union{Dict{S, Int}, Nothing}, - io_pairs::PairedDataContainer, - decomp_type::S, - multithread_type::TullioThreading; - repeats::Int = 1, -) where { - RFI <: RandomFeatureInterface, - RNG <: AbstractRNG, - VorM <: AbstractVecOrMat, - VV <: AbstractVector, - S <: AbstractString, - MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, -} - if isa(lvecormat, AbstractVector) - lmat = reshape(lvecormat, 1, :) - else - lmat = lvecormat - end - N_ens = size(lmat, 2) - output_dim = size(get_outputs(io_pairs), 1) - n_test = length(test_idx) - - means = zeros(output_dim, N_ens, n_test) - mean_of_covs = zeros(output_dim, output_dim, n_test) - buffer = zeros(n_test, output_dim, n_features) - complexity = zeros(1, N_ens) - coeffl2norm = zeros(1, N_ens) - moc_tmp = similar(mean_of_covs) - mtmp = zeros(output_dim, n_test) - - println("calculating " * string(N_ens) * " ensemble members...") - - for i in ProgressBar(1:N_ens) - for j in collect(1:repeats) - l = lmat[:, i] - - c, cplxty = calculate_mean_cov_and_coeffs( - rfi, - rng, - l, - regularization, - n_features, - train_idx, - test_idx, - batch_sizes, - io_pairs, - decomp_type, - mtmp, - moc_tmp, - buffer, - multithread_type, - ) - # m output_dim x n_test - # v output_dim x output_dim x n_test - # c n_features - means[:, i, :] += mtmp ./ repeats - @. mean_of_covs += moc_tmp / (repeats * N_ens) - coeffl2norm[1, i] += sqrt(sum(c .^ 2)) / repeats - complexity[1, i] += cplxty / repeats - end - end - means = permutedims(means, (3, 2, 1)) - mean_of_covs = permutedims(mean_of_covs, (3, 1, 2)) - blockcovmat = zeros(n_test * output_dim, n_test * output_dim) - blockmeans = zeros(n_test * output_dim, N_ens) - for i in 1:n_test - id = ((i - 1) * output_dim + 1):(i * output_dim) - blockcovmat[id, id] = mean_of_covs[i, :, :] - blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) - end - - if !isposdef(blockcovmat) - println("blockcovmat not posdef") - blockcovmat = posdef_correct(blockcovmat) - end - - return vcat(blockmeans, coeffl2norm, complexity), blockcovmat - -end - - -function estimate_mean_and_coeffnorm_covariance( - rfi::RFI, - rng::RNG, - l::ForVM, - regularization::MorUSorD, - n_features::Int, - train_idx::VV, - test_idx::VV, - batch_sizes::Union{Dict{S, Int}, Nothing}, - io_pairs::PairedDataContainer, - n_samples::Int, - decomp_type::S, - multithread_type::EnsembleThreading; - repeats::Int = 1, - cov_correction = "shrinkage", -) where { - RFI <: RandomFeatureInterface, - RNG <: AbstractRNG, - ForVM <: Union{AbstractFloat, AbstractVecOrMat}, - VV <: AbstractVector, - S <: AbstractString, - MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, -} - - output_dim = size(get_outputs(io_pairs), 1) - n_test = length(test_idx) - println("estimate cov with " * string(n_samples) * " iterations...") - - nthreads = Threads.nthreads() - rng_seed = randperm(rng, 10^5)[1] # dumb way to get a random integer in 1:10^5 - rng_list = [Random.MersenneTwister(rng_seed + i) for i in 1:nthreads] - - c_list = [zeros(1, n_samples) for i in 1:nthreads] - cp_list = [zeros(1, n_samples) for i in 1:nthreads] - m_list = [zeros(output_dim, n_samples, n_test) for i in 1:nthreads] - moc_list = [zeros(output_dim, output_dim, 
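The EnsembleThreading methods here avoid write races by giving every thread its own RNG and accumulators, summed after the loop. The same pattern in a self-contained toy (it mirrors the threadid-indexed buffers in the source; the reduction itself is hypothetical):

using Random
nthreads = Threads.nthreads()
rng_list = [Random.MersenneTwister(1234 + i) for i in 1:nthreads]
acc_list = [zeros(3) for _ in 1:nthreads]
Threads.@threads for i in 1:100
    tid = Threads.threadid()
    acc_list[tid] .+= randn(rng_list[tid], 3) ./ 100
end
total = sum(acc_list)   # combine per-thread partial sums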
n_test) for i in 1:nthreads] - moc_tmp_list = [zeros(output_dim, output_dim, n_test) for i in 1:nthreads] - mtmp_list = [zeros(output_dim, n_test) for i in 1:nthreads] - buffer_list = [zeros(n_test, output_dim, n_features) for i in 1:nthreads] - - - Threads.@threads for i in ProgressBar(1:n_samples) - tid = Threads.threadid() - rngtmp = rng_list[tid] - mtmp = mtmp_list[tid] - moc_tmp = moc_tmp_list[tid] - buffer = buffer_list[tid] - for j in 1:repeats - - c, cplxty = calculate_mean_cov_and_coeffs( - rfi, - rngtmp, - l, - regularization, - n_features, - train_idx, - test_idx, - batch_sizes, - io_pairs, - decomp_type, - mtmp, - moc_tmp, - buffer, - multithread_type, - ) - - - # m output_dim x n_test - # v output_dim x output_dim x n_test - # c n_features - # cplxty 1 - - # update vbles needed for cov - # update vbles needed for cov - m_list[tid][:, i, :] += mtmp ./ repeats - c_list[tid][1, i] += sqrt(sum(abs2, c)) / repeats - cp_list[tid][1, i] += cplxty / repeats - - # update vbles needed for mean - @. moc_list[tid] += moc_tmp ./ (repeats * n_samples) - - end - end - - #put back together after threading - mean_of_covs = sum(moc_list) - means = sum(m_list) - coeffl2norm = sum(c_list) - complexity = sum(cp_list) - means = permutedims(means, (3, 2, 1)) - mean_of_covs = permutedims(mean_of_covs, (3, 1, 2)) - - approx_σ2 = zeros(n_test * output_dim, n_test * output_dim) - blockmeans = zeros(n_test * output_dim, n_samples) - for i in 1:n_test - id = ((i - 1) * output_dim + 1):(i * output_dim) - approx_σ2[id, id] = mean_of_covs[i, :, :] # this ordering, so we can take a mean/cov in dims = 2. - blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) - end - - - sample_mat = vcat(blockmeans, coeffl2norm, complexity) - - if cov_correction == "shrinkage" - Γ = shrinkage_cov(sample_mat) - elseif cov_correction == "nice" - Γ = nice_cov(sample_mat) - else - Γ = cov(sample_mat, dims = 2) - end - - if !isposdef(approx_σ2) - println("approx_σ2 not posdef") - approx_σ2 = posdef_correct(approx_σ2) - end - - return Γ, approx_σ2 - -end - - - - -function calculate_ensemble_mean_and_coeffnorm( - rfi::RFI, - rng::RNG, - lvecormat::VorM, - regularization::MorUSorD, - n_features::Int, - train_idx::VV, - test_idx::VV, - batch_sizes::Union{Dict{S, Int}, Nothing}, - io_pairs::PairedDataContainer, - decomp_type::S, - multithread_type::EnsembleThreading; - repeats::Int = 1, -) where { - RFI <: RandomFeatureInterface, - RNG <: AbstractRNG, - VorM <: AbstractVecOrMat, - VV <: AbstractVector, - S <: AbstractString, - MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, -} - if isa(lvecormat, AbstractVector) - lmat = reshape(lvecormat, 1, :) - else - lmat = lvecormat - end - N_ens = size(lmat, 2) - output_dim = size(get_outputs(io_pairs), 1) - n_test = length(test_idx) - - - println("calculating " * string(N_ens) * " ensemble members...") - - nthreads = Threads.nthreads() - c_list = [zeros(1, N_ens) for i in 1:nthreads] - cp_list = [zeros(1, N_ens) for i in 1:nthreads] - m_list = [zeros(output_dim, N_ens, n_test) for i in 1:nthreads] - moc_list = [zeros(output_dim, output_dim, n_test) for i in 1:nthreads] - buffer_list = [zeros(n_test, output_dim, n_features) for i in 1:nthreads] - moc_tmp_list = [zeros(output_dim, output_dim, n_test) for i in 1:nthreads] - mtmp_list = [zeros(output_dim, n_test) for i in 1:nthreads] - rng_seed = randperm(rng, 10^5)[1] # dumb way to get a random integer in 1:10^5 - rng_list = [Random.MersenneTwister(rng_seed + i) for i in 1:nthreads] - Threads.@threads for i in ProgressBar(1:N_ens) - tid = 
Threads.threadid() - rngtmp = rng_list[tid] - mtmp = mtmp_list[tid] - moc_tmp = moc_tmp_list[tid] - buffer = buffer_list[tid] - l = lmat[:, i] - for j in collect(1:repeats) - - c, cplxty = calculate_mean_cov_and_coeffs( - rfi, - rngtmp, - l, - regularization, - n_features, - train_idx, - test_idx, - batch_sizes, - io_pairs, - decomp_type, - mtmp, - moc_tmp, - buffer, - multithread_type, - ) - - # m output_dim x n_test - # v output_dim x output_dim x n_test - # c n_features - m_list[tid][:, i, :] += mtmp ./ repeats - @. moc_list[tid] += moc_tmp ./ (repeats * N_ens) - c_list[tid][1, i] += sqrt(sum(c .^ 2)) / repeats - cp_list[tid][1, i] += cplxty / repeats - - end - end - - # put back together after threading - mean_of_covs = sum(moc_list) - means = sum(m_list) - coeffl2norm = sum(c_list) - complexity = sum(cp_list) - means = permutedims(means, (3, 2, 1)) - mean_of_covs = permutedims(mean_of_covs, (3, 1, 2)) - blockcovmat = zeros(n_test * output_dim, n_test * output_dim) - blockmeans = zeros(n_test * output_dim, N_ens) - for i in 1:n_test - id = ((i - 1) * output_dim + 1):(i * output_dim) - blockcovmat[id, id] = mean_of_covs[i, :, :] - blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) - end - - if !isposdef(blockcovmat) - println("blockcovmat not posdef") - blockcovmat = posdef_correct(blockcovmat) - end - - return vcat(blockmeans, coeffl2norm, complexity), blockcovmat - -end - -include("ScalarRandomFeature.jl") -include("VectorRandomFeature.jl") diff --git a/src/RandomFeature_nodatasplit.jl b/src/RandomFeature_nodatasplit.jl deleted file mode 100644 index c14e0a64..00000000 --- a/src/RandomFeature_nodatasplit.jl +++ /dev/null @@ -1,1018 +0,0 @@ - -using RandomFeatures -const RF = RandomFeatures -using EnsembleKalmanProcesses -const EKP = EnsembleKalmanProcesses -using EnsembleKalmanProcesses.Localizers -using ..ParameterDistributions -using ..Utilities -using StableRNGs -using Random - -export calculate_n_hyperparameters, hyperparameters_from_flat, build_default_prior, cov_structure_from_string -export CovarianceStructureType, OneDimFactor, DiagonalFactor, CholeskyFactor, LowRankFactor, HierarchicalLowRankFactor -export SeparableKernel, NonseparableKernel -export get_input_cov_structure, get_output_cov_structure, get_cov_structure -export get_eps -export rank -export shrinkage_cov, nice_cov - - -abstract type RandomFeatureInterface <: MachineLearningTool end - -# Different types of threading -abstract type MultithreadType end -struct TullioThreading <: MultithreadType end -struct EnsembleThreading <: MultithreadType end - - -# Different types of covariance representation in the prior - -abstract type CovarianceStructureType end - -""" -$(DocStringExtensions.TYPEDEF) - -covariance structure for a one-dimensional space -""" -struct OneDimFactor <: CovarianceStructureType end - -""" -$(DocStringExtensions.TYPEDEF) - -builds a diagonal covariance structure -""" -struct DiagonalFactor{FT <: AbstractFloat} <: CovarianceStructureType - "nugget term" - eps::FT -end -DiagonalFactor() = DiagonalFactor(Float64(1.0)) -get_eps(df::DiagonalFactor) = df.eps - -""" -$(DocStringExtensions.TYPEDEF) - -builds a general positive-definite covariance structure -""" -struct CholeskyFactor{FT <: AbstractFloat} <: CovarianceStructureType - "nugget term" - eps::FT -end -CholeskyFactor() = CholeskyFactor(Float64(1.0)) -get_eps(cf::CholeskyFactor) = cf.eps - -""" -$(DocStringExtensions.TYPEDEF) - -builds a covariance structure that deviates from the identity with a low-rank perturbation. 
This perturbation is diagonalized in the low-rank space -""" -struct LowRankFactor{FT <: AbstractFloat} <: CovarianceStructureType - "rank of perturbation" - rank::Int - "nugget term" - eps::FT -end -LowRankFactor(r::Int) = LowRankFactor(r, Float64(1.0)) -get_eps(lrf::LowRankFactor) = lrf.eps - - -""" -$(DocStringExtensions.TYPEDEF) - -builds a covariance structure that deviates from the identity with a more general low-rank perturbation -""" -struct HierarchicalLowRankFactor{FT <: AbstractFloat} <: CovarianceStructureType - "rank of perturbation" - rank::Int - "nugget term" - eps::FT -end - -HierarchicalLowRankFactor(r::Int) = HierarchicalLowRankFactor(r, Float64(1.0)) -get_eps(hlrf::HierarchicalLowRankFactor) = hlrf.eps - -import LinearAlgebra: rank -rank(lrf::LowRankFactor) = lrf.rank -rank(hlrf::HierarchicalLowRankFactor) = hlrf.rank - -# To add a new cov type `T` one must define these 3 methods: -# hyperparameters_from_flat(x::V, d::Int, t::T) where {V <: AbstractVector} -# calculate_n_hyperparameters(d::Int, t::T) -# build_default_prior(name::SS, d::Int, t::T) where {SS <: AbstractString} -# and add a string id to cov_structure_from_string - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -creates some simple covariance structures from strings: ["onedim", "diagonal", "cholesky", "lowrank", hierlowrank"]. See the covariance Structures for more details. -""" -function cov_structure_from_string(s::S, d::Int) where {S <: AbstractString} - if s == "onedim" - return OneDimFactor() - elseif s == "diagonal" - return DiagonalFactor() - elseif s == "cholesky" - return CholeskyFactor() - elseif s == "lowrank" - return LowRankFactor(Int(ceil(sqrt(d)))) - elseif s == "hierlowrank" - return HierarchicalLowRankFactor(Int(ceil(sqrt(d)))) - else - throw( - ArgumentError( - "Recieved unknown `input_cov_structure` keyword $(s), \n please choose from [\"onedim\",\"diagonal\",\"cholesky\", \"lowrank\"] or provide a CovarianceStructureType", - ), - ) - end -end -cov_structure_from_string(cst::CST, args...; kwargs...) where {CST <: CovarianceStructureType} = cst - - -# Different types of kernel -abstract type KernelStructureType end - -""" -$(DocStringExtensions.TYPEDEF) - -Builds a separable kernel, i.e. one that accounts for input and output covariance structure separately -""" -struct SeparableKernel{CST1 <: CovarianceStructureType, CST2 <: CovarianceStructureType} <: KernelStructureType - input_cov_structure::CST1 - output_cov_structure::CST2 -end -get_input_cov_structure(kernel_structure::SeparableKernel) = kernel_structure.input_cov_structure -get_output_cov_structure(kernel_structure::SeparableKernel) = kernel_structure.output_cov_structure - - -""" -$(DocStringExtensions.TYPEDEF) - -Builds a nonseparable kernel, i.e. 
one that accounts for a joint input and output covariance structure -""" -struct NonseparableKernel{CST <: CovarianceStructureType} <: KernelStructureType - cov_structure::CST -end -get_cov_structure(kernel_structure::NonseparableKernel) = kernel_structure.cov_structure - - -# calculate_n_hyperparameters - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -calculates the number of hyperparameters generated by the choice of covariance structure -""" -calculate_n_hyperparameters(d::Int, odf::OneDimFactor) = 0 -calculate_n_hyperparameters(d::Int, df::DiagonalFactor) = d -calculate_n_hyperparameters(d::Int, cf::CholeskyFactor) = Int(d * (d + 1) / 2) -calculate_n_hyperparameters(d::Int, lrf::LowRankFactor) = Int(rank(lrf) + d * rank(lrf)) -calculate_n_hyperparameters(d::Int, hlrf::HierarchicalLowRankFactor) = - Int(rank(hlrf) * (rank(hlrf) + 1) / 2 + d * rank(hlrf)) - -# build from flat - - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -reshapes a list of hyperparameters into a covariance matrix based on the selected structure -""" -hyperparameters_from_flat(x::V, odf::OneDimFactor) where {V <: AbstractVector} = nothing - -function hyperparameters_from_flat(x::V, df::DiagonalFactor) where {V <: AbstractVector} - return convert(Matrix, Diagonal(x) + get_eps(df) * I) -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Converts a flat array into a cholesky factor. -""" -function flat_to_chol(x::V) where {V <: AbstractVector} - choldim = Int(floor(sqrt(2 * length(x)))) - cholmat = zeros(choldim, choldim) - for i in 1:choldim - for j in 1:i - cholmat[i, j] = x[sum(0:(i - 1)) + j] - end - end - return cholmat -end - -function hyperparameters_from_flat(x::V, cf::CholeskyFactor) where {V <: AbstractVector} - chol = flat_to_chol(x) - return chol * permutedims(chol, (2, 1)) + get_eps(cf) * I - -end - -function hyperparameters_from_flat(x::V, lrf::LowRankFactor) where {V <: AbstractVector} - - r = rank(lrf) - d = Int(length(x) / r - 1) # l = r + d*r - - D = Diagonal(x[1:r]) # D acts like sing.vals.^2 - U = reshape(x[(r + 1):end], d, r) - qrU = qr(U) # qrU.Q is semi-orthogonal in first r columns, i.e. Q^T.Q = I - return qrU.Q[:, 1:r] * D * qrU.Q[:, 1:r]' + get_eps(lrf) * I - -end - -function hyperparameters_from_flat(x::V, hlrf::HierarchicalLowRankFactor) where {V <: AbstractVector} - r = rank(hlrf) - d = Int(length(x) / r - (r + 1) / 2) # l = r^2 + d*r - - # Ambikasaran, O'Neil, Singh 2016 - L = flat_to_chol(x[1:Int(r * (r + 1) / 2)]) - U = reshape(x[Int(r * (r + 1) / 2 + 1):end], d, r) - K = L * permutedims(L, (2, 1)) + get_eps(hlrf) * I - - qrU = qr(U) - svdT = svd(I + qrU.R * K * qrU.R') # square so T = Q S Q^t - M = svdT.V * Diagonal(sqrt.(svdT.S)) # M M^t = Q sqrt(S) (Q sqrt(S))^t = Q sqrt(S) sqrt(S) Q^t = T - X = M - I - update_factor = I + qrU.Q * X * qrU.Q' - - return update_factor * update_factor' -end - - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -builds a prior distribution for the kernel hyperparameters to initialize optimization. 
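As in the first deleted copy, build_default_prior (next) assembles one named prior per covariance block plus a "sigma" scaling. A sketch for a 3-input/3-output separable low-rank kernel, with names following the deleted code:

nugget = 1e-8
kernel = SeparableKernel(LowRankFactor(2, nugget), LowRankFactor(2, nugget))
prior = build_default_prior(3, 3, kernel)
# combines "input_lowrank_diagonal"/"input_lowrank_U",
# "output_lowrank_diagonal"/"output_lowrank_U", and "sigma"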
-""" -build_default_prior(name::SS, n_hp::Int, odf::OneDimFactor) where {SS <: AbstractString} = nothing - -function build_default_prior(name::SS, n_hp::Int, df::DiagonalFactor) where {SS <: AbstractString} - return ParameterDistribution( - Dict( - "name" => "$(name)_diagonal", - "distribution" => VectorOfParameterized(repeat([Normal(0, 3)], n_hp)), - "constraint" => repeat([bounded_below(0.0)], n_hp), - ), - ) -end - -function build_default_prior(name::SS, n_hp::Int, df::CholeskyFactor) where {SS <: AbstractString} - return constrained_gaussian("$(name)_cholesky", 0.0, 5.0, -Inf, Inf, repeats = n_hp) -end - -function build_default_prior(name::SS, n_hp::Int, lrf::LowRankFactor) where {SS <: AbstractString} - r = rank(lrf) - d = Int(n_hp / r - 1) - D = ParameterDistribution( - Dict( - "name" => "$(name)_lowrank_diagonal", - "distribution" => VectorOfParameterized(repeat([Normal(0, 3)], r)), - "constraint" => repeat([bounded_below(0.0)], r), - ), - ) - U = constrained_gaussian("$(name)_lowrank_U", 0.0, 100.0, -Inf, Inf, repeats = Int(d * r)) - return combine_distributions([D, U]) -end - -function build_default_prior(name::SS, n_hp::Int, hlrf::HierarchicalLowRankFactor) where {SS <: AbstractString} - r = rank(hlrf) - d = Int(n_hp / r - (r + 1) / 2) - L = constrained_gaussian("$(name)_lowrank_Kchol", 0.0, 5.0, -Inf, Inf, repeats = Int(r * (r + 1) / 2)) - U = constrained_gaussian("$(name)_lowrank_U", 0.0, 100.0, -Inf, Inf, repeats = Int(d * r)) - return combine_distributions([L, U]) -end - -# combining input and output spaces: -function calculate_n_hyperparameters( - input_dim::Int, - output_dim::Int, - kernel_structure::SK, -) where {SK <: SeparableKernel} - - input_cov_structure = get_input_cov_structure(kernel_structure) - n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) - output_cov_structure = get_output_cov_structure(kernel_structure) - n_hp_out = calculate_n_hyperparameters(output_dim, output_cov_structure) - n_scalings = 1 - - # if both in and output dim are "OneDimFactors" we still learn 1 hp - if n_hp_in + n_hp_out == 0 - return 1 + n_scalings - end - - return n_hp_in + n_hp_out + n_scalings -end - -function calculate_n_hyperparameters(input_dim::Int, kernel_structure::KST) where {KST <: KernelStructureType} - output_dim = 1 - return calculate_n_hyperparameters(input_dim, output_dim, kernel_structure) -end - -function calculate_n_hyperparameters( - input_dim::Int, - output_dim::Int, - kernel_structure::NK, -) where {NK <: NonseparableKernel} - cov_structure = get_cov_structure(kernel_structure) - n_hp = calculate_n_hyperparameters(input_dim * output_dim, cov_structure) - n_scalings = 1 - # if both in and output dim are "OneDimFactors" we still learn 1 hp - if n_hp == 0 - return 1 + n_scalings - end - - return n_hp + n_scalings -end - - - - -function build_default_prior(input_dim::Int, output_dim::Int, kernel_structure::SK) where {SK <: SeparableKernel} - input_cov_structure = get_input_cov_structure(kernel_structure) - output_cov_structure = get_output_cov_structure(kernel_structure) - - n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) - input_prior = build_default_prior("input", n_hp_in, input_cov_structure) - n_hp_out = calculate_n_hyperparameters(output_dim, output_cov_structure) - output_prior = build_default_prior("output", n_hp_out, output_cov_structure) - - scaling_kern = constrained_gaussian("sigma", 1, 5, 0, Inf) - - # We only use OneDimFactor values, default to input if both in and output dimension are one dimensional.Otherwise they 
are ignored - if isnothing(input_prior) && isnothing(output_prior) - onedim_prior = constrained_gaussian("onedim", 1.0, 1.0, 0.0, Inf) - return combine_distributions([onedim_prior, scaling_kern]) - elseif isnothing(input_prior) - return combine_distributions([output_prior, scaling_kern]) - elseif isnothing(output_prior) - return combine_distributions([input_prior, scaling_kern]) - else - return combine_distributions([input_prior, output_prior, scaling_kern]) - end -end - -function build_default_prior(input_dim::Int, kernel_structure::KST) where {KST <: KernelStructureType} - output_dim = 1 - return build_default_prior(input_dim, output_dim, kernel_structure) -end - -function build_default_prior(input_dim::Int, output_dim::Int, kernel_structure::NK) where {NK <: NonseparableKernel} - cov_structure = get_cov_structure(kernel_structure) - n_hp = calculate_n_hyperparameters(input_dim * output_dim, cov_structure) - if n_hp == 0 - pd_kern = constrained_gaussian("onedim", 1.0, 1.0, 0.0, Inf) - else - pd_kern = build_default_prior("full", n_hp, cov_structure) - end - - scaling_kern = constrained_gaussian("sigma", 1, 5, 0, Inf) - return combine_distributions([pd_kern, scaling_kern]) -end - -function hyperparameters_from_flat( - x::VV, - input_dim::Int, - output_dim::Int, - kernel_structure::SK, -) where {VV <: AbstractVector, SK <: SeparableKernel} - input_cov_structure = get_input_cov_structure(kernel_structure) - output_cov_structure = get_output_cov_structure(kernel_structure) - - n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) - U = hyperparameters_from_flat(x[1:n_hp_in], input_cov_structure) - V = hyperparameters_from_flat(x[(n_hp_in + 1):end], output_cov_structure) - - if isnothing(U) && isnothing(V) #i.e. both are from OneDimFactor cov structures - return x[1] * ones(1, 1), V - elseif isnothing(U) - return U, 0.5 * (V + permutedims(V, (2, 1))) - elseif isnothing(V) - return 0.5 * (U + permutedims(U, (2, 1))), V - else - # symmetrize (sometimes there are numerical artifacts) - U = 0.5 * (U + permutedims(U, (2, 1))) - V = 0.5 * (V + permutedims(V, (2, 1))) - return U, V - end -end - -function hyperparameters_from_flat( - x::VV, - input_dim::Int, - kernel_structure::SK, -) where {VV <: AbstractVector, SK <: SeparableKernel} - output_dim = 1 - U, V = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) - - return U -end - -function hyperparameters_from_flat( - x::VV, - input_dim::Int, - output_dim::Int, - kernel_structure::NK, -) where {VV <: AbstractVector, NK <: NonseparableKernel} - cov_structure = get_cov_structure(kernel_structure) - C = hyperparameters_from_flat(x, cov_structure) - if isnothing(C) - return x[1] * ones(1, 1) - else - return 0.5 * (C + permutedims(C, (2, 1))) - end - -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Calculates key quantities (mean, covariance, and coefficients) of the objective function to be optimized for hyperparameter training. Buffers: mean_store, cov_store, buffer are provided to aid memory management. 
-""" -function calculate_mean_cov_and_coeffs( - rfi::RFI, - rng::RNG, - l::ForVM, - regularization::MorUSorD, - n_features::Int, - n_train::Int, - n_test::Int, - batch_sizes::Union{Dict{S, Int}, Nothing}, - io_pairs::PairedDataContainer, - decomp_type::S, - mean_store::M, - cov_store::A, - buffer::A, - multithread_type::MT, -) where { - RFI <: RandomFeatureInterface, - RNG <: AbstractRNG, - ForVM <: Union{AbstractFloat, AbstractVecOrMat}, - S <: AbstractString, - MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, - M <: AbstractMatrix{<:AbstractFloat}, - A <: AbstractArray{<:AbstractFloat, 3}, - MT <: MultithreadType, -} - - # split data into train/test - itrain = get_inputs(io_pairs) - otrain = get_outputs(io_pairs) - io_train_cost = PairedDataContainer(itrain, otrain) -# itest = get_inputs(io_pairs)[:, (n_train + 1):end] -# otest = get_outputs(io_pairs)[:, (n_train + 1):end] - itest = get_inputs(io_pairs) - otest = get_outputs(io_pairs) - input_dim = size(itrain, 1) - output_dim = size(otrain, 1) - n_test = size(itest, 2) - # build and fit the RF - rfm = RFM_from_hyperparameters( - rfi, - rng, - l, - regularization, - n_features, - batch_sizes, - input_dim, - output_dim, - multithread_type, - ) - fitted_features = RF.Methods.fit(rfm, io_train_cost, decomposition_type = decomp_type) - - #we want to calc 1/var(y-mean)^2 + lambda/m * coeffs^2 in the end - thread_opt = isa(multithread_type, TullioThreading) - RF.Methods.predict!( - rfm, - fitted_features, - DataContainer(itest), - mean_store, - cov_store, - buffer, - tullio_threading = thread_opt, - ) - - # sizes (output_dim x n_test), (output_dim x output_dim x n_test) - - ## TODO - the theory states that the following should be set: - scaled_coeffs = sqrt(1 / (n_features)) * RF.Methods.get_coeffs(fitted_features) - - #scaled_coeffs = 1e-3 * rand(n_features)#overwrite with noise... - # However the convergence is much improved with setting this to zero: - #scaled_coeffs = 0 - if decomp_type == "cholesky" - chol_fac = RF.Methods.get_decomposition(RF.Methods.get_feature_factors(fitted_features)).L - - complexity = 2 * sum(log(chol_fac[i, i]) for i in 1:size(chol_fac, 1)) - else - svd_singval = RF.Methods.get_decomposition(RF.Methods.get_feature_factors(fitted_features)).S - complexity = sum(log, svd_singval) # note this is log(abs(det)) - end - complexity = sqrt(complexity) - return scaled_coeffs, complexity - -end - - - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Calculate the empirical covariance, additionally applying a shrinkage operator (here the Ledoit Wolf 2004 shrinkage operation). 
Known to have better stability properties than the plain Monte-Carlo estimate for low sample sizes.
-"""
-function shrinkage_cov(sample_mat::AA) where {AA <: AbstractMatrix}
-    n_out, n_sample = size(sample_mat)
-
-    # de-mean (as we will use the samples directly for calculation of β)
-    sample_mat_zeromean = sample_mat .- mean(sample_mat, dims = 2)
-    # Ledoit Wolf shrinkage to I
-
-    # get sample covariance
-    Γ = cov(sample_mat_zeromean, dims = 2)
-    # estimate opt shrinkage
-    μ_shrink = 1 / n_out * tr(Γ)
-    δ_shrink = norm(Γ - μ_shrink * I)^2 / n_out # (scaled) frob norm of Γ_m
-    # as we de-meaned, the sample covariance is corrected with an n_sample -> n_sample - 1 factor
-    β_shrink = sum([norm(c * c' - Γ)^2 / n_out for c in eachcol(sample_mat_zeromean)]) / (n_sample - 1)^2
-
-    γ_shrink = min(β_shrink / δ_shrink, 1) # clipping is typically rare
-    # γμI + (1-γ)Γ
-    Γ .*= (1 - γ_shrink)
-    for i in 1:n_out
-        Γ[i, i] += γ_shrink * μ_shrink
-    end
-
-    @info "Shrinkage scale: $(γ_shrink), (0 = none, 1 = revert to scaled Identity)\n shrinkage covariance condition number: $(cond(Γ))"
-    return Γ
-end
-
-
-"""
-$(DocStringExtensions.TYPEDSIGNATURES)
-
-Calculate the empirical covariance, additionally applying the Noise Informed Covariance Estimator (NICE), Vishny, Morzfeld et al. 2024.
-"""
-function nice_cov(sample_mat::AA, n_samples = 400, δ::FT = 1.0) where {AA <: AbstractMatrix, FT <: Real}
-
-    n_sample_cov = size(sample_mat, 2)
-    Γ = cov(sample_mat, dims = 2)
-
-    bd_tol = 1e8 * eps()
-
-    v = sqrt.(diag(Γ))
-    V = Diagonal(v) #stds
-    V_inv = inv(V)
-    corr = clamp.(V_inv * Γ * V_inv, -1 + bd_tol, 1 - bd_tol) # full corr
-
-    # parameter sweep over the exponents
-    max_exponent = 2 * 5 # must be even
-    interp_steps = 100
-    # find the variability in the corr coeff matrix entries
-    std_corrs = approximate_corr_std.(corr, n_sample_cov, n_samples) # Found in EKP.Localizers !! slowest part of code -> could speed up by precomputing an interpolation of [-1,1]
-
-    std_tol = sqrt(sum(std_corrs .^ 2))
-    α_min_exceeded = [max_exponent]
-    for α in 2:2:max_exponent # even exponents give a PSD
-        corr_psd = corr .^ (α + 1) # abs not needed as α even
-        # find the first exponent that exceeds the noise tolerance in norm
-        if norm(corr_psd - corr) > δ * std_tol
-            α_min_exceeded[1] = α
-            break
-        end
-    end
-    corr_psd = corr .^ α_min_exceeded[1]
-    corr_psd_prev = corr .^ (α_min_exceeded[1] - 2) # previous PSD correction
-
-    for α in LinRange(1.0, 0.0, interp_steps)
-        corr_interp = ((1 - α) * (corr_psd_prev) + α * corr_psd) .* corr
-        if norm(corr_interp - corr) < δ * std_tol
-            corr[:, :] = corr_interp #update the correlation matrix block
-            break
-        end
-    end
-    out = posdef_correct(V * corr * V) # rebuild the cov matrix
-    @info "NICE-adjusted covariance condition number: $(cond(out))"
-    return out
-
-end
-
-
-
-"""
-$(DocStringExtensions.TYPEDSIGNATURES)
-
-Estimation of objective function (marginal log-likelihood) variability for optimization of hyperparameters. Calculated empirically by evaluating the objective many times at a "reasonable" value.
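Schematically (sizes invented; the matrix stands in for the stacked samples `vcat(blockmeans, coeffl2norm, complexity)` built below):

    objective_samples = randn(12 + 2, 50)    # (n_test * output_dim + 2) rows, one column per sample
    Γ = shrinkage_cov(objective_samples)     # cov_correction = "shrinkage" (the default)
    # Γ = nice_cov(objective_samples)        # cov_correction = "nice"
    # Γ = cov(objective_samples, dims = 2)   # no correction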
`multithread_type` is used as dispatch, either `ensemble` (thread over the empirical samples) or `tullio` (serial samples, and thread the linear algebra within each sample) -""" -function estimate_mean_and_coeffnorm_covariance( - rfi::RFI, - rng::RNG, - l::ForVM, - regularization::MorUSorD, - n_features::Int, - n_train::Int, - n_test::Int, - batch_sizes::Union{Dict{S, Int}, Nothing}, - io_pairs::PairedDataContainer, - n_samples::Int, - decomp_type::S, - multithread_type::TullioThreading; - repeats::Int = 1, - cov_correction = "shrinkage", -) where { - RFI <: RandomFeatureInterface, - RNG <: AbstractRNG, - ForVM <: Union{AbstractFloat, AbstractVecOrMat}, - S <: AbstractString, - MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, -} - - output_dim = size(get_outputs(io_pairs), 1) - - means = zeros(output_dim, n_samples, n_test) - mean_of_covs = zeros(output_dim, output_dim, n_test) - moc_tmp = similar(mean_of_covs) - mtmp = zeros(output_dim, n_test) - buffer = zeros(n_test, output_dim, n_features) - complexity = zeros(1, n_samples) - coeffl2norm = zeros(1, n_samples) - println("estimate cov with " * string(n_samples * repeats) * " iterations...") - - for i in ProgressBar(1:n_samples) - for j in 1:repeats - c, cplxty = calculate_mean_cov_and_coeffs( - rfi, - rng, - l, - regularization, - n_features, - n_train, - n_test, - batch_sizes, - io_pairs, - decomp_type, - mtmp, - moc_tmp, - buffer, - multithread_type, - ) - - # m output_dim x n_test - # v output_dim x output_dim x n_test - # c n_features - # cplxty 1 - - # update vbles needed for cov - means[:, i, :] .+= mtmp ./ repeats - coeffl2norm[1, i] += sqrt(sum(abs2, c)) / repeats - complexity[1, i] += cplxty / repeats - - # update vbles needed for mean - @. mean_of_covs += moc_tmp / (repeats * n_samples) - - end - end - means = permutedims(means, (3, 2, 1)) - mean_of_covs = permutedims(mean_of_covs, (3, 1, 2)) - - approx_σ2 = zeros(n_test * output_dim, n_test * output_dim) - blockmeans = zeros(n_test * output_dim, n_samples) - for i in 1:n_test - id = ((i - 1) * output_dim + 1):(i * output_dim) - approx_σ2[id, id] = mean_of_covs[i, :, :] # this ordering, so we can take a mean/cov in dims = 2. - blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) - end - - sample_mat = vcat(blockmeans, coeffl2norm, complexity) - - if cov_correction == "shrinkage" - Γ = shrinkage_cov(sample_mat) - elseif cov_correction == "nice" - Γ = nice_cov(sample_mat) - else - Γ = cov(sample_mat, dims = 2) - end - - if !isposdef(approx_σ2) - println("approx_σ2 not posdef") - approx_σ2 = posdef_correct(approx_σ2) - end - - return Γ, approx_σ2 - -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Evaluate an objective function (marginal log-likelihood) over an ensemble of hyperparameter values (used for EKI). 
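A shape sketch (sizes invented; the remaining arguments are elided, so the call itself is left commented out):

    n_hp, N_ens = 4, 40
    lmat = randn(n_hp, N_ens)  # one hyperparameter vector per column, i.e. per ensemble member
    # g, blockcovmat = calculate_ensemble_mean_and_coeffnorm(rfi, rng, lmat, ...)
    # size(g) == (n_test * output_dim + 2, N_ens): stacked predicted means, a coefficient-norm row, a complexity row
    # size(blockcovmat) == (n_test * output_dim, n_test * output_dim): block-diagonal mean of predictive covariances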
`multithread_type` is used as dispatch, either `ensemble` (thread over the ensemble members) or `tullio` (serial samples, and thread the linear algebra within each sample) -""" -function calculate_ensemble_mean_and_coeffnorm( - rfi::RFI, - rng::RNG, - lvecormat::VorM, - regularization::MorUSorD, - n_features::Int, - n_train::Int, - n_test::Int, - batch_sizes::Union{Dict{S, Int}, Nothing}, - io_pairs::PairedDataContainer, - decomp_type::S, - multithread_type::TullioThreading; - repeats::Int = 1, -) where { - RFI <: RandomFeatureInterface, - RNG <: AbstractRNG, - VorM <: AbstractVecOrMat, - S <: AbstractString, - MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, -} - if isa(lvecormat, AbstractVector) - lmat = reshape(lvecormat, 1, :) - else - lmat = lvecormat - end - N_ens = size(lmat, 2) - output_dim = size(get_outputs(io_pairs), 1) - - means = zeros(output_dim, N_ens, n_test) - mean_of_covs = zeros(output_dim, output_dim, n_test) - buffer = zeros(n_test, output_dim, n_features) - complexity = zeros(1, N_ens) - coeffl2norm = zeros(1, N_ens) - moc_tmp = similar(mean_of_covs) - mtmp = zeros(output_dim, n_test) - - println("calculating " * string(N_ens * repeats) * " ensemble members...") - - for i in ProgressBar(1:N_ens) - for j in collect(1:repeats) - l = lmat[:, i] - - c, cplxty = calculate_mean_cov_and_coeffs( - rfi, - rng, - l, - regularization, - n_features, - n_train, - n_test, - batch_sizes, - io_pairs, - decomp_type, - mtmp, - moc_tmp, - buffer, - multithread_type, - ) - # m output_dim x n_test - # v output_dim x output_dim x n_test - # c n_features - means[:, i, :] += mtmp ./ repeats - @. mean_of_covs += moc_tmp / (repeats * N_ens) - coeffl2norm[1, i] += sqrt(sum(c .^ 2)) / repeats - complexity[1, i] += cplxty / repeats - end - end - means = permutedims(means, (3, 2, 1)) - mean_of_covs = permutedims(mean_of_covs, (3, 1, 2)) - blockcovmat = zeros(n_test * output_dim, n_test * output_dim) - blockmeans = zeros(n_test * output_dim, N_ens) - for i in 1:n_test - id = ((i - 1) * output_dim + 1):(i * output_dim) - blockcovmat[id, id] = mean_of_covs[i, :, :] - blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) - end - - if !isposdef(blockcovmat) - println("blockcovmat not posdef") - blockcovmat = posdef_correct(blockcovmat) - end - - return vcat(blockmeans, coeffl2norm, complexity), blockcovmat - -end - - -function estimate_mean_and_coeffnorm_covariance( - rfi::RFI, - rng::RNG, - l::ForVM, - regularization::MorUSorD, - n_features::Int, - n_train::Int, - n_test::Int, - batch_sizes::Union{Dict{S, Int}, Nothing}, - io_pairs::PairedDataContainer, - n_samples::Int, - decomp_type::S, - multithread_type::EnsembleThreading; - repeats::Int = 1, - cov_correction = "shrinkage", -) where { - RFI <: RandomFeatureInterface, - RNG <: AbstractRNG, - ForVM <: Union{AbstractFloat, AbstractVecOrMat}, - S <: AbstractString, - MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, -} - - output_dim = size(get_outputs(io_pairs), 1) - - println("estimate cov with " * string(n_samples * repeats) * " iterations...") - - nthreads = Threads.nthreads() - rng_seed = randperm(rng, 10^5)[1] # dumb way to get a random integer in 1:10^5 - rng_list = [Random.MersenneTwister(rng_seed + i) for i in 1:nthreads] - - c_list = [zeros(1, n_samples) for i in 1:nthreads] - cp_list = [zeros(1, n_samples) for i in 1:nthreads] - m_list = [zeros(output_dim, n_samples, n_test) for i in 1:nthreads] - moc_list = [zeros(output_dim, output_dim, n_test) for i in 1:nthreads] - moc_tmp_list = [zeros(output_dim, output_dim, n_test) for i in 
1:nthreads]
-    mtmp_list = [zeros(output_dim, n_test) for i in 1:nthreads]
-    buffer_list = [zeros(n_test, output_dim, n_features) for i in 1:nthreads]
-
-
-    Threads.@threads for i in ProgressBar(1:n_samples)
-        tid = Threads.threadid()
-        rngtmp = rng_list[tid]
-        mtmp = mtmp_list[tid]
-        moc_tmp = moc_tmp_list[tid]
-        buffer = buffer_list[tid]
-        for j in 1:repeats
-
-            c, cplxty = calculate_mean_cov_and_coeffs(
-                rfi,
-                rngtmp,
-                l,
-                regularization,
-                n_features,
-                n_train,
-                n_test,
-                batch_sizes,
-                io_pairs,
-                decomp_type,
-                mtmp,
-                moc_tmp,
-                buffer,
-                multithread_type,
-            )
-
-
-            # m output_dim x n_test
-            # v output_dim x output_dim x n_test
-            # c n_features
-            # cplxty 1
-
-            # update vbles needed for cov
-            m_list[tid][:, i, :] += mtmp ./ repeats
-            c_list[tid][1, i] += sqrt(sum(abs2, c)) / repeats
-            cp_list[tid][1, i] += cplxty / repeats
-
-            # update vbles needed for mean
-            @. moc_list[tid] += moc_tmp ./ (repeats * n_samples)
-
-        end
-    end
-
-    #put back together after threading
-    mean_of_covs = sum(moc_list)
-    means = sum(m_list)
-    coeffl2norm = sum(c_list)
-    complexity = sum(cp_list)
-    means = permutedims(means, (3, 2, 1))
-    mean_of_covs = permutedims(mean_of_covs, (3, 1, 2))
-
-    approx_σ2 = zeros(n_test * output_dim, n_test * output_dim)
-    blockmeans = zeros(n_test * output_dim, n_samples)
-    for i in 1:n_test
-        id = ((i - 1) * output_dim + 1):(i * output_dim)
-        approx_σ2[id, id] = mean_of_covs[i, :, :] # this ordering, so we can take a mean/cov in dims = 2.
-        blockmeans[id, :] = permutedims(means[i, :, :], (2, 1))
-    end
-
-
-    sample_mat = vcat(blockmeans, coeffl2norm, complexity)
-
-    if cov_correction == "shrinkage"
-        Γ = shrinkage_cov(sample_mat)
-    elseif cov_correction == "nice"
-        Γ = nice_cov(sample_mat)
-    else
-        Γ = cov(sample_mat, dims = 2)
-    end
-
-    if !isposdef(approx_σ2)
-        println("approx_σ2 not posdef")
-        approx_σ2 = posdef_correct(approx_σ2)
-    end
-
-    return Γ, approx_σ2
-
-end
-
-
-
-
-function calculate_ensemble_mean_and_coeffnorm(
-    rfi::RFI,
-    rng::RNG,
-    lvecormat::VorM,
-    regularization::MorUSorD,
-    n_features::Int,
-    n_train::Int,
-    n_test::Int,
-    batch_sizes::Union{Dict{S, Int}, Nothing},
-    io_pairs::PairedDataContainer,
-    decomp_type::S,
-    multithread_type::EnsembleThreading;
-    repeats::Int = 1,
-) where {
-    RFI <: RandomFeatureInterface,
-    RNG <: AbstractRNG,
-    VorM <: AbstractVecOrMat,
-    S <: AbstractString,
-    MorUSorD <: Union{Matrix, UniformScaling, Diagonal},
-}
-    if isa(lvecormat, AbstractVector)
-        lmat = reshape(lvecormat, 1, :)
-    else
-        lmat = lvecormat
-    end
-    N_ens = size(lmat, 2)
-    output_dim = size(get_outputs(io_pairs), 1)
-
-
-
-    println("calculating " * string(N_ens * repeats) * " ensemble members...")
-
-    nthreads = Threads.nthreads()
-    c_list = [zeros(1, N_ens) for i in 1:nthreads]
-    cp_list = [zeros(1, N_ens) for i in 1:nthreads]
-    m_list = [zeros(output_dim, N_ens, n_test) for i in 1:nthreads]
-    moc_list = [zeros(output_dim, output_dim, n_test) for i in 1:nthreads]
-    buffer_list = [zeros(n_test, output_dim, n_features) for i in 1:nthreads]
-    moc_tmp_list = [zeros(output_dim, output_dim, n_test) for i in 1:nthreads]
-    mtmp_list = [zeros(output_dim, n_test) for i in 1:nthreads]
-    rng_seed = randperm(rng, 10^5)[1] # dumb way to get a random integer in 1:10^5
-    rng_list = [Random.MersenneTwister(rng_seed + i) for i in 1:nthreads]
-    Threads.@threads for i in ProgressBar(1:N_ens)
-        tid = Threads.threadid()
-        rngtmp = rng_list[tid]
-        mtmp = mtmp_list[tid]
-        moc_tmp = moc_tmp_list[tid]
-        buffer = buffer_list[tid]
-        l = lmat[:, i]
- for j in collect(1:repeats) - - c, cplxty = calculate_mean_cov_and_coeffs( - rfi, - rngtmp, - l, - regularization, - n_features, - n_train, - n_test, - batch_sizes, - io_pairs, - decomp_type, - mtmp, - moc_tmp, - buffer, - multithread_type, - ) - - # m output_dim x n_test - # v output_dim x output_dim x n_test - # c n_features - m_list[tid][:, i, :] += mtmp ./ repeats - @. moc_list[tid] += moc_tmp ./ (repeats * N_ens) - c_list[tid][1, i] += sqrt(sum(c .^ 2)) / repeats - cp_list[tid][1, i] += cplxty / repeats - - end - end - - # put back together after threading - mean_of_covs = sum(moc_list) - means = sum(m_list) - coeffl2norm = sum(c_list) - complexity = sum(cp_list) - means = permutedims(means, (3, 2, 1)) - mean_of_covs = permutedims(mean_of_covs, (3, 1, 2)) - blockcovmat = zeros(n_test * output_dim, n_test * output_dim) - blockmeans = zeros(n_test * output_dim, N_ens) - for i in 1:n_test - id = ((i - 1) * output_dim + 1):(i * output_dim) - blockcovmat[id, id] = mean_of_covs[i, :, :] - blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) - end - - if !isposdef(blockcovmat) - println("blockcovmat not posdef") - blockcovmat = posdef_correct(blockcovmat) - end - - return vcat(blockmeans, coeffl2norm, complexity), blockcovmat - -end - -include("ScalarRandomFeature.jl") -include("VectorRandomFeature.jl") diff --git a/src/ScalarRandomFeature.jl b/src/ScalarRandomFeature.jl index 3ed90dda..0ba32c63 100644 --- a/src/ScalarRandomFeature.jl +++ b/src/ScalarRandomFeature.jl @@ -157,7 +157,7 @@ function ScalarRandomFeatureInterface( "prior" => prior, #the hyperparameter_prior "n_ensemble" => max(ndims(prior) + 1, 10), #number of ensemble "n_iteration" => 5, # number of eki iterations - "scheduler" => EKP.DataMisfitController(terminate_at=1000), # Adaptive timestepping, + "scheduler" => EKP.DataMisfitController(terminate_at = 1000), # Adaptive timestepping, "cov_sample_multiplier" => 2.0, # multiplier for samples to estimate covariance in optimization scheme "inflation" => 1e-4, # additive inflation ∈ [0,1] with 0 being no inflation "train_fraction" => 0.8, # 80:20 train - test split @@ -341,33 +341,37 @@ function build_models!( train_idx = [] test_idx = [] - n_train=0 - n_test=0 - if n_cross_val_sets == 0 - push!(train_idx,idx_shuffle) - push!(test_idx,idx_shuffle) + n_train = 0 + n_test = 0 + if n_cross_val_sets == 0 + push!(train_idx, idx_shuffle) + push!(test_idx, idx_shuffle) n_cross_val_sets = 1 # now just pretend there is one partition for looping purposes n_train = n_data - n_test = n_data + n_test = n_data else train_fraction = optimizer_options["train_fraction"] n_train = Int(floor(train_fraction * n_data)) n_test = n_data - n_train - - if n_test*n_cross_val_sets > n_data - throw(ArgumentError("train/test split produces cross validation test sets of size $(n_test), out of $(n_data). \"n_cross_val_sets\" optimizer_options keyword < $(Int(floor(n_data/n_test))). Received $n_cross_val_sets")) + + if n_test * n_cross_val_sets > n_data + throw( + ArgumentError( + "train/test split produces cross validation test sets of size $(n_test), out of $(n_data). \"n_cross_val_sets\" optimizer_options keyword < $(Int(floor(n_data/n_test))). 
Received $n_cross_val_sets", + ), + ) end - - for i = 1:n_cross_val_sets - tmp = idx_shuffle[(i-1)*n_test+1:i*n_test] + + for i in 1:n_cross_val_sets + tmp = idx_shuffle[((i - 1) * n_test + 1):(i * n_test)] push!(test_idx, tmp) push!(train_idx, setdiff(collect(1:n_data), tmp)) end end - - - + + + #regularization = I = 1.0 in scalar case regularization = I @@ -377,12 +381,9 @@ function build_models!( n_iteration = optimizer_options["n_iteration"] diagnostics = zeros(n_iteration, n_rfms) for i in 1:n_rfms - - io_pairs_opt = PairedDataContainer( - input_values, - reshape(output_values[i,:],1,size(output_values, 2)), - ) - + + io_pairs_opt = PairedDataContainer(input_values, reshape(output_values[i, :], 1, size(output_values, 2))) + multithread = optimizer_options["multithread"] if multithread == "ensemble" multithread_type = EnsembleThreading() @@ -415,7 +416,7 @@ function build_models!( n_cov_samples_min = n_test + 2 n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 0.0))) observation_vec = [] - for cv_idx = 1:n_cross_val_sets + for cv_idx in 1:n_cross_val_sets internal_Γ, approx_σ2 = estimate_mean_and_coeffnorm_covariance( srfi, rng, @@ -439,13 +440,9 @@ function build_models!( end data = vcat(get_outputs(io_pairs_opt)[test_idx[cv_idx]], 0.0, 0.0) - push!(observation_vec, EKP.Observation( - Dict( - "names" => "$(cv_idx)", - "samples" => data[:], - "covariances" => Γ, - ), - ), + push!( + observation_vec, + EKP.Observation(Dict("names" => "$(cv_idx)", "samples" => data[:], "covariances" => Γ)), ) end observation = combine_observations(observation_vec) @@ -475,9 +472,9 @@ function build_models!( #get parameters: lvec = transform_unconstrained_to_constrained(prior, get_u_final(ekiobj)) - g_ens = zeros(n_cross_val_sets*(n_test+2),n_ensemble) - for cv_idx = 1:n_cross_val_sets - + g_ens = zeros(n_cross_val_sets * (n_test + 2), n_ensemble) + for cv_idx in 1:n_cross_val_sets + g_ens_tmp, _ = calculate_ensemble_mean_and_coeffnorm( srfi, rng, @@ -491,9 +488,9 @@ function build_models!( decomp_type, multithread_type, ) - g_ens[(cv_idx-1)*(n_test+2) + 1: cv_idx*(n_test+2), :] = g_ens_tmp + g_ens[((cv_idx - 1) * (n_test + 2) + 1):(cv_idx * (n_test + 2)), :] = g_ens_tmp end - + inflation = optimizer_options["inflation"] if inflation > 0 terminated = EKP.update_ensemble!(ekiobj, g_ens, additive_inflation = true, s = inflation) # small regularizing inflation diff --git a/src/ScalarRandomFeature_crossval.jl b/src/ScalarRandomFeature_crossval.jl deleted file mode 100644 index fe770289..00000000 --- a/src/ScalarRandomFeature_crossval.jl +++ /dev/null @@ -1,584 +0,0 @@ -export ScalarRandomFeatureInterface -# getters already exported in VRFI - -""" -$(DocStringExtensions.TYPEDEF) - -Structure holding the Scalar Random Feature models. - -# Fields -$(DocStringExtensions.TYPEDFIELDS) - -""" -struct ScalarRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} <: - RandomFeatureInterface - "vector of `RandomFeatureMethod`s, contains the feature structure, batch-sizes and regularization" - rfms::Vector{RF.Methods.RandomFeatureMethod} - "vector of `Fit`s, containing the matrix decomposition and coefficients of RF when fitted to data" - fitted_features::Vector{RF.Methods.Fit} - "batch sizes" - batch_sizes::Union{Dict{S, Int}, Nothing} - "n_features" - n_features::Union{Int, Nothing} - "input dimension" - input_dim::Int - "choice of random number generator" - rng::RNG - "Kernel structure type (e.g. 
Separable or Nonseparable)" - kernel_structure::KST - "Random Feature decomposition, choose from \"svd\" or \"cholesky\" (default)" - feature_decomposition::S - "dictionary of options for hyperparameter optimizer" - optimizer_options::Dict{S} - "diagnostics from optimizer" - optimizer::Vector -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the rfms field -""" -get_rfms(srfi::ScalarRandomFeatureInterface) = srfi.rfms - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the fitted_features field -""" -get_fitted_features(srfi::ScalarRandomFeatureInterface) = srfi.fitted_features - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets batch_sizes the field -""" -get_batch_sizes(srfi::ScalarRandomFeatureInterface) = srfi.batch_sizes - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the n_features field -""" -get_n_features(srfi::ScalarRandomFeatureInterface) = srfi.n_features - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the input_dim field -""" -get_input_dim(srfi::ScalarRandomFeatureInterface) = srfi.input_dim - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the rng field -""" -EKP.get_rng(srfi::ScalarRandomFeatureInterface) = srfi.rng - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the kernel_structure field -""" -get_kernel_structure(srfi::ScalarRandomFeatureInterface) = srfi.kernel_structure - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the feature_decomposition field -""" -get_feature_decomposition(srfi::ScalarRandomFeatureInterface) = srfi.feature_decomposition - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the optimizer_options field -""" -get_optimizer_options(srfi::ScalarRandomFeatureInterface) = srfi.optimizer_options - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the optimizer field -""" -get_optimizer(srfi::ScalarRandomFeatureInterface) = srfi.optimizer - - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Constructs a `ScalarRandomFeatureInterface <: MachineLearningTool` interface for the `RandomFeatures.jl` package for multi-input and single- (or decorrelated-)output emulators. - - `n_features` - the number of random features - - `input_dim` - the dimension of the input space - - `kernel_structure` - - a prescribed form of kernel structure - - `batch_sizes = nothing` - Dictionary of batch sizes passed `RandomFeatures.jl` object (see definition there) - - `rng = Random.GLOBAL_RNG` - random number generator - - `feature_decomposition = "cholesky"` - choice of how to store decompositions of random features, `cholesky` or `svd` available - - `optimizer_options = nothing` - Dict of options to pass into EKI optimization of hyperparameters (defaults created in `ScalarRandomFeatureInterface` constructor): - - "prior": the prior for the hyperparameter optimization - - "prior_in_scale": use this to tune the input prior scale - - "n_ensemble": number of ensemble members - - "n_iteration": number of eki iterations - - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 0.0) - - "scheduler": Learning rate Scheduler (a.k.a. EKP timestepper) Default: DataMisfitController - - "tikhonov": tikhonov regularization parameter if >0 - - "inflation": additive inflation ∈ [0,1] with 0 being no inflation - - "train_fraction": e.g. 0.8 (default) means 80:20 train - test split - - "n_features_opt": fix the number of features for optimization (default `n_features`, as used for prediction) - - "multithread": how to multithread. 
"ensemble" (default) threads across ensemble members "tullio" threads random feature matrix algebra - - "accelerator": use EKP accelerators (default is no acceleration) - - "verbose" => false, verbose optimizer statements - - "cov_correction" => "shrinkage", type of conditioning to improve estimated covariance (Ledoit Wolfe 03), also "nice" for (Vishny, Morzfeld et al. 2024) -""" -function ScalarRandomFeatureInterface( - n_features::Int, - input_dim::Int; - kernel_structure::Union{KST, Nothing} = nothing, - batch_sizes::Union{Dict{S, Int}, Nothing} = nothing, - rng::RNG = Random.GLOBAL_RNG, - feature_decomposition::S = "cholesky", - optimizer_options::Union{Dict{S}, Nothing} = nothing, -) where {S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} - # Initialize vector for GP models - rfms = Vector{RF.Methods.RandomFeatureMethod}(undef, 0) - fitted_features = Vector{RF.Methods.Fit}(undef, 0) - - if isnothing(kernel_structure) - kernel_structure = SeparableKernel(cov_structure_from_string("lowrank", input_dim), OneDimFactor()) - end - KSType = typeof(kernel_structure) - - prior = build_default_prior(input_dim, kernel_structure) - - # default optimizer settings - optimizer_opts = Dict( - "prior" => prior, #the hyperparameter_prior - "n_ensemble" => max(ndims(prior) + 1, 10), #number of ensemble - "n_iteration" => 5, # number of eki iterations - "scheduler" => EKP.DataMisfitController(terminate_at=1000), # Adaptive timestepping, - "cov_sample_multiplier" => 2.0, # multiplier for samples to estimate covariance in optimization scheme - "inflation" => 1e-4, # additive inflation ∈ [0,1] with 0 being no inflation - "train_fraction" => 0.8, # 80:20 train - test split - "n_features_opt" => n_features, # number of features for the optimization - "multithread" => "ensemble", # instead of "tullio" - "verbose" => false, # verbose optimizer statements - "accelerator" => EKP.DefaultAccelerator(), # acceleration with momentum - "localization" => EKP.Localizers.NoLocalization(), # localization / sample error correction for small ensembles - "cov_correction" => "shrinkage", # type of conditioning to improve estimated covariance - "n_cross_val_sets" => 1, - ) - - if !isnothing(optimizer_options) - for key in keys(optimizer_options) - optimizer_opts[key] = optimizer_options[key] - end - end - - opt_tmp = Dict() - for key in keys(optimizer_opts) - if key != "prior" - opt_tmp[key] = optimizer_opts[key] - end - end - @info("hyperparameter optimization with EKI configured with $opt_tmp") - - return ScalarRandomFeatureInterface{S, RNG, KSType}( - rfms, - fitted_features, - batch_sizes, - n_features, - input_dim, - rng, - kernel_structure, - feature_decomposition, - optimizer_opts, - [], - ) -end - -function hyperparameter_distribution_from_flat( - x::VV, - input_dim::Int, - kernel_structure::SK, -) where {VV <: AbstractVector, SK <: SeparableKernel} - - M = zeros(input_dim) #scalar output - U = hyperparameters_from_flat(x, input_dim, kernel_structure) - - if !isposdef(U) - println("U not posdef - correcting") - U = posdef_correct(U) - end - - dist = MvNormal(M, U) - pd = ParameterDistribution( - Dict( - "distribution" => Parameterized(dist), - "constraint" => repeat([no_constraint()], input_dim), - "name" => "xi", - ), - ) - - return pd -end - -function hyperparameter_distribution_from_flat( - x::VV, - input_dim::Int, - kernel_structure::NK, -) where {VV <: AbstractVector, NK <: NonseparableKernel} - throw( - ArgumentError( - "Scalar Kernels must be of type `Separable( *** , OneDimFactor())`, 
received $(kernel_structure)", - ), - ) -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Builds the random feature method from hyperparameters. We use cosine activation functions and a MatrixVariateNormal(M,U,V) distribution (from `Distributions.jl`) with mean M=0, and input covariance U built using a `CovarianceStructureType`. -""" -function RFM_from_hyperparameters( - srfi::ScalarRandomFeatureInterface, - rng::RNG, - l::ForVM, - regularization::MorUSorD, # just a 1x1 matrix though - n_features::Int, - batch_sizes::Union{Dict{S, Int}, Nothing}, - input_dim::Int, - multithread_type::MT; -) where { - RNG <: AbstractRNG, - ForVM <: Union{Real, AbstractVecOrMat}, - MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, - S <: AbstractString, - MT <: MultithreadType, -} - - xi_hp = l[1:(end - 1)] - sigma_hp = l[end] - kernel_structure = get_kernel_structure(srfi) - pd = hyperparameter_distribution_from_flat(xi_hp, input_dim, kernel_structure) - - feature_sampler = RF.Samplers.FeatureSampler(pd, rng = rng) - # Learn hyperparameters for different feature types - feature_parameters = Dict("sigma" => sigma_hp) - sff = RF.Features.ScalarFourierFeature(n_features, feature_sampler, feature_parameters = feature_parameters) - thread_opt = isa(multithread_type, TullioThreading) # if we want to multithread with tullio - if isnothing(batch_sizes) - return RF.Methods.RandomFeatureMethod(sff, regularization = regularization, tullio_threading = thread_opt) - else - return RF.Methods.RandomFeatureMethod( - sff, - regularization = regularization, - batch_sizes = batch_sizes, - tullio_threading = thread_opt, - ) - end -end - -#removes vector-only input arguments -RFM_from_hyperparameters( - srfi::ScalarRandomFeatureInterface, - rng::RNG, - l::ForVM, - regularization::MorUS, # just a 1x1 matrix though - n_features::Int, - batch_sizes::Union{Dict{S, Int}, Nothing}, - input_dim::Int, - output_dim::Int, - multithread_type::MT; -) where { - RNG <: AbstractRNG, - ForVM <: Union{Real, AbstractVecOrMat}, - MorUS <: Union{Matrix, UniformScaling}, - S <: AbstractString, - MT <: MultithreadType, -} = RFM_from_hyperparameters(srfi, rng, l, regularization, n_features, batch_sizes, input_dim, multithread_type) - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Builds the random feature method from hyperparameters. We use cosine activation functions and a Multivariate Normal distribution (from `Distributions.jl`) with mean M=0, and input covariance U built with the `CovarianceStructureType`. -""" -function build_models!( - srfi::ScalarRandomFeatureInterface, - input_output_pairs::PairedDataContainer{FT}, -) where {FT <: AbstractFloat} - # get inputs and outputs - input_values = get_inputs(input_output_pairs) - output_values = get_outputs(input_output_pairs) - n_rfms, n_data = size(output_values) - noise_sd = 1.0 - - input_dim = size(input_values, 1) - - kernel_structure = get_kernel_structure(srfi) - n_hp = calculate_n_hyperparameters(input_dim, kernel_structure) - - - rfms = get_rfms(srfi) - if length(rfms) > 0 - @warn "ScalarRandomFeatureInterface already built. skipping..." - return - end - fitted_features = get_fitted_features(srfi) - n_features = get_n_features(srfi) - batch_sizes = get_batch_sizes(srfi) - rng = get_rng(srfi) - decomp_type = get_feature_decomposition(srfi) - optimizer_options = get_optimizer_options(srfi) - optimizer = get_optimizer(srfi) # empty vector - - # Optimize features with EKP for each output dim - # [1.] 
Split data into test/train 80/20 - train_fraction = optimizer_options["train_fraction"] - n_train = Int(floor(train_fraction * n_data)) - n_test = n_data - n_train - n_features_opt = optimizer_options["n_features_opt"] - - idx_shuffle = randperm(rng, n_data) - n_cross_val_sets = optimizer_options["n_cross_val_sets"] - if n_test*n_cross_val_sets > n_data - throw(ArgumentError("train/test split produces cross validation test sets of size $(n_test), out of $(n_data). \"n_cross_val_sets\" optimizer_options keyword < $(Int(floor(n_data/n_test))). Received $n_cross_val_sets")) - end - train_idx = [] - test_idx = [] - for i = 1:n_cross_val_sets - tmp = idx_shuffle[(i-1)*n_test+1:i*n_test] - push!(test_idx, tmp) - push!(train_idx, setdiff(collect(1:n_data), tmp)) - end - - #regularization = I = 1.0 in scalar case - regularization = I - - @info ( - "hyperparameter learning for $n_rfms models using $n_train training points, $n_test validation points and $n_features_opt features" - ) - n_iteration = optimizer_options["n_iteration"] - diagnostics = zeros(n_iteration, n_rfms) - for i in 1:n_rfms - - io_pairs_opt = PairedDataContainer( - input_values, - reshape(output_values[i,:],1,size(output_values, 2)), - ) - - multithread = optimizer_options["multithread"] - if multithread == "ensemble" - multithread_type = EnsembleThreading() - elseif multithread == "tullio" - multithread_type = TullioThreading() - else - throw( - ArgumentError( - "Unknown optimizer option for multithreading, please choose from \"tullio\" (allows Tullio.jl to control threading in RandomFeatures.jl), or \"ensemble\" (threading is done over the ensemble)", - ), - ) - end - # [2.] Estimate covariance at mean value - prior = optimizer_options["prior"] - - # where prior space has changed we need to rebuild the priors - if ndims(prior) > n_hp - - # comes from having a truncated output_dimension - # TODO not really a truncation here, resetting to default - @info "Original input space of dimension $(get_input_dim(srfi)) has been truncated to $(input_dim). \n Rebuilding prior... number of hyperparameters reduced from $(ndims(prior)) to $(n_hp)." - prior = build_default_prior(input_dim, kernel_structure) - - end - - μ_hp = transform_unconstrained_to_constrained(prior, mean(prior)) - - cov_sample_multiplier = optimizer_options["cov_sample_multiplier"] - cov_correction = optimizer_options["cov_correction"] - n_cov_samples_min = n_test + 2 - n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 0.0))) - observation_vec = [] - for cv_idx = 1:n_cross_val_sets - internal_Γ, approx_σ2 = estimate_mean_and_coeffnorm_covariance( - srfi, - rng, - μ_hp, - regularization, - n_features_opt, - train_idx[cv_idx], - test_idx[cv_idx], - batch_sizes, - io_pairs_opt, - n_cov_samples, - decomp_type, - multithread_type, - cov_correction = cov_correction, - ) - Γ = internal_Γ - Γ[1:n_test, 1:n_test] += regularization # + approx_σ2 - Γ[(n_test + 1):end, (n_test + 1):end] += I - if !isposdef(Γ) - Γ = posdef_correct(Γ) - end - data = vcat(get_outputs(io_pairs_opt)[test_idx[cv_idx]], 0.0, 0.0) - - push!(observation_vec, EKP.Observation( - Dict( - "names" => "$(cv_idx)", - "samples" => data[:], - "covariances" => Γ, - ), - ), - ) - end - observation = combine_observations(observation_vec) - # [3.] 
set up EKP optimization - n_ensemble = optimizer_options["n_ensemble"] - n_iteration = optimizer_options["n_iteration"] - opt_verbose_flag = optimizer_options["verbose"] - scheduler = optimizer_options["scheduler"] - accelerator = optimizer_options["accelerator"] - localization = optimizer_options["localization"] - - initial_params = construct_initial_ensemble(rng, prior, n_ensemble) - ekiobj = EKP.EnsembleKalmanProcess( - initial_params, - observation, - Inversion(), - scheduler = scheduler, - rng = rng, - accelerator = accelerator, - verbose = opt_verbose_flag, - localization_method = localization, - ) - err = zeros(n_iteration) - - # [4.] optimize with EKP - for i in 1:n_iteration - - #get parameters: - lvec = transform_unconstrained_to_constrained(prior, get_u_final(ekiobj)) - g_ens = zeros(n_cross_val_sets*(n_test+2),n_ensemble) - for cv_idx = 1:n_cross_val_sets - - g_ens_tmp, _ = calculate_ensemble_mean_and_coeffnorm( - srfi, - rng, - lvec, - regularization, - n_features_opt, - train_idx[cv_idx], - test_idx[cv_idx], - batch_sizes, - io_pairs_opt, - decomp_type, - multithread_type, - ) - g_ens[(cv_idx-1)*(n_test+2) + 1: cv_idx*(n_test+2), :] = g_ens_tmp - end - - inflation = optimizer_options["inflation"] - if inflation > 0 - terminated = EKP.update_ensemble!(ekiobj, g_ens, additive_inflation = true, s = inflation) # small regularizing inflation - else - terminated = EKP.update_ensemble!(ekiobj, g_ens) # small regularizing inflation - end - if !isnothing(terminated) - break # if the timestep was terminated due to timestepping condition - end - - err[i] = get_error(ekiobj)[end] #mean((params_true - mean(params_i,dims=2)).^2) - end - diagnostics[:, i] = copy(err) - - # [5.] extract optimal hyperparameters - hp_optimal = get_ϕ_mean_final(prior, ekiobj)[:] - - if opt_verbose_flag - names = get_name(prior) - hp_optimal_batch = [hp_optimal[b] for b in batch(prior)] - hp_optimal_range = - [(minimum(hp_optimal_batch[i]), maximum(hp_optimal_batch[i])) for i in 1:length(hp_optimal_batch)] #the min and max of the hparams - prior_conf_interval = [mean(prior) .- 3 * sqrt.(var(prior)), mean(prior) .+ 3 * sqrt.(var(prior))] - pci_constrained = [transform_unconstrained_to_constrained(prior, prior_conf_interval[i]) for i in 1:2] - pcic = [(pci_constrained[1][i], pci_constrained[2][i]) for i in 1:length(pci_constrained[1])] - pcic_batched = [pcic[b][1] for b in batch(prior)] - @info("EKI Optimization result:") - println( - display( - [ - "name" "number of hyperparameters" "optimized value range" "99% prior mass" - names length.(hp_optimal_batch) hp_optimal_range pcic_batched - ], - ), - ) - end - - io_pairs_i = PairedDataContainer(input_values, reshape(output_values[i, :], 1, size(output_values, 2))) - # Now, fit new RF model with the optimized hyperparameters - - rfm_i = RFM_from_hyperparameters( - srfi, - rng, - hp_optimal, - regularization, - n_features, - batch_sizes, - input_dim, - multithread_type, - ) - fitted_features_i = RF.Methods.fit(rfm_i, io_pairs_i, decomposition_type = decomp_type) #fit features - - push!(rfms, rfm_i) - push!(fitted_features, fitted_features_i) - - end - push!(optimizer, diagnostics) - -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Empty method, as optimization takes place within the build_models stage -""" -function optimize_hyperparameters!(srfi::ScalarRandomFeatureInterface, args...; kwargs...) - @info("Random Features already trained. 
continuing...") -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Prediction of data observation (not latent function) at new inputs (passed in as columns in a matrix). That is, we add the observational noise into predictions. -""" -function predict( - srfi::ScalarRandomFeatureInterface, - new_inputs::MM; - multithread = "ensemble", -) where {MM <: AbstractMatrix} - M = length(get_rfms(srfi)) - N_samples = size(new_inputs, 2) - # Predicts columns of inputs: input_dim × N_samples - μ = zeros(M, N_samples) - σ2 = zeros(M, N_samples) - if multithread == "ensemble" - tullio_threading = false - elseif multithread == "tullio" - tullio_threading = true - end - - for i in 1:M - μ[i, :], σ2[i, :] = RF.Methods.predict( - get_rfms(srfi)[i], - get_fitted_features(srfi)[i], - DataContainer(new_inputs), - tullio_threading = tullio_threading, - ) - end - - # add the noise contribution from the regularization - σ2[:, :] = σ2[:, :] .+ 1.0 - - return μ, σ2 -end diff --git a/src/ScalarRandomFeature_nodatasplit.jl b/src/ScalarRandomFeature_nodatasplit.jl deleted file mode 100644 index 1158def1..00000000 --- a/src/ScalarRandomFeature_nodatasplit.jl +++ /dev/null @@ -1,561 +0,0 @@ -export ScalarRandomFeatureInterface -# getters already exported in VRFI - -""" -$(DocStringExtensions.TYPEDEF) - -Structure holding the Scalar Random Feature models. - -# Fields -$(DocStringExtensions.TYPEDFIELDS) - -""" -struct ScalarRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} <: - RandomFeatureInterface - "vector of `RandomFeatureMethod`s, contains the feature structure, batch-sizes and regularization" - rfms::Vector{RF.Methods.RandomFeatureMethod} - "vector of `Fit`s, containing the matrix decomposition and coefficients of RF when fitted to data" - fitted_features::Vector{RF.Methods.Fit} - "batch sizes" - batch_sizes::Union{Dict{S, Int}, Nothing} - "n_features" - n_features::Union{Int, Nothing} - "input dimension" - input_dim::Int - "choice of random number generator" - rng::RNG - "Kernel structure type (e.g. 
Separable or Nonseparable)" - kernel_structure::KST - "Random Feature decomposition, choose from \"svd\" or \"cholesky\" (default)" - feature_decomposition::S - "dictionary of options for hyperparameter optimizer" - optimizer_options::Dict{S} - "diagnostics from optimizer" - optimizer::Vector -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the rfms field -""" -get_rfms(srfi::ScalarRandomFeatureInterface) = srfi.rfms - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the fitted_features field -""" -get_fitted_features(srfi::ScalarRandomFeatureInterface) = srfi.fitted_features - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets batch_sizes the field -""" -get_batch_sizes(srfi::ScalarRandomFeatureInterface) = srfi.batch_sizes - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the n_features field -""" -get_n_features(srfi::ScalarRandomFeatureInterface) = srfi.n_features - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the input_dim field -""" -get_input_dim(srfi::ScalarRandomFeatureInterface) = srfi.input_dim - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the rng field -""" -EKP.get_rng(srfi::ScalarRandomFeatureInterface) = srfi.rng - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the kernel_structure field -""" -get_kernel_structure(srfi::ScalarRandomFeatureInterface) = srfi.kernel_structure - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the feature_decomposition field -""" -get_feature_decomposition(srfi::ScalarRandomFeatureInterface) = srfi.feature_decomposition - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the optimizer_options field -""" -get_optimizer_options(srfi::ScalarRandomFeatureInterface) = srfi.optimizer_options - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the optimizer field -""" -get_optimizer(srfi::ScalarRandomFeatureInterface) = srfi.optimizer - - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Constructs a `ScalarRandomFeatureInterface <: MachineLearningTool` interface for the `RandomFeatures.jl` package for multi-input and single- (or decorrelated-)output emulators. - - `n_features` - the number of random features - - `input_dim` - the dimension of the input space - - `kernel_structure` - - a prescribed form of kernel structure - - `batch_sizes = nothing` - Dictionary of batch sizes passed `RandomFeatures.jl` object (see definition there) - - `rng = Random.GLOBAL_RNG` - random number generator - - `feature_decomposition = "cholesky"` - choice of how to store decompositions of random features, `cholesky` or `svd` available - - `optimizer_options = nothing` - Dict of options to pass into EKI optimization of hyperparameters (defaults created in `ScalarRandomFeatureInterface` constructor): - - "prior": the prior for the hyperparameter optimization - - "prior_in_scale": use this to tune the input prior scale - - "n_ensemble": number of ensemble members - - "n_iteration": number of eki iterations - - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 0.0) - - "scheduler": Learning rate Scheduler (a.k.a. EKP timestepper) Default: DataMisfitController - - "tikhonov": tikhonov regularization parameter if >0 - - "inflation": additive inflation ∈ [0,1] with 0 being no inflation - - "train_fraction": e.g. 0.8 (default) means 80:20 train - test split - - "n_features_opt": fix the number of features for optimization (default `n_features`, as used for prediction) - - "multithread": how to multithread. 
"ensemble" (default) threads across ensemble members "tullio" threads random feature matrix algebra - - "accelerator": use EKP accelerators (default is no acceleration) - - "verbose" => false, verbose optimizer statements - - "cov_correction" => "shrinkage", type of conditioning to improve estimated covariance (Ledoit Wolfe 03), also "nice" for (Vishny, Morzfeld et al. 2024) -""" -function ScalarRandomFeatureInterface( - n_features::Int, - input_dim::Int; - kernel_structure::Union{KST, Nothing} = nothing, - batch_sizes::Union{Dict{S, Int}, Nothing} = nothing, - rng::RNG = Random.GLOBAL_RNG, - feature_decomposition::S = "cholesky", - optimizer_options::Union{Dict{S}, Nothing} = nothing, -) where {S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} - # Initialize vector for GP models - rfms = Vector{RF.Methods.RandomFeatureMethod}(undef, 0) - fitted_features = Vector{RF.Methods.Fit}(undef, 0) - - if isnothing(kernel_structure) - kernel_structure = SeparableKernel(cov_structure_from_string("lowrank", input_dim), OneDimFactor()) - end - KSType = typeof(kernel_structure) - - prior = build_default_prior(input_dim, kernel_structure) - - # default optimizer settings - optimizer_opts = Dict( - "prior" => prior, #the hyperparameter_prior - "n_ensemble" => max(ndims(prior) + 1, 10), #number of ensemble - "n_iteration" => 5, # number of eki iterations - "scheduler" => EKP.DataMisfitController(terminate_at=100), # Adaptive timestepping, - "cov_sample_multiplier" => 2.0, # multiplier for samples to estimate covariance in optimization scheme - "inflation" => 1e-4, # additive inflation ∈ [0,1] with 0 being no inflation - "train_fraction" => 0.8, # 80:20 train - test split - "n_features_opt" => n_features, # number of features for the optimization - "multithread" => "ensemble", # instead of "tullio" - "verbose" => false, # verbose optimizer statements - "accelerator" => EKP.DefaultAccelerator(), # acceleration with momentum - "localization" => EKP.Localizers.NoLocalization(), # localization / sample error correction for small ensembles - "cov_correction" => "shrinkage", # type of conditioning to improve estimated covariance - ) - - if !isnothing(optimizer_options) - for key in keys(optimizer_options) - optimizer_opts[key] = optimizer_options[key] - end - end - - opt_tmp = Dict() - for key in keys(optimizer_opts) - if key != "prior" - opt_tmp[key] = optimizer_opts[key] - end - end - @info("hyperparameter optimization with EKI configured with $opt_tmp") - - return ScalarRandomFeatureInterface{S, RNG, KSType}( - rfms, - fitted_features, - batch_sizes, - n_features, - input_dim, - rng, - kernel_structure, - feature_decomposition, - optimizer_opts, - [], - ) -end - -function hyperparameter_distribution_from_flat( - x::VV, - input_dim::Int, - kernel_structure::SK, -) where {VV <: AbstractVector, SK <: SeparableKernel} - - M = zeros(input_dim) #scalar output - U = hyperparameters_from_flat(x, input_dim, kernel_structure) - - if !isposdef(U) - println("U not posdef - correcting") - U = posdef_correct(U) - end - - dist = MvNormal(M, U) - pd = ParameterDistribution( - Dict( - "distribution" => Parameterized(dist), - "constraint" => repeat([no_constraint()], input_dim), - "name" => "xi", - ), - ) - - return pd -end - -function hyperparameter_distribution_from_flat( - x::VV, - input_dim::Int, - kernel_structure::NK, -) where {VV <: AbstractVector, NK <: NonseparableKernel} - throw( - ArgumentError( - "Scalar Kernels must be of type `Separable( *** , OneDimFactor())`, received $(kernel_structure)", - 
), - ) -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Builds the random feature method from hyperparameters. We use cosine activation functions and a MatrixVariateNormal(M,U,V) distribution (from `Distributions.jl`) with mean M=0, and input covariance U built using a `CovarianceStructureType`. -""" -function RFM_from_hyperparameters( - srfi::ScalarRandomFeatureInterface, - rng::RNG, - l::ForVM, - regularization::MorUSorD, # just a 1x1 matrix though - n_features::Int, - batch_sizes::Union{Dict{S, Int}, Nothing}, - input_dim::Int, - multithread_type::MT; -) where { - RNG <: AbstractRNG, - ForVM <: Union{Real, AbstractVecOrMat}, - MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, - S <: AbstractString, - MT <: MultithreadType, -} - - xi_hp = l[1:(end - 1)] - sigma_hp = l[end] - kernel_structure = get_kernel_structure(srfi) - pd = hyperparameter_distribution_from_flat(xi_hp, input_dim, kernel_structure) - - feature_sampler = RF.Samplers.FeatureSampler(pd, rng = rng) - # Learn hyperparameters for different feature types - feature_parameters = Dict("sigma" => sigma_hp) - sff = RF.Features.ScalarFourierFeature(n_features, feature_sampler, feature_parameters = feature_parameters) - thread_opt = isa(multithread_type, TullioThreading) # if we want to multithread with tullio - if isnothing(batch_sizes) - return RF.Methods.RandomFeatureMethod(sff, regularization = regularization, tullio_threading = thread_opt) - else - return RF.Methods.RandomFeatureMethod( - sff, - regularization = regularization, - batch_sizes = batch_sizes, - tullio_threading = thread_opt, - ) - end -end - -#removes vector-only input arguments -RFM_from_hyperparameters( - srfi::ScalarRandomFeatureInterface, - rng::RNG, - l::ForVM, - regularization::MorUS, # just a 1x1 matrix though - n_features::Int, - batch_sizes::Union{Dict{S, Int}, Nothing}, - input_dim::Int, - output_dim::Int, - multithread_type::MT; -) where { - RNG <: AbstractRNG, - ForVM <: Union{Real, AbstractVecOrMat}, - MorUS <: Union{Matrix, UniformScaling}, - S <: AbstractString, - MT <: MultithreadType, -} = RFM_from_hyperparameters(srfi, rng, l, regularization, n_features, batch_sizes, input_dim, multithread_type) - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Builds the random feature method from hyperparameters. We use cosine activation functions and a Multivariate Normal distribution (from `Distributions.jl`) with mean M=0, and input covariance U built with the `CovarianceStructureType`. -""" -function build_models!( - srfi::ScalarRandomFeatureInterface, - input_output_pairs::PairedDataContainer{FT}, -) where {FT <: AbstractFloat} - # get inputs and outputs - input_values = get_inputs(input_output_pairs) - output_values = get_outputs(input_output_pairs) - n_rfms, n_data = size(output_values) - noise_sd = 1.0 - - input_dim = size(input_values, 1) - - kernel_structure = get_kernel_structure(srfi) - n_hp = calculate_n_hyperparameters(input_dim, kernel_structure) - - - rfms = get_rfms(srfi) - if length(rfms) > 0 - @warn "ScalarRandomFeatureInterface already built. skipping..." - return - end - fitted_features = get_fitted_features(srfi) - n_features = get_n_features(srfi) - batch_sizes = get_batch_sizes(srfi) - rng = get_rng(srfi) - decomp_type = get_feature_decomposition(srfi) - optimizer_options = get_optimizer_options(srfi) - optimizer = get_optimizer(srfi) # empty vector - - # Optimize features with EKP for each output dim - # [1.] 
Split data into test/train 80/20 - train_fraction = optimizer_options["train_fraction"] - n_train=n_data - n_test = n_data -# n_train = Int(floor(train_fraction * n_data)) -# n_test = n_data - n_train - n_features_opt = optimizer_options["n_features_opt"] - - #idx_shuffle = randperm(rng, n_data) - - #regularization = I = 1.0 in scalar case - regularization = I - - - @info ( - "hyperparameter learning for $n_rfms models using $n_train training points, $n_test validation points and $n_features_opt features" - ) - n_iteration = optimizer_options["n_iteration"] - diagnostics = zeros(n_iteration, n_rfms) - for i in 1:n_rfms - io_pairs_opt = PairedDataContainer( - input_values, - reshape(output_values[i,:],1,size(output_values, 2)), - ) - #io_pairs_opt = PairedDataContainer( - # input_values[:, idx_shuffle], - # reshape(output_values[i, idx_shuffle], 1, size(output_values, 2)), - #) - - multithread = optimizer_options["multithread"] - if multithread == "ensemble" - multithread_type = EnsembleThreading() - elseif multithread == "tullio" - multithread_type = TullioThreading() - else - throw( - ArgumentError( - "Unknown optimizer option for multithreading, please choose from \"tullio\" (allows Tullio.jl to control threading in RandomFeatures.jl), or \"ensemble\" (threading is done over the ensemble)", - ), - ) - end - # [2.] Estimate covariance at mean value - prior = optimizer_options["prior"] - - # where prior space has changed we need to rebuild the priors - if ndims(prior) > n_hp - - # comes from having a truncated output_dimension - # TODO not really a truncation here, resetting to default - @info "Original input space of dimension $(get_input_dim(srfi)) has been truncated to $(input_dim). \n Rebuilding prior... number of hyperparameters reduced from $(ndims(prior)) to $(n_hp)." - prior = build_default_prior(input_dim, kernel_structure) - - end - - μ_hp = transform_unconstrained_to_constrained(prior, mean(prior)) - - cov_sample_multiplier = optimizer_options["cov_sample_multiplier"] - cov_correction = optimizer_options["cov_correction"] - n_cov_samples_min = n_test + 2 - n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 0.0))) - - internal_Γ, approx_σ2 = estimate_mean_and_coeffnorm_covariance( - srfi, - rng, - μ_hp, - regularization, - n_features_opt, - n_train, - n_test, - batch_sizes, - io_pairs_opt, - n_cov_samples, - decomp_type, - multithread_type, - cov_correction = cov_correction, - ) - Γ = internal_Γ - Γ[1:n_test, 1:n_test] += regularization # + approx_σ2 - Γ[(n_test + 1):end, (n_test + 1):end] += I - - # [3.] set up EKP optimization - n_ensemble = optimizer_options["n_ensemble"] - n_iteration = optimizer_options["n_iteration"] - opt_verbose_flag = optimizer_options["verbose"] - scheduler = optimizer_options["scheduler"] - accelerator = optimizer_options["accelerator"] - localization = optimizer_options["localization"] - - initial_params = construct_initial_ensemble(rng, prior, n_ensemble) -# data = vcat(get_outputs(io_pairs_opt)[(n_train + 1):end], 0.0, 0.0) - data = vcat(get_outputs(io_pairs_opt)[:], 0.0, 0.0) - ekiobj = EKP.EnsembleKalmanProcess( - initial_params, - data, - Γ, - Inversion(), - scheduler = scheduler, - rng = rng, - accelerator = accelerator, - verbose = opt_verbose_flag, - localization_method = localization, - ) - err = zeros(n_iteration) - - # [4.] 
optimize with EKP - for i in 1:n_iteration - - #get parameters: - lvec = transform_unconstrained_to_constrained(prior, get_u_final(ekiobj)) - g_ens, _ = calculate_ensemble_mean_and_coeffnorm( - srfi, - rng, - lvec, - regularization, - n_features_opt, - n_train, - n_test, - batch_sizes, - io_pairs_opt, - decomp_type, - multithread_type, - ) - - inflation = optimizer_options["inflation"] - if inflation > 0 - terminated = EKP.update_ensemble!(ekiobj, g_ens, additive_inflation = true, s = inflation) # small regularizing inflation - else - terminated = EKP.update_ensemble!(ekiobj, g_ens) # small regularizing inflation - end - if !isnothing(terminated) - break # if the timestep was terminated due to timestepping condition - end - - err[i] = get_error(ekiobj)[end] #mean((params_true - mean(params_i,dims=2)).^2) - end - diagnostics[:, i] = copy(err) - - # [5.] extract optimal hyperparameters - hp_optimal = get_ϕ_mean_final(prior, ekiobj)[:] - - if opt_verbose_flag - names = get_name(prior) - hp_optimal_batch = [hp_optimal[b] for b in batch(prior)] - hp_optimal_range = - [(minimum(hp_optimal_batch[i]), maximum(hp_optimal_batch[i])) for i in 1:length(hp_optimal_batch)] #the min and max of the hparams - prior_conf_interval = [mean(prior) .- 3 * sqrt.(var(prior)), mean(prior) .+ 3 * sqrt.(var(prior))] - pci_constrained = [transform_unconstrained_to_constrained(prior, prior_conf_interval[i]) for i in 1:2] - pcic = [(pci_constrained[1][i], pci_constrained[2][i]) for i in 1:length(pci_constrained[1])] - pcic_batched = [pcic[b][1] for b in batch(prior)] - @info("EKI Optimization result:") - println( - display( - [ - "name" "number of hyperparameters" "optimized value range" "99% prior mass" - names length.(hp_optimal_batch) hp_optimal_range pcic_batched - ], - ), - ) - end - - io_pairs_i = PairedDataContainer(input_values, reshape(output_values[i, :], 1, size(output_values, 2))) - # Now, fit new RF model with the optimized hyperparameters - - rfm_i = RFM_from_hyperparameters( - srfi, - rng, - hp_optimal, - regularization, - n_features, - batch_sizes, - input_dim, - multithread_type, - ) - fitted_features_i = RF.Methods.fit(rfm_i, io_pairs_i, decomposition_type = decomp_type) #fit features - - push!(rfms, rfm_i) - push!(fitted_features, fitted_features_i) - - end - push!(optimizer, diagnostics) - -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Empty method, as optimization takes place within the build_models stage -""" -function optimize_hyperparameters!(srfi::ScalarRandomFeatureInterface, args...; kwargs...) - @info("Random Features already trained. continuing...") -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Prediction of data observation (not latent function) at new inputs (passed in as columns in a matrix). That is, we add the observational noise into predictions. 
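A usage sketch (assuming `srfi` is an already-built `ScalarRandomFeatureInterface`; sizes invented):

    input_dim, n_points = 6, 10
    new_inputs = randn(input_dim, n_points)  # prediction points stored as columns
    # μ, σ2 = predict(srfi, new_inputs)
    # size(μ) == size(σ2) == (n_models, n_points), one row per scalar model
    # σ2 already contains the unit regularization noise (the method adds σ2 .+ 1.0)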
-""" -function predict( - srfi::ScalarRandomFeatureInterface, - new_inputs::MM; - multithread = "ensemble", -) where {MM <: AbstractMatrix} - M = length(get_rfms(srfi)) - N_samples = size(new_inputs, 2) - # Predicts columns of inputs: input_dim × N_samples - μ = zeros(M, N_samples) - σ2 = zeros(M, N_samples) - if multithread == "ensemble" - tullio_threading = false - elseif multithread == "tullio" - tullio_threading = true - end - - for i in 1:M - μ[i, :], σ2[i, :] = RF.Methods.predict( - get_rfms(srfi)[i], - get_fitted_features(srfi)[i], - DataContainer(new_inputs), - tullio_threading = tullio_threading, - ) - end - - # add the noise contribution from the regularization - σ2[:, :] = σ2[:, :] .+ 1.0 - - return μ, σ2 -end diff --git a/src/VectorRandomFeature.jl b/src/VectorRandomFeature.jl index bda02734..d675391d 100644 --- a/src/VectorRandomFeature.jl +++ b/src/VectorRandomFeature.jl @@ -192,7 +192,7 @@ function VectorRandomFeatureInterface( "prior" => prior, #the hyperparameter_prior (note scalings have already been applied) "n_ensemble" => max(ndims(prior) + 1, 10), #number of ensemble "n_iteration" => 5, # number of eki iterations - "scheduler" => EKP.DataMisfitController(terminate_at=1000), # Adaptive timestepping + "scheduler" => EKP.DataMisfitController(terminate_at = 1000), # Adaptive timestepping "cov_sample_multiplier" => 10.0, # multiplier for samples to estimate covariance in optimization scheme "tikhonov" => 0, # tikhonov regularization parameter if >0 "inflation" => 1e-4, # additive inflation ∈ [0,1] with 0 being no inflation @@ -409,27 +409,31 @@ function build_models!( n_features_opt = optimizer_options["n_features_opt"] idx_shuffle = randperm(rng, n_data) n_cross_val_sets = Int(optimizer_options["n_cross_val_sets"]) - + train_idx = [] test_idx = [] - if n_cross_val_sets == 0 - push!(train_idx,idx_shuffle) - push!(test_idx,idx_shuffle) + if n_cross_val_sets == 0 + push!(train_idx, idx_shuffle) + push!(test_idx, idx_shuffle) n_cross_val_sets = 1 # now just pretend there is one partition for looping purposes n_train = n_data - n_test = n_data + n_test = n_data else train_fraction = optimizer_options["train_fraction"] n_train = Int(floor(train_fraction * n_data)) # 20% split n_test = n_data - n_train - if n_test*n_cross_val_sets > n_data - throw(ArgumentError("train/test split produces cross validation test sets of size $(n_test), out of $(n_data). \"n_cross_val_sets\" optimizer_options keyword < $(Int(floor(n_data/n_test))). Received $n_cross_val_sets")) + if n_test * n_cross_val_sets > n_data + throw( + ArgumentError( + "train/test split produces cross validation test sets of size $(n_test), out of $(n_data). \"n_cross_val_sets\" optimizer_options keyword < $(Int(floor(n_data/n_test))). Received $n_cross_val_sets", + ), + ) end - - for i = 1:n_cross_val_sets - tmp = idx_shuffle[(i-1)*n_test+1:i*n_test] + + for i in 1:n_cross_val_sets + tmp = idx_shuffle[((i - 1) * n_test + 1):(i * n_test)] push!(test_idx, tmp) push!(train_idx, setdiff(collect(1:n_data), tmp)) end @@ -453,7 +457,7 @@ function build_models!( regularization = regularization_matrix end - end + end # [2.] 
     # [2.] Estimate covariance at mean value
     μ_hp = transform_unconstrained_to_constrained(prior, mean(prior))
     cov_sample_multiplier = optimizer_options["cov_sample_multiplier"]
@@ -470,7 +474,7 @@ function build_models!(
     n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 0.0)))
     observation_vec = []
     tikhonov_opt_val = optimizer_options["tikhonov"]
-    for cv_idx = 1:n_cross_val_sets
+    for cv_idx in 1:n_cross_val_sets
         internal_Γ, approx_σ2 = estimate_mean_and_coeffnorm_covariance(
             vrfi,
             rng,
@@ -486,7 +490,7 @@ function build_models!(
             multithread_type,
             cov_correction = cov_correction,
         )
-
+
         if tikhonov_opt_val == 0
             # Build the covariance
             Γ = internal_Γ
@@ -494,7 +498,7 @@ function build_models!(
                 isa(regularization, UniformScaling) ? regularization : kron(I(n_test), regularization) # + approx_σ2
             Γ[(n_test * output_dim + 1):end, (n_test * output_dim + 1):end] += I
             data = vcat(reshape(get_outputs(input_output_pairs)[:, test_idx[cv_idx]], :, 1), 0.0, 0.0) #flatten data
-
+
         elseif tikhonov_opt_val > 0
             # augment the state to add tikhonov
             outsize = size(internal_Γ, 1)
@@ -502,37 +506,30 @@ function build_models!(
             Γ[1:outsize, 1:outsize] = internal_Γ
             Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += kron(I(n_test), regularization) # block diag regularization
             Γ[(n_test * output_dim + 1):outsize, (n_test * output_dim + 1):outsize] += I
-
+
             Γ[(outsize + 1):end, (outsize + 1):end] = tikhonov_opt_val .* cov(prior)
-
+
             data = vcat(
-                reshape(get_outputs(input_output_pairs)[:, test_idx[cv_idx]], :, 1),
-                0.0,
-                0.0,
-                zeros(size(Γ, 1) - outsize, 1),
+                reshape(get_outputs(input_output_pairs)[:, test_idx[cv_idx]], :, 1),
+                0.0,
+                0.0,
+                zeros(size(Γ, 1) - outsize, 1),
             ) #flatten data with additional zeros
         else
             throw(
-                ArgumentError(
-                    "Tikhonov parameter must be non-negative, instead received tikhonov_opt_val=$tikhonov_opt_val",
-                ),
+                ArgumentError(
+                    "Tikhonov parameter must be non-negative, instead received tikhonov_opt_val=$tikhonov_opt_val",
+                ),
             )
         end
         if !isposdef(Γ)
             Γ = posdef_correct(Γ)
         end
-        push!(observation_vec, EKP.Observation(
-            Dict(
-                "names" => "$(cv_idx)",
-                "samples" => data[:],
-                "covariances" => Γ,
-            ),
-        ),
-        )
-
+        push!(observation_vec, EKP.Observation(Dict("names" => "$(cv_idx)", "samples" => data[:], "covariances" => Γ)))
+
     end
     observation = combine_observations(observation_vec)
-
+
     # [3.]
set up EKP optimization n_ensemble = optimizer_options["n_ensemble"] # minimal ensemble size n_hp, n_iteration = optimizer_options["n_iteration"] @@ -562,11 +559,11 @@ function build_models!( lvec = get_ϕ_final(prior, ekiobj) if tikhonov_opt_val > 0 - g_ens = zeros(n_cross_val_sets*(output_dim*n_test+input_dim+2),n_ensemble) + g_ens = zeros(n_cross_val_sets * (output_dim * n_test + input_dim + 2), n_ensemble) else - g_ens = zeros(n_cross_val_sets*(output_dim*n_test+2),n_ensemble) + g_ens = zeros(n_cross_val_sets * (output_dim * n_test + 2), n_ensemble) end - for cv_idx = 1:n_cross_val_sets + for cv_idx in 1:n_cross_val_sets g_ens_tmp, _ = calculate_ensemble_mean_and_coeffnorm( vrfi, rng, @@ -588,11 +585,11 @@ function build_models!( else umat = uvecormat end - + g_ens_tmp = vcat(g_ens_tmp, umat) end - g_ens[(cv_idx-1)*(output_dim*n_test+2) + 1: cv_idx*(output_dim*n_test+2), :] = g_ens_tmp + g_ens[((cv_idx - 1) * (output_dim * n_test + 2) + 1):(cv_idx * (output_dim * n_test + 2)), :] = g_ens_tmp end @@ -605,7 +602,7 @@ function build_models!( if !isnothing(terminated) break # if the timestep was terminated due to timestepping condition end - + err[i] = get_error(ekiobj)[end] #mean((params_true - mean(params_i,dims=2)).^2) end diff --git a/src/VectorRandomFeature_crossval.jl b/src/VectorRandomFeature_crossval.jl deleted file mode 100644 index 6455e259..00000000 --- a/src/VectorRandomFeature_crossval.jl +++ /dev/null @@ -1,696 +0,0 @@ -using ProgressBars - -export VectorRandomFeatureInterface -export get_rfms, - get_fitted_features, - get_batch_sizes, - get_n_features, - get_input_dim, - get_output_dim, - get_rng, - get_kernel_structure, - get_optimizer_options, - get_optimizer - -""" -$(DocStringExtensions.TYPEDEF) - -Structure holding the Vector Random Feature models. - -# Fields -$(DocStringExtensions.TYPEDFIELDS) - -""" -struct VectorRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} <: - RandomFeatureInterface - "A vector of `RandomFeatureMethod`s, contains the feature structure, batch-sizes and regularization" - rfms::Vector{RF.Methods.RandomFeatureMethod} - "vector of `Fit`s, containing the matrix decomposition and coefficients of RF when fitted to data" - fitted_features::Vector{RF.Methods.Fit} - "batch sizes" - batch_sizes::Union{Dict{S, Int}, Nothing} - "number of features" - n_features::Union{Int, Nothing} - "input dimension" - input_dim::Int - "output_dimension" - output_dim::Int - "rng" - rng::RNG - "regularization" - regularization::Vector{Union{Matrix, UniformScaling, Diagonal}} - "Kernel structure type (e.g. 
Separable or Nonseparable)" - kernel_structure::KST - "Random Feature decomposition, choose from \"svd\" or \"cholesky\" (default)" - feature_decomposition::S - "dictionary of options for hyperparameter optimizer" - optimizer_options::Dict - "diagnostics from optimizer" - optimizer::Vector -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the rfms field -""" -get_rfms(vrfi::VectorRandomFeatureInterface) = vrfi.rfms - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the fitted_features field -""" -get_fitted_features(vrfi::VectorRandomFeatureInterface) = vrfi.fitted_features - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the batch_sizes field -""" -get_batch_sizes(vrfi::VectorRandomFeatureInterface) = vrfi.batch_sizes - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the n_features field -""" -get_n_features(vrfi::VectorRandomFeatureInterface) = vrfi.n_features - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the input_dim field -""" -get_input_dim(vrfi::VectorRandomFeatureInterface) = vrfi.input_dim - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the output_dim field -""" -get_output_dim(vrfi::VectorRandomFeatureInterface) = vrfi.output_dim - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the rng field -""" -EKP.get_rng(vrfi::VectorRandomFeatureInterface) = vrfi.rng - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the regularization field -""" -get_regularization(vrfi::VectorRandomFeatureInterface) = vrfi.regularization - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the kernel_structure field -""" -get_kernel_structure(vrfi::VectorRandomFeatureInterface) = vrfi.kernel_structure - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the feature_decomposition field -""" -get_feature_decomposition(vrfi::VectorRandomFeatureInterface) = vrfi.feature_decomposition - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the optimizer_options field -""" -get_optimizer_options(vrfi::VectorRandomFeatureInterface) = vrfi.optimizer_options - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the optimizer field -""" -get_optimizer(vrfi::VectorRandomFeatureInterface) = vrfi.optimizer - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Constructs a `VectorRandomFeatureInterface <: MachineLearningTool` interface for the `RandomFeatures.jl` package for multi-input and multi-output emulators. - - `n_features` - the number of random features - - `input_dim` - the dimension of the input space - - `output_dim` - the dimension of the output space - - `kernel_structure` - - a prescribed form of kernel structure - - `batch_sizes = nothing` - Dictionary of batch sizes passed `RandomFeatures.jl` object (see definition there) - - `rng = Random.GLOBAL_RNG` - random number generator - - `feature_decomposition = "cholesky"` - choice of how to store decompositions of random features, `cholesky` or `svd` available - - `optimizer_options = nothing` - Dict of options to pass into EKI optimization of hyperparameters (defaults created in `VectorRandomFeatureInterface` constructor): - - "prior": the prior for the hyperparameter optimization - - "prior_in_scale"/"prior_out_scale": use these to tune the input/output prior scale. - - "n_ensemble": number of ensemble members - - "n_iteration": number of eki iterations - - "scheduler": Learning rate Scheduler (a.k.a. 
EKP timestepper) Default: DataMisfitController - - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 0.0) - - "tikhonov": tikhonov regularization parameter if > 0 - - "inflation": additive inflation ∈ [0,1] with 0 being no inflation - - "train_fraction": e.g. 0.8 (default) means 80:20 train - test split - - "n_features_opt": fix the number of features for optimization (default `n_features`, as used for prediction) - - "multithread": how to multithread. "ensemble" (default) threads across ensemble members "tullio" threads random feature matrix algebra - - "accelerator": use EKP accelerators (default is no acceleration) - - "verbose" => false, verbose optimizer statements to check convergence, priors and optimal parameters. - - "cov_correction" => "shrinkage", type of conditioning to improve estimated covariance (Ledoit Wolfe 03), also "nice" for (Vishny, Morzfeld et al. 2024) - - -""" -function VectorRandomFeatureInterface( - n_features::Int, - input_dim::Int, - output_dim::Int; - kernel_structure::Union{KST, Nothing} = nothing, - batch_sizes::Union{Dict{S, Int}, Nothing} = nothing, - rng::RNG = Random.GLOBAL_RNG, - feature_decomposition::S = "cholesky", - optimizer_options::Union{Dict{S}, Nothing} = nothing, -) where {S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} - - # Initialize vector for RF model - rfms = Vector{RF.Methods.RandomFeatureMethod}(undef, 0) - fitted_features = Vector{RF.Methods.Fit}(undef, 0) - regularization = Vector{Union{Matrix, UniformScaling, Nothing}}(undef, 0) - - if isnothing(kernel_structure) - kernel_structure = SeparableKernel( - cov_structure_from_string("lowrank", input_dim), - cov_structure_from_string("lowrank", output_dim), - ) - end - KSType = typeof(kernel_structure) - prior = build_default_prior(input_dim, output_dim, kernel_structure) - - #Optimization Defaults - optimizer_opts = Dict( - "prior" => prior, #the hyperparameter_prior (note scalings have already been applied) - "n_ensemble" => max(ndims(prior) + 1, 10), #number of ensemble - "n_iteration" => 5, # number of eki iterations - "scheduler" => EKP.DataMisfitController(terminate_at=1000), # Adaptive timestepping - "cov_sample_multiplier" => 10.0, # multiplier for samples to estimate covariance in optimization scheme - "tikhonov" => 0, # tikhonov regularization parameter if >0 - "inflation" => 1e-4, # additive inflation ∈ [0,1] with 0 being no inflation - "train_fraction" => 0.8, # 80:20 train - test split - "n_features_opt" => n_features, # number of features for the optimization - "multithread" => "ensemble", # instead of "tullio" - "verbose" => false, # verbose optimizer statements - "localization" => EKP.Localizers.NoLocalization(), # localization / sample error correction for small ensembles - "accelerator" => EKP.NesterovAccelerator(), # acceleration with momentum - "cov_correction" => "shrinkage", # type of conditioning to improve estimated covariance - "n_cross_val_sets" => 1, - ) - - if !isnothing(optimizer_options) - for key in keys(optimizer_options) - optimizer_opts[key] = optimizer_options[key] - end - end - - opt_tmp = Dict() - for key in keys(optimizer_opts) - if key != "prior" - opt_tmp[key] = optimizer_opts[key] - end - end - @info("hyperparameter optimization with EKI configured with $opt_tmp") - - return VectorRandomFeatureInterface{S, RNG, KSType}( - rfms, - fitted_features, - batch_sizes, - n_features, - input_dim, - output_dim, - rng, - regularization, - kernel_structure, - 
feature_decomposition, - optimizer_opts, - [], - ) -end - - - -function hyperparameter_distribution_from_flat( - x::VV, - input_dim::Int, - output_dim::Int, - kernel_structure::SK, -) where {VV <: AbstractVector, SK <: SeparableKernel} - M = zeros(input_dim, output_dim) # n x p mean - - U, V = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) - - if !isposdef(U) - println("U not posdef - correcting") - U = posdef_correct(U) - end - if !isposdef(V) - println("V not posdef - correcting") - V = posdef_correct(V) - end - - dist = MatrixNormal(M, U, V) - pd = ParameterDistribution( - Dict( - "distribution" => Parameterized(dist), - "constraint" => repeat([no_constraint()], input_dim * output_dim), - "name" => "xi", - ), - ) - return pd -end - -function hyperparameter_distribution_from_flat( - x::VV, - input_dim::Int, - output_dim::Int, - kernel_structure::NK, -) where {VV <: AbstractVector, NK <: NonseparableKernel} - - C = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) - if !isposdef(C) - println("C not posdef - correcting") - C = posdef_correct(C) - end - dist = MvNormal(zeros(input_dim * output_dim), C) - pd = ParameterDistribution( - Dict( - "distribution" => Parameterized(dist), - "constraint" => repeat([no_constraint()], input_dim * output_dim), - "name" => "xi", - ), - ) - return pd -end - - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Builds the random feature method from hyperparameters. We use cosine activation functions and a Matrixvariate Normal distribution with mean M=0, and input(output) covariance U(V) built with a `CovarianceStructureType`. -""" -function RFM_from_hyperparameters( - vrfi::VectorRandomFeatureInterface, - rng::RNG, - l::ForVM, - regularization::MorUSorD, - n_features::Int, - batch_sizes::Union{Dict{S, Int}, Nothing}, - input_dim::Int, - output_dim::Int, - multithread_type::MT; -) where { - S <: AbstractString, - RNG <: AbstractRNG, - ForVM <: Union{AbstractFloat, AbstractVecOrMat}, - MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, - MT <: MultithreadType, -} - - xi_hp = l[1:(end - 1)] - sigma_hp = l[end] - - kernel_structure = get_kernel_structure(vrfi) - pd = hyperparameter_distribution_from_flat(xi_hp, input_dim, output_dim, kernel_structure) - - feature_sampler = RF.Samplers.FeatureSampler(pd, output_dim, rng = rng) - - feature_parameters = Dict("sigma" => sigma_hp) - vff = RF.Features.VectorFourierFeature( - n_features, - output_dim, - feature_sampler, - feature_parameters = feature_parameters, - ) - - thread_opt = isa(multithread_type, TullioThreading) # if we want to multithread with tullio - if isnothing(batch_sizes) - return RF.Methods.RandomFeatureMethod(vff, regularization = regularization, tullio_threading = thread_opt) - else - return RF.Methods.RandomFeatureMethod( - vff, - regularization = regularization, - batch_sizes = batch_sizes, - tullio_threading = thread_opt, - ) - end -end - - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Build Vector Random Feature model for the input-output pairs subject to regularization, and optimizes the hyperparameters with EKP. 
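-
-A typical call might look like the following sketch (here `pairs` is a `PairedDataContainer`
-and `Γ` an observational noise covariance, both assumed from the surrounding application):
-
-```julia
-vrfi = VectorRandomFeatureInterface(500, input_dim, output_dim)
-build_models!(vrfi, pairs; regularization_matrix = Γ)
-# hyperparameters are optimized internally; predict(vrfi, ...) can be called directly afterwards
-```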
-""" -function build_models!( - vrfi::VectorRandomFeatureInterface, - input_output_pairs::PairedDataContainer{FT}; - regularization_matrix::Union{M, Nothing} = nothing, -) where {FT <: AbstractFloat, M <: AbstractMatrix} - - # get inputs and outputs - input_values = get_inputs(input_output_pairs) - output_values = get_outputs(input_output_pairs) - n_rfms, n_data = size(output_values) - - input_dim = size(input_values, 1) - output_dim = size(output_values, 1) - - kernel_structure = get_kernel_structure(vrfi) - - n_hp = calculate_n_hyperparameters(input_dim, output_dim, kernel_structure) - - rfms = get_rfms(vrfi) - if length(rfms) > 0 - @warn "VectorRandomFeatureInterface already built. skipping..." - return - end - - fitted_features = get_fitted_features(vrfi) - n_features = get_n_features(vrfi) - batch_sizes = get_batch_sizes(vrfi) - decomp_type = get_feature_decomposition(vrfi) - optimizer_options = get_optimizer_options(vrfi) - optimizer = get_optimizer(vrfi) - multithread = optimizer_options["multithread"] - if multithread == "ensemble" - multithread_type = EnsembleThreading() - elseif multithread == "tullio" - multithread_type = TullioThreading() - else - throw( - ArgumentError( - "Unknown optimizer option for multithreading, please choose from \"tullio\" (allows Tullio.jl to control threading in RandomFeatures.jl, or \"loops\" (threading optimization loops)", - ), - ) - end - prior = optimizer_options["prior"] - rng = get_rng(vrfi) - - # where prior space has changed we need to rebuild the priors [TODO just build them here in the first place?] - if ndims(prior) > n_hp - - # comes from having a truncated output_dimension - # TODO not really a truncation here, resetting to default - @info "Original input-output space of dimension ($(get_input_dim(vrfi)), $(get_output_dim(vrfi))) has been truncated to ($(input_dim), $(output_dim)). \n Rebuilding prior... number of hyperparameters reduced from $(ndims(prior)) to $(n_hp)." - prior = build_default_prior(input_dim, output_dim, kernel_structure) - end - - # Optimize feature cholesky factors with EKP - # [1.] Split data into test/train (e.g. 80/20) - train_fraction = optimizer_options["train_fraction"] - n_train = Int(floor(train_fraction * n_data)) # 20% split - n_test = n_data - n_train - n_features_opt = optimizer_options["n_features_opt"] - - - @info ( - "hyperparameter learning using $n_train training points, $n_test validation points and $n_features_opt features" - ) - - - # regularization_matrix = nothing when we use scaled SVD to decorrelate the space, - # in this setting, noise = I - if regularization_matrix === nothing - regularization = I - - else - # think of the regularization_matrix as the observational noise covariance, or a related quantity - if !isposdef(regularization_matrix) - regularization = posdef_correct(regularization_matrix) - println("RF regularization matrix is not positive definite, correcting") - - else - regularization = regularization_matrix - end - - end - - idx_shuffle = randperm(rng, n_data) - n_cross_val_sets = optimizer_options["n_cross_val_sets"] - if n_test*n_cross_val_sets > n_data - throw(ArgumentError("train/test split produces cross validation test sets of size $(n_test), out of $(n_data). \"n_cross_val_sets\" optimizer_options keyword < $(Int(floor(n_data/n_test))). Received $n_cross_val_sets")) - end - - train_idx = [] - test_idx = [] - for i = 1:n_cross_val_sets - tmp = idx_shuffle[(i-1)*n_test+1:i*n_test] - push!(test_idx, tmp) - push!(train_idx, setdiff(collect(1:n_data), tmp)) - end - - # [2.] 
Estimate covariance at mean value - μ_hp = transform_unconstrained_to_constrained(prior, mean(prior)) - cov_sample_multiplier = optimizer_options["cov_sample_multiplier"] - cov_correction = optimizer_options["cov_correction"] - if nameof(typeof(kernel_structure)) == :SeparableKernel - if nameof(typeof(get_output_cov_structure(kernel_structure))) == :DiagonalFactor - n_cov_samples_min = n_test + 2 # diagonal case - else - n_cov_samples_min = (n_test * output_dim + 2) - end - else - n_cov_samples_min = (n_test * output_dim + 2) - end - n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 0.0))) - observation_vec = [] - tikhonov_opt_val = optimizer_options["tikhonov"] - for cv_idx = 1:n_cross_val_sets - internal_Γ, approx_σ2 = estimate_mean_and_coeffnorm_covariance( - vrfi, - rng, - μ_hp, # take mean values - regularization, - n_features_opt, - train_idx[cv_idx], - test_idx[cv_idx], - batch_sizes, - input_output_pairs, - n_cov_samples, - decomp_type, - multithread_type, - cov_correction = cov_correction, - ) - - if tikhonov_opt_val == 0 - # Build the covariance - Γ = internal_Γ - Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += - isa(regularization, UniformScaling) ? regularization : kron(I(n_test), regularization) # + approx_σ2 - Γ[(n_test * output_dim + 1):end, (n_test * output_dim + 1):end] += I - data = vcat(reshape(get_outputs(input_output_pairs)[:, test_idx[cv_idx]], :, 1), 0.0, 0.0) #flatten data - - elseif tikhonov_opt_val > 0 - # augment the state to add tikhonov - outsize = size(internal_Γ, 1) - Γ = zeros(outsize + n_hp, outsize + n_hp) - Γ[1:outsize, 1:outsize] = internal_Γ - Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += kron(I(n_test), regularization) # block diag regularization - Γ[(n_test * output_dim + 1):outsize, (n_test * output_dim + 1):outsize] += I - - Γ[(outsize + 1):end, (outsize + 1):end] = tikhonov_opt_val .* cov(prior) - - data = vcat( - reshape(get_outputs(input_output_pairs)[:, test_idx[cv_idx]], :, 1), - 0.0, - 0.0, - zeros(size(Γ, 1) - outsize, 1), - ) #flatten data with additional zeros - else - throw( - ArgumentError( - "Tikhonov parameter must be non-negative, instead received tikhonov_opt_val=$tikhonov_opt_val", - ), - ) - end - if !isposdef(Γ) - Γ = posdef_correct(Γ) - end - push!(observation_vec, EKP.Observation( - Dict( - "names" => "$(cv_idx)", - "samples" => data[:], - "covariances" => Γ, - ), - ), - ) - - end - observation = combine_observations(observation_vec) - - # [3.] set up EKP optimization - n_ensemble = optimizer_options["n_ensemble"] # minimal ensemble size n_hp, - n_iteration = optimizer_options["n_iteration"] - opt_verbose_flag = optimizer_options["verbose"] - scheduler = optimizer_options["scheduler"] - localization = optimizer_options["localization"] - accelerator = optimizer_options["accelerator"] - - - initial_params = construct_initial_ensemble(rng, prior, n_ensemble) - - ekiobj = EKP.EnsembleKalmanProcess( - initial_params, - observation, - Inversion(), - scheduler = scheduler, - rng = rng, - verbose = opt_verbose_flag, - localization_method = localization, - accelerator = accelerator, - ) - err = zeros(n_iteration) - - # [4.] 
optimize with EKP - for i in 1:n_iteration - #get parameters: - lvec = get_ϕ_final(prior, ekiobj) - - if tikhonov_opt_val > 0 - g_ens = zeros(n_cross_val_sets*(output_dim*n_test+input_dim+2),n_ensemble) - else - g_ens = zeros(n_cross_val_sets*(output_dim*n_test+2),n_ensemble) - end - for cv_idx = 1:n_cross_val_sets - g_ens_tmp, _ = calculate_ensemble_mean_and_coeffnorm( - vrfi, - rng, - lvec, - regularization, - n_features_opt, - train_idx[cv_idx], - test_idx[cv_idx], - batch_sizes, - input_output_pairs, - decomp_type, - multithread_type, - ) - if tikhonov_opt_val > 0 - # augment with the computational parameters (u not ϕ) - uvecormat = get_u_final(ekiobj) - if isa(uvecormat, AbstractVector) - umat = reshape(uvecormat, 1, :) - else - umat = uvecormat - end - - g_ens_tmp = vcat(g_ens_tmp, umat) - end - - g_ens[(cv_idx-1)*(output_dim*n_test+2) + 1: cv_idx*(output_dim*n_test+2), :] = g_ens_tmp - - end - - inflation = optimizer_options["inflation"] - if inflation > 0 - terminated = EKP.update_ensemble!(ekiobj, g_ens, additive_inflation = true, s = inflation) # small regularizing inflation - else - terminated = EKP.update_ensemble!(ekiobj, g_ens) # small regularizing inflation - end - if !isnothing(terminated) - break # if the timestep was terminated due to timestepping condition - end - - err[i] = get_error(ekiobj)[end] #mean((params_true - mean(params_i,dims=2)).^2) - - end - push!(optimizer, err) - - # [5.] extract optimal hyperparameters - hp_optimal = get_ϕ_mean_final(prior, ekiobj)[:] - - # Now, fit new RF model with the optimized hyperparameters - if opt_verbose_flag - names = get_name(prior) - hp_optimal_batch = [hp_optimal[b] for b in batch(prior)] - hp_optimal_range = - [(minimum(hp_optimal_batch[i]), maximum(hp_optimal_batch[i])) for i in 1:length(hp_optimal_batch)] #the min and max of the hparams - prior_conf_interval = [mean(prior) .- 3 * sqrt.(var(prior)), mean(prior) .+ 3 * sqrt.(var(prior))] - pci_constrained = [transform_unconstrained_to_constrained(prior, prior_conf_interval[i]) for i in 1:2] - pcic = [(pci_constrained[1][i], pci_constrained[2][i]) for i in 1:length(pci_constrained[1])] - pcic_batched = [pcic[b][1] for b in batch(prior)] - @info("EKI Optimization result:") - - println( - display( - [ - "name" "number of hyperparameters" "optimized value range" "99% prior mass" - names length.(hp_optimal_batch) hp_optimal_range pcic_batched - ], - ), - ) - end - - rfm_tmp = RFM_from_hyperparameters( - vrfi, - rng, - hp_optimal, - regularization, - n_features, - batch_sizes, - input_dim, - output_dim, - multithread_type, - ) - fitted_features_tmp = fit(rfm_tmp, input_output_pairs, decomposition_type = decomp_type) - - - push!(rfms, rfm_tmp) - push!(fitted_features, fitted_features_tmp) - push!(get_regularization(vrfi), regularization) - -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Empty method, as optimization takes place within the build_models stage -""" -function optimize_hyperparameters!(vrfi::VectorRandomFeatureInterface, args...; kwargs...) - @info("Random Features already trained. continuing...") -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Prediction of data observation (not latent function) at new inputs (passed in as columns in a matrix). That is, we add the observational noise into predictions. 
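-
-For example (a sketch; assumes a built `vrfi` with output dimension `p` and `N` query points):
-
-```julia
-μ, σ2 = predict(vrfi, new_inputs)                     # μ is p × N, σ2 is p × p × N
-marginal_sd = [sqrt(σ2[j, j, k]) for j in 1:p, k in 1:N]
-```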
-""" -function predict(vrfi::VectorRandomFeatureInterface, new_inputs::M) where {M <: AbstractMatrix} - input_dim = get_input_dim(vrfi) - output_dim = get_output_dim(vrfi) - rfm = get_rfms(vrfi)[1] - ff = get_fitted_features(vrfi)[1] - optimizer_options = get_optimizer_options(vrfi) - multithread = optimizer_options["multithread"] - if multithread == "ensemble" - tullio_threading = false - elseif multithread == "tullio" - tullio_threading = true - end - - N_samples = size(new_inputs, 2) - # Predicts columns of inputs: input_dim × N_samples - μ, σ2 = RF.Methods.predict(rfm, ff, DataContainer(new_inputs), tullio_threading = "tullio") - # μ, σ2 = RF.Methods.predict(rfm, ff, DataContainer(new_inputs), tullio_threading = tullio_threading) - # sizes (output_dim x n_test), (output_dim x output_dim x n_test) - # add the noise contribution from the regularization - # note this is because we are predicting the data here, not the latent function. - lambda = get_regularization(vrfi)[1] - for i in 1:N_samples - σ2[:, :, i] = 0.5 * (σ2[:, :, i] + permutedims(σ2[:, :, i], (2, 1))) + lambda - - if !isposdef(σ2[:, :, i]) - σ2[:, :, i] = posdef_correct(σ2[:, :, i]) - end - end - return μ, σ2 -end diff --git a/src/VectorRandomFeature_nodatasplit.jl b/src/VectorRandomFeature_nodatasplit.jl deleted file mode 100644 index eca77b9a..00000000 --- a/src/VectorRandomFeature_nodatasplit.jl +++ /dev/null @@ -1,671 +0,0 @@ -using ProgressBars - -export VectorRandomFeatureInterface -export get_rfms, - get_fitted_features, - get_batch_sizes, - get_n_features, - get_input_dim, - get_output_dim, - get_rng, - get_kernel_structure, - get_optimizer_options, - get_optimizer - -""" -$(DocStringExtensions.TYPEDEF) - -Structure holding the Vector Random Feature models. - -# Fields -$(DocStringExtensions.TYPEDFIELDS) - -""" -struct VectorRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} <: - RandomFeatureInterface - "A vector of `RandomFeatureMethod`s, contains the feature structure, batch-sizes and regularization" - rfms::Vector{RF.Methods.RandomFeatureMethod} - "vector of `Fit`s, containing the matrix decomposition and coefficients of RF when fitted to data" - fitted_features::Vector{RF.Methods.Fit} - "batch sizes" - batch_sizes::Union{Dict{S, Int}, Nothing} - "number of features" - n_features::Union{Int, Nothing} - "input dimension" - input_dim::Int - "output_dimension" - output_dim::Int - "rng" - rng::RNG - "regularization" - regularization::Vector{Union{Matrix, UniformScaling, Diagonal}} - "Kernel structure type (e.g. 
Separable or Nonseparable)" - kernel_structure::KST - "Random Feature decomposition, choose from \"svd\" or \"cholesky\" (default)" - feature_decomposition::S - "dictionary of options for hyperparameter optimizer" - optimizer_options::Dict - "diagnostics from optimizer" - optimizer::Vector -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the rfms field -""" -get_rfms(vrfi::VectorRandomFeatureInterface) = vrfi.rfms - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the fitted_features field -""" -get_fitted_features(vrfi::VectorRandomFeatureInterface) = vrfi.fitted_features - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the batch_sizes field -""" -get_batch_sizes(vrfi::VectorRandomFeatureInterface) = vrfi.batch_sizes - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the n_features field -""" -get_n_features(vrfi::VectorRandomFeatureInterface) = vrfi.n_features - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the input_dim field -""" -get_input_dim(vrfi::VectorRandomFeatureInterface) = vrfi.input_dim - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the output_dim field -""" -get_output_dim(vrfi::VectorRandomFeatureInterface) = vrfi.output_dim - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the rng field -""" -EKP.get_rng(vrfi::VectorRandomFeatureInterface) = vrfi.rng - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the regularization field -""" -get_regularization(vrfi::VectorRandomFeatureInterface) = vrfi.regularization - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the kernel_structure field -""" -get_kernel_structure(vrfi::VectorRandomFeatureInterface) = vrfi.kernel_structure - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the feature_decomposition field -""" -get_feature_decomposition(vrfi::VectorRandomFeatureInterface) = vrfi.feature_decomposition - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the optimizer_options field -""" -get_optimizer_options(vrfi::VectorRandomFeatureInterface) = vrfi.optimizer_options - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -gets the optimizer field -""" -get_optimizer(vrfi::VectorRandomFeatureInterface) = vrfi.optimizer - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Constructs a `VectorRandomFeatureInterface <: MachineLearningTool` interface for the `RandomFeatures.jl` package for multi-input and multi-output emulators. - - `n_features` - the number of random features - - `input_dim` - the dimension of the input space - - `output_dim` - the dimension of the output space - - `kernel_structure` - - a prescribed form of kernel structure - - `batch_sizes = nothing` - Dictionary of batch sizes passed `RandomFeatures.jl` object (see definition there) - - `rng = Random.GLOBAL_RNG` - random number generator - - `feature_decomposition = "cholesky"` - choice of how to store decompositions of random features, `cholesky` or `svd` available - - `optimizer_options = nothing` - Dict of options to pass into EKI optimization of hyperparameters (defaults created in `VectorRandomFeatureInterface` constructor): - - "prior": the prior for the hyperparameter optimization - - "prior_in_scale"/"prior_out_scale": use these to tune the input/output prior scale. - - "n_ensemble": number of ensemble members - - "n_iteration": number of eki iterations - - "scheduler": Learning rate Scheduler (a.k.a. 
EKP timestepper) Default: DataMisfitController - - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 0.0) - - "tikhonov": tikhonov regularization parameter if > 0 - - "inflation": additive inflation ∈ [0,1] with 0 being no inflation - - "train_fraction": e.g. 0.8 (default) means 80:20 train - test split - - "n_features_opt": fix the number of features for optimization (default `n_features`, as used for prediction) - - "multithread": how to multithread. "ensemble" (default) threads across ensemble members "tullio" threads random feature matrix algebra - - "accelerator": use EKP accelerators (default is no acceleration) - - "verbose" => false, verbose optimizer statements to check convergence, priors and optimal parameters. - - "cov_correction" => "shrinkage", type of conditioning to improve estimated covariance (Ledoit Wolfe 03), also "nice" for (Vishny, Morzfeld et al. 2024) - - -""" -function VectorRandomFeatureInterface( - n_features::Int, - input_dim::Int, - output_dim::Int; - kernel_structure::Union{KST, Nothing} = nothing, - batch_sizes::Union{Dict{S, Int}, Nothing} = nothing, - rng::RNG = Random.GLOBAL_RNG, - feature_decomposition::S = "cholesky", - optimizer_options::Union{Dict{S}, Nothing} = nothing, -) where {S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} - - # Initialize vector for RF model - rfms = Vector{RF.Methods.RandomFeatureMethod}(undef, 0) - fitted_features = Vector{RF.Methods.Fit}(undef, 0) - regularization = Vector{Union{Matrix, UniformScaling, Nothing}}(undef, 0) - - if isnothing(kernel_structure) - kernel_structure = SeparableKernel( - cov_structure_from_string("lowrank", input_dim), - cov_structure_from_string("lowrank", output_dim), - ) - end - KSType = typeof(kernel_structure) - prior = build_default_prior(input_dim, output_dim, kernel_structure) - - #Optimization Defaults - optimizer_opts = Dict( - "prior" => prior, #the hyperparameter_prior (note scalings have already been applied) - "n_ensemble" => max(ndims(prior) + 1, 10), #number of ensemble - "n_iteration" => 5, # number of eki iterations - "scheduler" => EKP.DataMisfitController(terminate_at=100), # Adaptive timestepping - "cov_sample_multiplier" => 10.0, # multiplier for samples to estimate covariance in optimization scheme - "tikhonov" => 0, # tikhonov regularization parameter if >0 - "inflation" => 1e-4, # additive inflation ∈ [0,1] with 0 being no inflation - "train_fraction" => 0.8, # 80:20 train - test split - "n_features_opt" => n_features, # number of features for the optimization - "multithread" => "ensemble", # instead of "tullio" - "verbose" => false, # verbose optimizer statements - "localization" => EKP.Localizers.NoLocalization(), # localization / sample error correction for small ensembles - "accelerator" => EKP.NesterovAccelerator(), # acceleration with momentum - "cov_correction" => "shrinkage", # type of conditioning to improve estimated covariance - ) - - if !isnothing(optimizer_options) - for key in keys(optimizer_options) - optimizer_opts[key] = optimizer_options[key] - end - end - - opt_tmp = Dict() - for key in keys(optimizer_opts) - if key != "prior" - opt_tmp[key] = optimizer_opts[key] - end - end - @info("hyperparameter optimization with EKI configured with $opt_tmp") - - return VectorRandomFeatureInterface{S, RNG, KSType}( - rfms, - fitted_features, - batch_sizes, - n_features, - input_dim, - output_dim, - rng, - regularization, - kernel_structure, - feature_decomposition, - optimizer_opts, 
- [], - ) -end - - - -function hyperparameter_distribution_from_flat( - x::VV, - input_dim::Int, - output_dim::Int, - kernel_structure::SK, -) where {VV <: AbstractVector, SK <: SeparableKernel} - M = zeros(input_dim, output_dim) # n x p mean - - U, V = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) - - if !isposdef(U) - println("U not posdef - correcting") - U = posdef_correct(U) - end - if !isposdef(V) - println("V not posdef - correcting") - V = posdef_correct(V) - end - - dist = MatrixNormal(M, U, V) - pd = ParameterDistribution( - Dict( - "distribution" => Parameterized(dist), - "constraint" => repeat([no_constraint()], input_dim * output_dim), - "name" => "xi", - ), - ) - return pd -end - -function hyperparameter_distribution_from_flat( - x::VV, - input_dim::Int, - output_dim::Int, - kernel_structure::NK, -) where {VV <: AbstractVector, NK <: NonseparableKernel} - - C = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) - if !isposdef(C) - println("C not posdef - correcting") - C = posdef_correct(C) - end - dist = MvNormal(zeros(input_dim * output_dim), C) - pd = ParameterDistribution( - Dict( - "distribution" => Parameterized(dist), - "constraint" => repeat([no_constraint()], input_dim * output_dim), - "name" => "xi", - ), - ) - return pd -end - - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Builds the random feature method from hyperparameters. We use cosine activation functions and a Matrixvariate Normal distribution with mean M=0, and input(output) covariance U(V) built with a `CovarianceStructureType`. -""" -function RFM_from_hyperparameters( - vrfi::VectorRandomFeatureInterface, - rng::RNG, - l::ForVM, - regularization::MorUSorD, - n_features::Int, - batch_sizes::Union{Dict{S, Int}, Nothing}, - input_dim::Int, - output_dim::Int, - multithread_type::MT; -) where { - S <: AbstractString, - RNG <: AbstractRNG, - ForVM <: Union{AbstractFloat, AbstractVecOrMat}, - MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, - MT <: MultithreadType, -} - - xi_hp = l[1:(end - 1)] - sigma_hp = l[end] - - kernel_structure = get_kernel_structure(vrfi) - pd = hyperparameter_distribution_from_flat(xi_hp, input_dim, output_dim, kernel_structure) - - feature_sampler = RF.Samplers.FeatureSampler(pd, output_dim, rng = rng) - - feature_parameters = Dict("sigma" => sigma_hp) - vff = RF.Features.VectorFourierFeature( - n_features, - output_dim, - feature_sampler, - feature_parameters = feature_parameters, - ) - - thread_opt = isa(multithread_type, TullioThreading) # if we want to multithread with tullio - if isnothing(batch_sizes) - return RF.Methods.RandomFeatureMethod(vff, regularization = regularization, tullio_threading = thread_opt) - else - return RF.Methods.RandomFeatureMethod( - vff, - regularization = regularization, - batch_sizes = batch_sizes, - tullio_threading = thread_opt, - ) - end -end - - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Build Vector Random Feature model for the input-output pairs subject to regularization, and optimizes the hyperparameters with EKP. 
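-
-In this variant the full dataset is reused for both fitting and validation (no data split).
-The EKI target it assembles is, schematically (names as in the function body below):
-
-```julia
-data = vcat(get_outputs(io_pairs_opt)[:], 0.0, 0.0) # flattened outputs, plus two zeros
-# the zeros are the targets for the two coefficient-norm diagnostics appended to the forward map
-```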
-""" -function build_models!( - vrfi::VectorRandomFeatureInterface, - input_output_pairs::PairedDataContainer{FT}; - regularization_matrix::Union{M, Nothing} = nothing, -) where {FT <: AbstractFloat, M <: AbstractMatrix} - - # get inputs and outputs - input_values = get_inputs(input_output_pairs) - output_values = get_outputs(input_output_pairs) - n_rfms, n_data = size(output_values) - - input_dim = size(input_values, 1) - output_dim = size(output_values, 1) - - kernel_structure = get_kernel_structure(vrfi) - - n_hp = calculate_n_hyperparameters(input_dim, output_dim, kernel_structure) - - rfms = get_rfms(vrfi) - if length(rfms) > 0 - @warn "VectorRandomFeatureInterface already built. skipping..." - return - end - - fitted_features = get_fitted_features(vrfi) - n_features = get_n_features(vrfi) - batch_sizes = get_batch_sizes(vrfi) - decomp_type = get_feature_decomposition(vrfi) - optimizer_options = get_optimizer_options(vrfi) - optimizer = get_optimizer(vrfi) - multithread = optimizer_options["multithread"] - if multithread == "ensemble" - multithread_type = EnsembleThreading() - elseif multithread == "tullio" - multithread_type = TullioThreading() - else - throw( - ArgumentError( - "Unknown optimizer option for multithreading, please choose from \"tullio\" (allows Tullio.jl to control threading in RandomFeatures.jl, or \"loops\" (threading optimization loops)", - ), - ) - end - prior = optimizer_options["prior"] - rng = get_rng(vrfi) - - # where prior space has changed we need to rebuild the priors [TODO just build them here in the first place?] - if ndims(prior) > n_hp - - # comes from having a truncated output_dimension - # TODO not really a truncation here, resetting to default - @info "Original input-output space of dimension ($(get_input_dim(vrfi)), $(get_output_dim(vrfi))) has been truncated to ($(input_dim), $(output_dim)). \n Rebuilding prior... number of hyperparameters reduced from $(ndims(prior)) to $(n_hp)." - prior = build_default_prior(input_dim, output_dim, kernel_structure) - end - - # Optimize feature cholesky factors with EKP - # [1.] Split data into test/train (e.g. 80/20) - train_fraction = optimizer_options["train_fraction"] - n_train = n_data - n_test = n_data -# n_train = Int(floor(train_fraction * n_data)) # 20% split -# n_test = n_data - n_train - n_features_opt = optimizer_options["n_features_opt"] - - @info ( - "hyperparameter learning using $n_train training points, $n_test validation points and $n_features_opt features" - ) - - - # regularization_matrix = nothing when we use scaled SVD to decorrelate the space, - # in this setting, noise = I - if regularization_matrix === nothing - regularization = I - - else - # think of the regularization_matrix as the observational noise covariance, or a related quantity - if !isposdef(regularization_matrix) - regularization = posdef_correct(regularization_matrix) - println("RF regularization matrix is not positive definite, correcting") - - else - regularization = regularization_matrix - end - - end - - #idx_shuffle = randperm(rng, n_data) - io_pairs_opt = PairedDataContainer(input_values,output_values) - - #io_pairs_opt = PairedDataContainer( - # input_values[:, idx_shuffle], - # reshape(output_values[:, idx_shuffle], :, size(output_values, 2)), - #) - - # [2.] 
Estimate covariance at mean value - μ_hp = transform_unconstrained_to_constrained(prior, mean(prior)) - cov_sample_multiplier = optimizer_options["cov_sample_multiplier"] - cov_correction = optimizer_options["cov_correction"] - - if nameof(typeof(kernel_structure)) == :SeparableKernel - if nameof(typeof(get_output_cov_structure(kernel_structure))) == :DiagonalFactor - n_cov_samples_min = n_test + 2 # diagonal case - else - n_cov_samples_min = (n_test * output_dim + 2) - end - else - n_cov_samples_min = (n_test * output_dim + 2) - end - n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 0.0))) - - internal_Γ, approx_σ2 = estimate_mean_and_coeffnorm_covariance( - vrfi, - rng, - μ_hp, # take mean values - regularization, - n_features_opt, - n_train, - n_test, - batch_sizes, - io_pairs_opt, - n_cov_samples, - decomp_type, - multithread_type, - cov_correction = cov_correction, - ) - - tikhonov_opt_val = optimizer_options["tikhonov"] - if tikhonov_opt_val == 0 - # Build the covariance - Γ = internal_Γ - Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += - isa(regularization, UniformScaling) ? regularization : kron(I(n_test), regularization) # + approx_σ2 - Γ[(n_test * output_dim + 1):end, (n_test * output_dim + 1):end] += I -# data = vcat(reshape(get_outputs(io_pairs_opt)[:, (n_train + 1):end], :, 1), 0.0, 0.0) #flatten data - data = vcat(get_outputs(io_pairs_opt)[:], 0.0, 0.0) #flatten data - - elseif tikhonov_opt_val > 0 - # augment the state to add tikhonov - outsize = size(internal_Γ, 1) - Γ = zeros(outsize + n_hp, outsize + n_hp) - Γ[1:outsize, 1:outsize] = internal_Γ - Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += kron(I(n_test), regularization) # block diag regularization - Γ[(n_test * output_dim + 1):outsize, (n_test * output_dim + 1):outsize] += I - - Γ[(outsize + 1):end, (outsize + 1):end] = tikhonov_opt_val .* cov(prior) - - data = vcat( -# reshape(get_outputs(io_pairs_opt)[:, (n_train + 1):end], :, 1), - get_outputs(io_pairs_opt)[:], - 0.0, - 0.0, - zeros(size(Γ, 1) - outsize, 1), - ) #flatten data with additional zeros - else - throw( - ArgumentError( - "Tikhonov parameter must be non-negative, instead received tikhonov_opt_val=$tikhonov_opt_val", - ), - ) - end - - # [3.] set up EKP optimization - n_ensemble = optimizer_options["n_ensemble"] # minimal ensemble size n_hp, - n_iteration = optimizer_options["n_iteration"] - opt_verbose_flag = optimizer_options["verbose"] - scheduler = optimizer_options["scheduler"] - localization = optimizer_options["localization"] - accelerator = optimizer_options["accelerator"] - - if !isposdef(Γ) - Γ = posdef_correct(Γ) - end - - initial_params = construct_initial_ensemble(rng, prior, n_ensemble) - - ekiobj = EKP.EnsembleKalmanProcess( - initial_params, - data[:], - Γ, - Inversion(), - scheduler = scheduler, - rng = rng, - verbose = opt_verbose_flag, - localization_method = localization, - accelerator = accelerator, - ) - err = zeros(n_iteration) - - # [4.] 
optimize with EKP - for i in 1:n_iteration - #get parameters: - lvec = get_ϕ_final(prior, ekiobj) - - g_ens, _ = calculate_ensemble_mean_and_coeffnorm( - vrfi, - rng, - lvec, - regularization, - n_features_opt, - n_train, - n_test, - batch_sizes, - io_pairs_opt, - decomp_type, - multithread_type, - ) - if tikhonov_opt_val > 0 - # augment with the computational parameters (u not ϕ) - uvecormat = get_u_final(ekiobj) - if isa(uvecormat, AbstractVector) - umat = reshape(uvecormat, 1, :) - else - umat = uvecormat - end - - g_ens = vcat(g_ens, umat) - end - inflation = optimizer_options["inflation"] - if inflation > 0 - terminated = EKP.update_ensemble!(ekiobj, g_ens, additive_inflation = true, s = inflation) # small regularizing inflation - else - terminated = EKP.update_ensemble!(ekiobj, g_ens) # small regularizing inflation - end - if !isnothing(terminated) - break # if the timestep was terminated due to timestepping condition - end - - err[i] = get_error(ekiobj)[end] #mean((params_true - mean(params_i,dims=2)).^2) - - end - push!(optimizer, err) - - # [5.] extract optimal hyperparameters - hp_optimal = get_ϕ_mean_final(prior, ekiobj)[:] - - # Now, fit new RF model with the optimized hyperparameters - if opt_verbose_flag - names = get_name(prior) - hp_optimal_batch = [hp_optimal[b] for b in batch(prior)] - hp_optimal_range = - [(minimum(hp_optimal_batch[i]), maximum(hp_optimal_batch[i])) for i in 1:length(hp_optimal_batch)] #the min and max of the hparams - prior_conf_interval = [mean(prior) .- 3 * sqrt.(var(prior)), mean(prior) .+ 3 * sqrt.(var(prior))] - pci_constrained = [transform_unconstrained_to_constrained(prior, prior_conf_interval[i]) for i in 1:2] - pcic = [(pci_constrained[1][i], pci_constrained[2][i]) for i in 1:length(pci_constrained[1])] - pcic_batched = [pcic[b][1] for b in batch(prior)] - @info("EKI Optimization result:") - - println( - display( - [ - "name" "number of hyperparameters" "optimized value range" "99% prior mass" - names length.(hp_optimal_batch) hp_optimal_range pcic_batched - ], - ), - ) - end - - rfm_tmp = RFM_from_hyperparameters( - vrfi, - rng, - hp_optimal, - regularization, - n_features, - batch_sizes, - input_dim, - output_dim, - multithread_type, - ) - fitted_features_tmp = fit(rfm_tmp, input_output_pairs, decomposition_type = decomp_type) - - - push!(rfms, rfm_tmp) - push!(fitted_features, fitted_features_tmp) - push!(get_regularization(vrfi), regularization) - -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Empty method, as optimization takes place within the build_models stage -""" -function optimize_hyperparameters!(vrfi::VectorRandomFeatureInterface, args...; kwargs...) - @info("Random Features already trained. continuing...") -end - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Prediction of data observation (not latent function) at new inputs (passed in as columns in a matrix). That is, we add the observational noise into predictions. 
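-
-Each covariance slice is symmetrized and, where necessary, projected back to positive
-definite before being returned, so downstream code may assume (a sketch):
-
-```julia
-μ, σ2 = predict(vrfi, new_inputs)
-@assert all(isposdef(σ2[:, :, k]) for k in 1:size(new_inputs, 2))
-```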
-""" -function predict(vrfi::VectorRandomFeatureInterface, new_inputs::M) where {M <: AbstractMatrix} - input_dim = get_input_dim(vrfi) - output_dim = get_output_dim(vrfi) - rfm = get_rfms(vrfi)[1] - ff = get_fitted_features(vrfi)[1] - optimizer_options = get_optimizer_options(vrfi) - multithread = optimizer_options["multithread"] - if multithread == "ensemble" - tullio_threading = false - elseif multithread == "tullio" - tullio_threading = true - end - - N_samples = size(new_inputs, 2) - # Predicts columns of inputs: input_dim × N_samples - μ, σ2 = RF.Methods.predict(rfm, ff, DataContainer(new_inputs), tullio_threading = "tullio") - # μ, σ2 = RF.Methods.predict(rfm, ff, DataContainer(new_inputs), tullio_threading = tullio_threading) - # sizes (output_dim x n_test), (output_dim x output_dim x n_test) - # add the noise contribution from the regularization - # note this is because we are predicting the data here, not the latent function. - lambda = get_regularization(vrfi)[1] - for i in 1:N_samples - σ2[:, :, i] = 0.5 * (σ2[:, :, i] + permutedims(σ2[:, :, i], (2, 1))) + lambda - - if !isposdef(σ2[:, :, i]) - σ2[:, :, i] = posdef_correct(σ2[:, :, i]) - end - end - return μ, σ2 -end