From 9cb5e89fea944a16a1223c2899ad9cf9e3527a8b Mon Sep 17 00:00:00 2001
From: odunbar
Date: Tue, 4 Jul 2023 15:09:43 -0700
Subject: [PATCH] add Ledoit Wolf shrinkage estimator for cross-validation covariance

different priors
change prior again
add reg into cov-mat for optimization
example tweaks
add reg to scalar method and widen default prior
fix lorenz
fix lorenz
major refactor of RF prior cov
fix lorenz
format
refactored user interface
examples work with new interface
format
resolve merge
format
bug-fix n_hp calc
updates to prior - remove scalings
src for new kernel structure
updated Emulator and Lorenz example with Kernel interface
bugfix symbol checks
Cholesky factorization of K
remove unnecessary warning
needed branch in if-else
error message typo
better nugget term for LRF
sensible results for RF emulator exmaple
bypass JLD2 errors
make low-rank into more of a diagonal decomp
bugfix scatter bounds
added learnable scaling
Lorenz 63 example learning update
map gets l63 pdf
moved L63 into Emulator examples
remove duplicate svd
rebalance loss function (sqrt complexity, zero out scaled coeffs, remove approx_sig2 from noise), greatly improved robustness
example tweaks and fixes for plotting, convergence etc.
clima format
tests pass
clima format
change plots to makie, and small changes
change plots to makie, and small changes
format
docs fix
---
 docs/src/API/RandomFeatures.md | 2 +-
 examples/EDMF_data/Project.toml | 2 +
 examples/EDMF_data/plot_posterior.jl | 23 +-
 examples/EDMF_data/uq_for_edmf.jl | 127 ++++-
 examples/Emulator/GaussianProcess/plot_GP.jl | 3 +-
 examples/Emulator/L63/Project.toml | 9 +
 examples/Emulator/L63/emulate.jl | 207 +++++++
 .../scalar_optimize_and_plot_RF.jl | 397 ++++++-------
 .../vector_optimize_and_plot_RF.jl | 65 ++-
 examples/GCM/Project.toml | 1 +
 examples/GCM/emulate_sample_script.jl | 153 ++++-
 examples/GCM/sbatch_emulate_sample_jl | 2 +-
 examples/Lorenz/GModel_common.jl | 5 +-
 examples/Lorenz/calibrate.jl | 113 ++--
 examples/Lorenz/emulate_sample.jl | 111 ++--
 src/Emulator.jl | 21 +-
 src/RandomFeature.jl | 521 ++++++++++++------
 src/ScalarRandomFeature.jl | 137 +++--
 src/VectorRandomFeature.jl | 213 ++++---
 test/RandomFeature/runtests.jl | 212 +++++--
 20 files changed, 1642 insertions(+), 682 deletions(-)
 create mode 100644 examples/Emulator/L63/Project.toml
 create mode 100644 examples/Emulator/L63/emulate.jl

diff --git a/docs/src/API/RandomFeatures.md b/docs/src/API/RandomFeatures.md
index b5f5956bc..2baffd992 100644
--- a/docs/src/API/RandomFeatures.md
+++ b/docs/src/API/RandomFeatures.md
@@ -31,7 +31,7 @@ get_n_features
 get_input_dim
 get_output_dim
 get_rng
-get_diagonalize_input
+get_kernel_structure
 get_feature_decomposition
 get_optimizer_options
 optimize_hyperparameters!(::ScalarRandomFeatureInterface)
diff --git a/examples/EDMF_data/Project.toml b/examples/EDMF_data/Project.toml
index a865fa8be..2936bdfbe 100644
--- a/examples/EDMF_data/Project.toml
+++ b/examples/EDMF_data/Project.toml
@@ -1,9 +1,11 @@
 [deps]
+CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 CalibrateEmulateSample = "95e48a1f-0bec-4818-9538-3db4340308e3"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
+PairPlots = "43a3c2be-4208-490b-832a-a21dcd55d7da"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
diff --git
a/examples/EDMF_data/plot_posterior.jl b/examples/EDMF_data/plot_posterior.jl index 5173889fb..9ba36b414 100644 --- a/examples/EDMF_data/plot_posterior.jl +++ b/examples/EDMF_data/plot_posterior.jl @@ -1,5 +1,8 @@ # Import modules -using StatsPlots +ENV["GKSwstype"] = "100" +using Plots + +using CairoMakie, PairPlots using JLD2 using Dates @@ -11,11 +14,11 @@ using CalibrateEmulateSample.ParameterDistributions # 2-parameter calibration exp exp_name = "ent-det-calibration" -date_of_run = Date(2022, 7, 15) +date_of_run = Date(2023, 10, 5) # 5-parameter calibration exp #exp_name = "ent-det-tked-tkee-stab-calibration" -#date_of_run = Date(2022,7,14) +#date_of_run = Date(2023,10,4) # Output figure read/write directory figure_save_directory = joinpath(@__DIR__, "output", exp_name, string(date_of_run)) @@ -24,7 +27,7 @@ data_save_directory = joinpath(@__DIR__, "output", exp_name, string(date_of_run) # load posterior_filepath = joinpath(data_save_directory, "posterior.jld2") if !isfile(posterior_filepath) - LoadError(posterior_filepath * " not found. Please check experiment name and date") + throw(ArgumentError(posterior_filepath * " not found. Please check experiment name and date")) else println("Loading posterior distribution from: " * posterior_filepath) posterior = load(posterior_filepath)["posterior"] @@ -40,10 +43,10 @@ density_filepath = joinpath(figure_save_directory, "posterior_dist_comp.png") transformed_density_filepath = joinpath(figure_save_directory, "posterior_dist_phys.png") labels = get_name(posterior) -gr(dpi = 300, size = (nparam_plots * 300, nparam_plots * 300)) -# lower triangular marginal cornerplot -p = cornerplot(permutedims(posterior_samples, (2, 1)), label = labels, compact = true) -trans_p = cornerplot(permutedims(transformed_posterior_samples, (2, 1)), label = labels, compact = true) +data = (; [(Symbol(labels[i]), posterior_samples[i, :]) for i in 1:length(labels)]...) +transformed_data = (; [(Symbol(labels[i]), transformed_posterior_samples[i, :]) for i in 1:length(labels)]...) 
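# Annotation (not part of the patch): this hunk swaps the StatsPlots cornerplot for
# PairPlots.pairplot, which takes a table-like object such as the NamedTuples of posterior
# samples built above. A minimal standalone sketch of the same pattern, with made-up
# parameter names and sample counts, would be:
#
#   using CairoMakie, PairPlots
#   samples = (; entrainment_factor = randn(1000), detrainment_factor = randn(1000))
#   fig = pairplot(samples => (PairPlots.Scatter(),))  # lower-triangular scatter corner plot
#   save("corner_example.png", fig)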
-savefig(p, density_filepath) -savefig(trans_p, transformed_density_filepath) +p = pairplot(data => (PairPlots.Scatter(),)) +trans_p = pairplot(transformed_data => (PairPlots.Scatter(),)) +save(density_filepath, p) +save(transformed_density_filepath, trans_p) diff --git a/examples/EDMF_data/uq_for_edmf.jl b/examples/EDMF_data/uq_for_edmf.jl index c9da771d5..7c5a1ac0d 100644 --- a/examples/EDMF_data/uq_for_edmf.jl +++ b/examples/EDMF_data/uq_for_edmf.jl @@ -1,9 +1,10 @@ #include(joinpath(@__DIR__, "..", "ci", "linkfig.jl")) -PLOT_FLAG = true +PLOT_FLAG = false # Import modules using Distributions # probability distributions and associated functions using LinearAlgebra +ENV["GKSwstype"] = "100" using Plots using Random using JLD2 @@ -17,6 +18,7 @@ using CalibrateEmulateSample.MarkovChainMonteCarlo using CalibrateEmulateSample.ParameterDistributions using CalibrateEmulateSample.DataContainers using CalibrateEmulateSample.EnsembleKalmanProcesses +using CalibrateEmulateSample.EnsembleKalmanProcesses.Localizers using CalibrateEmulateSample.Utilities rng_seed = 42424242 @@ -126,6 +128,7 @@ function main() end # load and create prior distributions + #= prior_filepath = joinpath(exp_dir, "prior.jld2") if !isfile(prior_filepath) LoadError("prior file \"prior.jld2\" not found in directory \"" * exp_dir * "/\"") @@ -133,6 +136,47 @@ function main() prior_dict_raw = load(prior_filepath) #using JLD2 prior = prior_dict_raw["prior"] end + =# + # build prior if jld2 does not work + function get_prior_config(s::SS) where {SS <: AbstractString} + config = Dict() + if s == "ent-det-calibration" + config["constraints"] = + Dict("entrainment_factor" => [bounded(0.0, 1.0)], "detrainment_factor" => [bounded(0.0, 1.0)]) + config["prior_mean"] = Dict("entrainment_factor" => 0.13, "detrainment_factor" => 0.51) + config["unconstrained_σ"] = 1.0 + elseif s == "ent-det-tked-tkee-stab-calibration" + config["constraints"] = Dict( + "entrainment_factor" => [bounded(0.0, 1.0)], + "detrainment_factor" => [bounded(0.0, 1.0)], + "tke_ed_coeff" => [bounded(0.01, 1.0)], + "tke_diss_coeff" => [bounded(0.01, 1.0)], + "static_stab_coeff" => [bounded(0.01, 1.0)], + ) + config["prior_mean"] = Dict( + "entrainment_factor" => 0.13, + "detrainment_factor" => 0.51, + "tke_ed_coeff" => 0.14, + "tke_diss_coeff" => 0.22, + "static_stab_coeff" => 0.4, + ) + config["unconstrained_σ"] = 1.0 + else + throw(ArgumentError("prior for experiment $s not found, please implement in uq_for_edmf.jl")) + end + return config + end + prior_config = get_prior_config(exp_name) + means = prior_config["prior_mean"] + std = prior_config["unconstrained_σ"] + constraints = prior_config["constraints"] + + + prior = combine_distributions([ + ParameterDistribution( + Dict("name" => name, "distribution" => Parameterized(Normal(mean, std)), "constraint" => constraints[name]), + ) for (name, mean) in means + ]) # Option (ii) load EKP object # max_ekp_it = 10 # use highest available iteration file @@ -145,7 +189,7 @@ function main() # end # input_output_pairs = Utilities.get_training_points(ekpobj, max_ekp_it) - println("Completed calibration stage") + println("Completed calibration loading stage") println(" ") ############################################## # [3. 
] Build Emulator from calibration data # @@ -153,19 +197,82 @@ function main() println("Begin Emulation stage") # Create GP object - gppackage = Emulators.SKLJL() - pred_type = Emulators.YType() - gauss_proc = GaussianProcess( - gppackage; - kernel = nothing, # use default squared exponential kernel - prediction_type = pred_type, - noise_learn = false, + cases = [ + "GP", # diagonalize, train scalar GP, assume diag inputs + "RF-scalar", # diagonalize, train scalar RF, don't asume diag inputs + "RF-vector-svd-diag", + "RF-vector-svd-nondiag", + "RF-vector-svd-nonsep", + ] + case = cases[5] + + overrides = Dict( + "verbose" => true, + "train_fraction" => 0.95, + "scheduler" => DataMisfitController(terminate_at = 100), + "cov_sample_multiplier" => 0.5, + "n_iteration" => 5, + # "n_ensemble" => 20, + # "localization" => SEC(0.1), # localization / sample error correction for small ensembles ) + nugget = 0.01 + rng_seed = 99330 + rng = Random.MersenneTwister(rng_seed) + input_dim = size(get_inputs(input_output_pairs), 1) + output_dim = size(get_outputs(input_output_pairs), 1) + if case == "GP" + + gppackage = Emulators.SKLJL() + pred_type = Emulators.YType() + mlt = GaussianProcess( + gppackage; + kernel = nothing, # use default squared exponential kernel + prediction_type = pred_type, + noise_learn = false, + ) + elseif case ∈ ["RF-scalar"] + n_features = 100 + kernel_structure = SeparableKernel(CholeskyFactor(nugget), OneDimFactor()) + mlt = ScalarRandomFeatureInterface( + n_features, + input_dim, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + elseif case ∈ ["RF-vector-svd-diag", "RF-vector-svd-nondiag"] + # do we want to assume that the outputs are decorrelated in the machine-learning problem? + kernel_structure = + case ∈ ["RF-vector-svd-diag"] ? 
SeparableKernel(LowRankFactor(1, nugget), DiagonalFactor(nugget)) : + SeparableKernel(LowRankFactor(2, nugget), LowRankFactor(2, nugget)) + n_features = 500 + + mlt = VectorRandomFeatureInterface( + n_features, + input_dim, + output_dim, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + elseif case ∈ ["RF-vector-svd-nonsep"] + kernel_structure = NonseparableKernel(LowRankFactor(3, nugget)) + n_features = 500 + + mlt = VectorRandomFeatureInterface( + n_features, + input_dim, + output_dim, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + end # Fit an emulator to the data normalized = true - emulator = Emulator(gauss_proc, input_output_pairs; obs_noise_cov = truth_cov, normalize_inputs = normalized) + emulator = Emulator(mlt, input_output_pairs; obs_noise_cov = truth_cov, normalize_inputs = normalized) # Optimize the GP hyperparameters for better fit optimize_hyperparameters!(emulator) diff --git a/examples/Emulator/GaussianProcess/plot_GP.jl b/examples/Emulator/GaussianProcess/plot_GP.jl index a0ee9751f..721382ce7 100644 --- a/examples/Emulator/GaussianProcess/plot_GP.jl +++ b/examples/Emulator/GaussianProcess/plot_GP.jl @@ -11,6 +11,7 @@ using CalibrateEmulateSample.DataContainers plot_flag = true if plot_flag + ENV["GKSwstype"] = "100" using Plots gr(size = (1500, 700)) Plots.scalefontsizes(1.3) @@ -78,7 +79,7 @@ gaussian_process = GaussianProcess(gppackage, noise_learn = false) # The observables y are related to the parameters x by: # y = G(x1, x2) + η, # where G(x1, x2) := [sin(x1) + cos(x2), sin(x1) - cos(x2)], and η ~ N(0, Σ) -n = 100 # number of training points +n = 150 # number of training points p = 2 # input dim d = 2 # output dim diff --git a/examples/Emulator/L63/Project.toml b/examples/Emulator/L63/Project.toml new file mode 100644 index 000000000..e2c9baa08 --- /dev/null +++ b/examples/Emulator/L63/Project.toml @@ -0,0 +1,9 @@ +[deps] +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" +CalibrateEmulateSample = "95e48a1f-0bec-4818-9538-3db4340308e3" +ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/examples/Emulator/L63/emulate.jl b/examples/Emulator/L63/emulate.jl new file mode 100644 index 000000000..289314295 --- /dev/null +++ b/examples/Emulator/L63/emulate.jl @@ -0,0 +1,207 @@ +using OrdinaryDiffEq +using Random, Distributions, LinearAlgebra +ENV["GKSwstype"] = "100" +using CairoMakie, ColorSchemes #for plots +using JLD2 + +# CES +using CalibrateEmulateSample.Emulators +using CalibrateEmulateSample.ParameterDistributions +using CalibrateEmulateSample.EnsembleKalmanProcesses +using CalibrateEmulateSample.DataContainers + + +function lorenz(du, u, p, t) + du[1] = 10.0 * (u[2] - u[1]) + du[2] = u[1] * (28.0 - u[3]) - u[2] + du[3] = u[1] * u[2] - (8 / 3) * u[3] +end + +function main() + + # rng + rng = MersenneTwister(1232434) + + # Run L63 from 0 -> tmax + u0 = [1.0; 0.0; 0.0] + tmax = 20 + dt = 0.01 + tspan = (0.0, tmax) + prob = ODEProblem(lorenz, u0, tspan) + sol = solve(prob, Euler(), dt = dt) + + # Run L63 from end for test trajetory data + tmax_test = 100 + tspan_test = (0.0, tmax_test) + u0_test = sol.u[end] + prob_test = ODEProblem(lorenz, u0_test, tspan_test) + sol_test = solve(prob_test, Euler(), dt = dt) 
+ + # Run L63 from end for histogram matching data + tmax_hist = 1000 + tspan_hist = (0.0, tmax_hist) + u0_hist = sol_test.u[end] + prob_hist = ODEProblem(lorenz, u0_hist, tspan_hist) + sol_hist = solve(prob_hist, Euler(), dt = dt) + + # Create training pairs (with noise) from subsampling [burnin,tmax] + tburn = 10 + burnin = Int(floor(10 / dt)) + n_train_pts = 400 #600 + ind = shuffle!(rng, Vector(burnin:(tmax / dt - 1)))[1:n_train_pts] + n_tp = length(ind) + input = zeros(3, n_tp) + output = zeros(3, n_tp) + Γy = 1e-6 * I(3) + noise = rand(rng, MvNormal(zeros(3), Γy), n_tp) + for i in 1:n_tp + input[:, i] = sol.u[i] + output[:, i] = sol.u[i + 1] + noise[:, i] + end + iopairs = PairedDataContainer(input, output) + + # Emulate + cases = ["GP", "RF-scalar", "RF-scalar-diagin", "RF-vector-svd-nonsep"] + + case = cases[4] + decorrelate = true + nugget = Float64(1e-12) + overrides = Dict( + "verbose" => true, + "scheduler" => DataMisfitController(terminate_at = 1e4), + "cov_sample_multiplier" => 2.0, + "n_iteration" => 10, + ) + + # Build ML tools + if case == "GP" + gppackage = Emulators.GPJL() + pred_type = Emulators.YType() + mlt = GaussianProcess( + gppackage; + kernel = nothing, # use default squared exponential kernel + prediction_type = pred_type, + noise_learn = false, + ) + + elseif case ∈ ["RF-scalar", "RF-scalar-diagin"] + n_features = 10 * Int(floor(sqrt(3 * n_tp))) + kernel_structure = + case == "RF-scalar-diagin" ? SeparableKernel(DiagonalFactor(nugget), OneDimFactor()) : + SeparableKernel(LowRankFactor(3, nugget), OneDimFactor()) + mlt = ScalarRandomFeatureInterface( + n_features, + 3, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + elseif case ∈ ["RF-vector-svd-nonsep"] + kernel_structure = NonseparableKernel(LowRankFactor(9, nugget)) + n_features = 500 + + mlt = VectorRandomFeatureInterface( + n_features, + 3, + 3, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + end + + # Emulate + emulator = Emulator(mlt, iopairs; obs_noise_cov = Γy, decorrelate = decorrelate) + optimize_hyperparameters!(emulator) + + + # Predict with emulator + xspan_test = 0.0:dt:tmax_test + u_test = zeros(3, length(xspan_test)) + u_test[:, 1] = sol_test.u[1] + + for i in 1:(length(xspan_test) - 1) + rf_mean, _ = predict(emulator, u_test[:, i:i], transform_to_real = true) # 3x1 matrix + u_test[:, i + 1] = rf_mean + end + train_err = [0.0] + for i in 1:size(input, 2) + train_mean, _ = predict(emulator, input[:, i:i], transform_to_real = true) # 3x1 + train_err[1] += norm(train_mean - output[:, i]) + end + println("normalized L^2 error on training data:", 1 / size(input, 2) * train_err[1]) + + + + # plotting trace + f = Figure(resolution = (900, 450)) + axx = Axis(f[1, 1], xlabel = "time", ylabel = "x") + axy = Axis(f[2, 1], xlabel = "time", ylabel = "y") + axz = Axis(f[3, 1], xlabel = "time", ylabel = "z") + + xx = 0:dt:tmax_test + lines!(axx, xx, u_test[1, :], color = :blue) + lines!(axy, xx, u_test[2, :], color = :blue) + lines!(axz, xx, u_test[3, :], color = :blue) + + # run test data + solplot = zeros(3, length(xspan_test)) + for i in 1:length(xspan_test) + solplot[:, i] = sol_test.u[i] #noiseless + end + JLD2.save("l63_testdata.jld2", "solplot", solplot, "uplot", u_test) + + lines!(axx, xspan_test, solplot[1, :], color = :orange) + lines!(axy, xspan_test, solplot[2, :], color = :orange) + lines!(axz, xspan_test, solplot[3, :], color = :orange) + + current_figure() + # save + save("l63_test.png", f, px_per_unit = 3) + 
save("l63_test.pdf", f, pt_per_unit = 3) + + # plot attractor + f3 = Figure() + lines(f3[1, 1], u_test[1, :], u_test[3, :], color = :blue) + lines(f3[2, 1], solplot[1, :], solplot[3, :], color = :orange) + + # save + save("l63_attr.png", f3, px_per_unit = 3) + save("l63_attr.pdf", f3, pt_per_unit = 3) + + + # Predict with emulator for histogram + xspan_hist = 0.0:dt:tmax_hist + u_hist = zeros(3, length(xspan_hist)) + u_hist[:, 1] = sol_hist.u[1] # start at end of previous sim + + for i in 1:(length(xspan_hist) - 1) + rf_mean, _ = predict(emulator, u_hist[:, i:i], transform_to_real = true) # 3x1 matrix + u_hist[:, i + 1] = rf_mean + end + + solhist = zeros(3, length(xspan_hist)) + for i in 1:length(xspan_hist) + solhist[:, i] = sol_hist.u[i] #noiseless + end + JLD2.save("l63_histdata.jld2", "solhist", solhist, "uhist", u_hist) + + # plotting histograms + f2 = Figure() + hist(f2[1, 1], u_hist[1, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) + hist(f2[1, 2], u_hist[2, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) + hist(f2[1, 3], u_hist[3, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) + + hist!(f2[1, 1], solhist[1, :], bins = 50, normalization = :pdf, color = (:orange, 0.5)) + hist!(f2[1, 2], solhist[2, :], bins = 50, normalization = :pdf, color = (:orange, 0.5)) + hist!(f2[1, 3], solhist[3, :], bins = 50, normalization = :pdf, color = (:orange, 0.5)) + + # save + save("l63_pdf.png", f2, px_per_unit = 3) + save("l63_pdf.pdf", f2, pt_per_unit = 3) + + + +end + +main() diff --git a/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl b/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl index b379ff689..3ac0a64a9 100644 --- a/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl +++ b/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl @@ -10,17 +10,18 @@ using LinearAlgebra using CalibrateEmulateSample.Emulators using CalibrateEmulateSample.DataContainers using CalibrateEmulateSample.ParameterDistributions +using CalibrateEmulateSample.EnsembleKalmanProcesses + case = "scalar" println("running case $case") - -diagonalize_input = true - +kernel_structure = SeparableKernel(LowRankFactor(2, 1e-8), OneDimFactor()) # input and output(1D) cov structure in sep kernel plot_flag = true if plot_flag + ENV["GKSwstype"] = "100" using Plots gr(size = (1500, 700)) - Plots.scalefontsizes(1.3) + # Plots.scalefontsizes(1.3) # scales on recursive calls to include.. 
font = Plots.font("Helvetica", 18) fontdict = Dict(:guidefont => font, :xtickfont => font, :ytickfont => font, :legendfont => font) @@ -33,233 +34,247 @@ function meshgrid(vx::AbstractVector{T}, vy::AbstractVector{T}) where {T} return gx, gy end -rng_seed = 41 -Random.seed!(rng_seed) -output_directory = joinpath(@__DIR__, "output") -if !isdir(output_directory) - mkdir(output_directory) -end -#problem -n = 100 # number of training points -p = 2 # input dim -d = 2 # output dim -X = 2.0 * π * rand(p, n) -# G(x1, x2) -g1x = sin.(X[1, :]) .+ cos.(X[2, :]) -g2x = sin.(X[1, :]) .- cos.(X[2, :]) -gx = zeros(2, n) -gx[1, :] = g1x -gx[2, :] = g2x - -# Add noise η -μ = zeros(d) -Σ = 0.1 * [[0.8, 0.1] [0.1, 0.5]] # d x d -noise_samples = rand(MvNormal(μ, Σ), n) -# y = G(x) + η -Y = gx .+ noise_samples - -iopairs = PairedDataContainer(X, Y, data_are_columns = true) -@assert get_inputs(iopairs) == X -@assert get_outputs(iopairs) == Y - -#plot training data with and without noise -if plot_flag - p1 = plot( - X[1, :], - X[2, :], - g1x, - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zguidefontrotation = 90, - ) - - figpath = joinpath(output_directory, "RF_observed_y1nonoise.png") - savefig(figpath) - - p2 = plot( - X[1, :], - X[2, :], - g2x, - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zguidefontrotation = 90, - ) - figpath = joinpath(output_directory, "RF_observed_y2nonoise.png") - savefig(figpath) - - p3 = plot( - X[1, :], - X[2, :], - Y[1, :], - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zguidefontrotation = 90, - ) - figpath = joinpath(output_directory, "RF_observed_y1.png") - savefig(figpath) - - p4 = plot( - X[1, :], - X[2, :], - Y[2, :], - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zguidefontrotation = 90, - ) - figpath = joinpath(output_directory, "RF_observed_y2.png") - savefig(figpath) +function main() -end + rng_seed = 41 + Random.seed!(rng_seed) + output_directory = joinpath(@__DIR__, "output") + if !isdir(output_directory) + mkdir(output_directory) + end -# setup random features -n_features = 180 -optimizer_options = Dict("n_iteration" => 20, "prior_in_scale" => 0.1, "verbose" => true) #"Max" iterations (may do less) -srfi = ScalarRandomFeatureInterface( - n_features, - p, - diagonalize_input = diagonalize_input, - optimizer_options = optimizer_options, -) -emulator = Emulator(srfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) -println("build RF with $n training points and $(n_features) random features.") -optimize_hyperparameters!(emulator) # although RF already optimized + #problem + n = 150 # number of training points + p = 2 # input dim + d = 2 # output dim + X = 2.0 * π * rand(p, n) + # G(x1, x2) + g1x = sin.(X[1, :]) .+ cos.(X[2, :]) + g2x = sin.(X[1, :]) .- cos.(X[2, :]) + gx = zeros(2, n) + gx[1, :] = g1x + gx[2, :] = g2x -# Plot mean and variance of the predicted observables y1 and y2 -# For this, we generate test points on a x1-x2 grid. 
-n_pts = 200 -x1 = range(0.0, stop = 4.0 / 5.0 * 2 * π, length = n_pts) -x2 = range(0.0, stop = 4.0 / 5.0 * 2 * π, length = n_pts) -X1, X2 = meshgrid(x1, x2) -# Input for predict has to be of size N_samples x input_dim -inputs = permutedims(hcat(X1[:], X2[:]), (2, 1)) + # Add noise η + μ = zeros(d) + Σ = 0.1 * [[0.8, 0.1] [0.1, 0.5]] # d x d + noise_samples = rand(MvNormal(μ, Σ), n) + # y = G(x) + η + Y = gx .+ noise_samples -rf_mean, rf_cov = predict(emulator, inputs, transform_to_real = true) -println("end predictions at ", n_pts * n_pts, " points") + iopairs = PairedDataContainer(X, Y, data_are_columns = true) + @assert get_inputs(iopairs) == X + @assert get_outputs(iopairs) == Y + #plot training data with and without noise + if plot_flag + p1 = plot( + X[1, :], + X[2, :], + g1x, + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zguidefontrotation = 90, + ) -#plot predictions -for y_i in 1:d - rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) - rf_var = permutedims(vcat([x' for x in rf_var_temp]...), (2, 1)) # 2 x 40000 + figpath = joinpath(output_directory, "RF_observed_y1nonoise.png") + savefig(figpath) - mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) # 2 x 40000 - if plot_flag - p5 = plot( - x1, - x2, - mean_grid, + p2 = plot( + X[1, :], + X[2, :], + g2x, st = :surface, camera = (30, 60), c = :cividis, xlabel = "x1", ylabel = "x2", - zlabel = "mean of y" * string(y_i), zguidefontrotation = 90, ) - end - var_grid = reshape(rf_var[y_i, :], n_pts, n_pts) - if plot_flag - p6 = plot( - x1, - x2, - var_grid, + figpath = joinpath(output_directory, "RF_observed_y2nonoise.png") + savefig(figpath) + + p3 = plot( + X[1, :], + X[2, :], + Y[1, :], st = :surface, camera = (30, 60), c = :cividis, xlabel = "x1", ylabel = "x2", - zlabel = "var of y" * string(y_i), zguidefontrotation = 90, ) + figpath = joinpath(output_directory, "RF_observed_y1.png") + savefig(figpath) - plot(p5, p6, layout = (1, 2), legend = false) + p4 = plot( + X[1, :], + X[2, :], + Y[2, :], + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zguidefontrotation = 90, + ) + figpath = joinpath(output_directory, "RF_observed_y2.png") + savefig(figpath) - savefig(joinpath(output_directory, "RF_" * case * "_y" * string(y_i) * "_predictions.png")) end -end - -# Plot the true components of G(x1, x2) -g1_true = sin.(inputs[1, :]) .+ cos.(inputs[2, :]) -g1_true_grid = reshape(g1_true, n_pts, n_pts) -if plot_flag - p7 = plot( - x1, - x2, - g1_true_grid, - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zlabel = "sin(x1) + cos(x2)", - zguidefontrotation = 90, - ) - savefig(joinpath(output_directory, "RF_" * case * "_true_g1.png")) -end -g2_true = sin.(inputs[1, :]) .- cos.(inputs[2, :]) -g2_true_grid = reshape(g2_true, n_pts, n_pts) -if plot_flag - p8 = plot( - x1, - x2, - g2_true_grid, - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zlabel = "sin(x1) - cos(x2)", - zguidefontrotation = 90, + # setup random features + n_features = 400 + optimizer_options = Dict( + "n_iteration" => 20, + "n_ensemble" => 20, + "verbose" => true, + "scheduler" => DataMisfitController(terminate_at = 100.0), + ) #"Max" iterations (may do less) + + + srfi = ScalarRandomFeatureInterface( + n_features, + p, + kernel_structure = kernel_structure, + optimizer_options = optimizer_options, ) - g_true_grids = [g1_true_grid, g2_true_grid] - - savefig(joinpath(output_directory, "RF_" * case * 
"_true_g2.png")) - -end - -# Plot the difference between the truth and the mean of the predictions -for y_i in 1:d - - # Reshape rf_cov to size N_samples x output_dim - rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) - rf_var = permutedims(vcat([x' for x in rf_var_temp]...), (2, 1)) # 40000 x 2 - - mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) - var_grid = reshape(rf_var[y_i, :], n_pts, n_pts) - # Compute and plot 1/variance * (truth - prediction)^2 + emulator = Emulator(srfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) + println("build RF with $n training points and $(n_features) random features.") + + optimize_hyperparameters!(emulator) # although RF already optimized + + # Plot mean and variance of the predicted observables y1 and y2 + # For this, we generate test points on a x1-x2 grid. + n_pts = 200 + x1 = range(0.0, stop = 4.0 / 5.0 * 2 * π, length = n_pts) + x2 = range(0.0, stop = 4.0 / 5.0 * 2 * π, length = n_pts) + X1, X2 = meshgrid(x1, x2) + # Input for predict has to be of size N_samples x input_dim + inputs = permutedims(hcat(X1[:], X2[:]), (2, 1)) + + rf_mean, rf_cov = predict(emulator, inputs, transform_to_real = true) + println("end predictions at ", n_pts * n_pts, " points") + + + #plot predictions + for y_i in 1:d + rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) + rf_var = permutedims(reduce(vcat, [x' for x in rf_var_temp]), (2, 1)) # 2 x 40000 + + mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) # 2 x 40000 + if plot_flag + p5 = plot( + x1, + x2, + mean_grid, + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zlabel = "mean of y" * string(y_i), + zguidefontrotation = 90, + ) + end + var_grid = reshape(rf_var[y_i, :], n_pts, n_pts) + if plot_flag + p6 = plot( + x1, + x2, + var_grid, + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zlabel = "var of y" * string(y_i), + zguidefontrotation = 90, + ) + + plot(p5, p6, layout = (1, 2), legend = false) + + savefig(joinpath(output_directory, "RF_" * case * "_y" * string(y_i) * "_predictions.png")) + end + end + # Plot the true components of G(x1, x2) + g1_true = sin.(inputs[1, :]) .+ cos.(inputs[2, :]) + g1_true_grid = reshape(g1_true, n_pts, n_pts) if plot_flag - zlabel = "1/var * (true_y" * string(y_i) * " - predicted_y" * string(y_i) * ")^2" + p7 = plot( + x1, + x2, + g1_true_grid, + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zlabel = "sin(x1) + cos(x2)", + zguidefontrotation = 90, + ) + savefig(joinpath(output_directory, "RF_" * case * "_true_g1.png")) + end - p9 = plot( + g2_true = sin.(inputs[1, :]) .- cos.(inputs[2, :]) + g2_true_grid = reshape(g2_true, n_pts, n_pts) + if plot_flag + p8 = plot( x1, x2, - sqrt.(1.0 ./ var_grid .* (g_true_grids[y_i] .- mean_grid) .^ 2), + g2_true_grid, st = :surface, camera = (30, 60), - c = :magma, - zlabel = zlabel, + c = :cividis, xlabel = "x1", ylabel = "x2", + zlabel = "sin(x1) - cos(x2)", zguidefontrotation = 90, ) + g_true_grids = [g1_true_grid, g2_true_grid] - savefig(joinpath(output_directory, "RF_" * case * "_y" * string(y_i) * "_difference_truth_prediction.png")) + savefig(joinpath(output_directory, "RF_" * case * "_true_g2.png")) + + end + + # Plot the difference between the truth and the mean of the predictions + for y_i in 1:d + + # Reshape rf_cov to size N_samples x output_dim + rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) + rf_var = permutedims(vcat([x' for x in 
rf_var_temp]...), (2, 1)) # 40000 x 2 + + mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) + var_grid = reshape(rf_var[y_i, :], n_pts, n_pts) + # Compute and plot 1/variance * (truth - prediction)^2 + + if plot_flag + zlabel = "1/var * (true_y" * string(y_i) * " - predicted_y" * string(y_i) * ")^2" + + p9 = plot( + x1, + x2, + sqrt.(1.0 ./ var_grid .* (g_true_grids[y_i] .- mean_grid) .^ 2), + st = :surface, + camera = (30, 60), + c = :magma, + zlabel = zlabel, + xlabel = "x1", + ylabel = "x2", + zguidefontrotation = 90, + ) + + savefig(joinpath(output_directory, "RF_" * case * "_y" * string(y_i) * "_difference_truth_prediction.png")) + end end end + +main() diff --git a/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl b/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl index 08d045c44..12cbc17a4 100644 --- a/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl +++ b/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl @@ -10,11 +10,13 @@ using LinearAlgebra using CalibrateEmulateSample.Emulators using CalibrateEmulateSample.DataContainers using CalibrateEmulateSample.ParameterDistributions +using CalibrateEmulateSample.EnsembleKalmanProcesses plot_flag = true if plot_flag + ENV["GKSwstype"] = "100" using Plots gr(size = (1500, 700)) - Plots.scalefontsizes(1.3) + # Plots.scalefontsizes(1.3) font = Plots.font("Helvetica", 18) fontdict = Dict(:guidefont => font, :xtickfont => font, :ytickfont => font, :legendfont => font) @@ -38,8 +40,9 @@ function main() mkdir(output_directory) end - cases = ["svd-diag", "svd-nondiag", "nosvd-diag", "nosvd-nondiag"] - case_mask = 1:4 # which cases to do + cases = ["svd-diag", "svd-nondiag", "nosvd-diag", "nosvd-nondiag", "svd-nonsep", "nosvd-nonsep"] + case_mask = 1:6 # which cases to do + nugget = 1e-12 #problem n = 150 # number of training points @@ -128,43 +131,67 @@ function main() for case in cases[case_mask] println("running case $case") - - - # setup random features - n_features = 200 - if case ∈ ["svd-diag", "svd-nondiag"] - optimizer_options = - Dict("n_iteration" => 20, "prior_in_scale" => 0.01, "prior_out_scale" => 1.0, "verbose" => true) #"Max" iterations (may do less) - else # without svd - optimizer_options = - Dict("n_iteration" => 20, "prior_in_scale" => 0.01, "prior_out_scale" => 0.01, "verbose" => true) #"Max" iterations (may do less) - end + n_features = 400 + optimizer_options = + Dict("n_iteration" => 5, "verbose" => true, "scheduler" => DataMisfitController(on_terminate = "continue")) #"Max" iterations (may do less) if case == "svd-diag" vrfi = VectorRandomFeatureInterface( n_features, p, d, - diagonalize_output = true, + kernel_structure = SeparableKernel(LowRankFactor(2, nugget), DiagonalFactor()), optimizer_options = optimizer_options, ) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) elseif case == "svd-nondiag" - vrfi = VectorRandomFeatureInterface(n_features, p, d, optimizer_options = optimizer_options) + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + kernel_structure = SeparableKernel(LowRankFactor(2, nugget), LowRankFactor(2, nugget)), + optimizer_options = optimizer_options, + ) + emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) + elseif case == "svd-nonsep" + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + kernel_structure = NonseparableKernel(LowRankFactor(4, nugget)), + optimizer_options = optimizer_options, + ) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = 
true) + elseif case == "nosvd-diag" vrfi = VectorRandomFeatureInterface( n_features, p, d, - diagonalize_output = true, + kernel_structure = SeparableKernel(LowRankFactor(2, nugget), DiagonalFactor()), # roughly normalize output by noise optimizer_options = optimizer_options, ) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true, decorrelate = false) elseif case == "nosvd-nondiag" - vrfi = VectorRandomFeatureInterface(n_features, p, d, optimizer_options = optimizer_options) + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + kernel_structure = SeparableKernel(LowRankFactor(2, nugget), LowRankFactor(2, nugget)), # roughly normalize output by noise + optimizer_options = optimizer_options, + ) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true, decorrelate = false) + elseif case == "nosvd-nonsep" + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + kernel_structure = NonseparableKernel(LowRankFactor(4, nugget)), + optimizer_options = optimizer_options, + ) + emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true, decorrelate = false) + end println("build RF with $n training points and $(n_features) random features.") @@ -187,7 +214,7 @@ function main() for y_i in 1:d rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) - rf_var = permutedims(vcat([x' for x in rf_var_temp]...), (2, 1)) # 2 x 40000 + rf_var = permutedims(reduce(vcat, [x' for x in rf_var_temp]), (2, 1)) # 2 x 40000 mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) # 2 x 40000 if plot_flag diff --git a/examples/GCM/Project.toml b/examples/GCM/Project.toml index aba51ad3c..73fa40b73 100644 --- a/examples/GCM/Project.toml +++ b/examples/GCM/Project.toml @@ -10,6 +10,7 @@ PyPlot = "d330b81b-6aea-500a-939a-2ce795aea3ee" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd" [compat] FiniteDiff = "~2.10" diff --git a/examples/GCM/emulate_sample_script.jl b/examples/GCM/emulate_sample_script.jl index 0c8607554..4c0f9ca9c 100644 --- a/examples/GCM/emulate_sample_script.jl +++ b/examples/GCM/emulate_sample_script.jl @@ -1,38 +1,37 @@ # Script to run Emulation and Sampling on data from GCM # Import modules -using Distributions # probability distributions and associated functions +using Distributions using LinearAlgebra +ENV["GKSwstype"] = "100" using Plots +using Plots.PlotMeasures # for mm +using StatsPlots using Random using JLD2 # CES using CalibrateEmulateSample.Emulators using CalibrateEmulateSample.MarkovChainMonteCarlo using CalibrateEmulateSample.ParameterDistributions +using CalibrateEmulateSample.EnsembleKalmanProcesses using CalibrateEmulateSample.DataContainers +using CalibrateEmulateSample.EnsembleKalmanProcesses.Localizers -@time begin +function main() rng_seed = 2413798 Random.seed!(rng_seed) - # expname = "vrf-nondiag-logdet_newprior" - #emulator_type ∈ ["GPR","ScalarRFR","VectorRFR-svd-diag","VectorRFR-svd-nondiag", "VectorRFR-nondiag"] - # emulator_type = "GPR" - # expname = "gpr" - - # emulator_type = "ScalarRFR" - # expname = "srf" + # CHOOSE YOUR CASE + case = 6 - # emulator_type = "VectorRFR-svd-diag" - # expname = "vrf-svd-diag" + emulator_types = + ["GPR", "ScalarRFR", "VectorRFR-svd-diag", "VectorRFR-svd-nondiag", "VectorRFR-nondiag", "VectorRFR-svd-nonsep"] - # emulator_type = "VectorRFR-svd-nondiag" - # expname = "vrf-svd-nondiag" + expnames = 
["gpr", "srf", "vrf-svd-diag", "vrf-svd-nondiag", "vrf-nondiag_standardized", "vrf-svd-nonsep"] - emulator_type = "VectorRFR-nondiag" - expname = "vrf-nondiag_standardized" + emulator_type = emulator_types[case] + expname = expnames[case] # Output figure save directory example_directory = @__DIR__ @@ -54,11 +53,11 @@ using CalibrateEmulateSample.DataContainers obs_noise_cov = load(datafile)["obs_noise_cov"] # 96 x 96 #take only first 400 points - iter_mask = 1:4 - #data_mask = 1:32 + iter_mask = [1, 2, 4] + data_mask = 1:96 # data_mask= 33:64 # data_mask= 65:96 - data_mask = 1:96 + #data_mask = 1:96 #data_mask = [5*i for i = 1:Int(floor(96/5))] inputs = inputs[:, :, iter_mask] @@ -66,8 +65,6 @@ using CalibrateEmulateSample.DataContainers obs_noise_cov = obs_noise_cov[data_mask, data_mask] truth = truth[data_mask] - # priorfile = "priors.jld2" - # prior = load(priorfile)["prior"] # derived quantities N_ens, input_dim, N_iter = size(inputs) @@ -79,7 +76,19 @@ using CalibrateEmulateSample.DataContainers normalized = true # setup random features - eki_options_override = Dict("tikhonov" => 0, "multithread" => "ensemble") #faster than tullio multithread for training + eki_options_override = Dict( + "verbose" => true, + "tikhonov" => 0.0, + "scheduler" => DataMisfitController(on_terminate = "continue"), + "n_iteration" => 10, + "multithread" => "ensemble", + "train_fraction" => 0.95, + "inflation" => 0.0, + "cov_sample_multiplier" => 0.5, + "n_ensemble" => 50, + "localization" => SEC(1.0), + ) + if emulator_type == "VectorRFR-svd-nondiag" || emulator_type == "VectorRFR-nondiag" @@ -89,23 +98,46 @@ using CalibrateEmulateSample.DataContainers println("Running Vector RF model - without SVD and assuming non-diagonal variance ") end - n_features = 80 * Int(floor(5 * sqrt(N_ens * N_iter))) + n_features = 20 * Int(floor(5 * sqrt(N_ens * N_iter))) #80 * println("build RF with $(N_ens*N_iter) training points and $(n_features) random features.") + kernel_structure = SeparableKernel(LowRankFactor(2), LowRankFactor(3)) + mlt = VectorRandomFeatureInterface( + n_features, + input_dim, + output_dim, + kernel_structure = kernel_structure, + optimizer_options = eki_options_override, + ) + elseif emulator_type == "VectorRFR-svd-nonsep" || emulator_type == "VectorRFR-nonsep" + if emulator_type == "VectorRFR-svd-nondiag" + println("Running Vector RF model with nonseparable kernel - using SVD") + elseif emulator_type == "VectorRFR-nonsep" + println("Running Vector RF model with nonseparable kernel - without SVD") + end - mlt = VectorRandomFeatureInterface(n_features, input_dim, output_dim, optimizer_options = eki_options_override) + n_features = 20 * Int(floor(5 * sqrt(N_ens * N_iter))) #80 * + println("build RF with $(N_ens*N_iter) training points and $(n_features) random features.") + kernel_structure = NonseparableKernel(LowRankFactor(5)) + mlt = VectorRandomFeatureInterface( + n_features, + input_dim, + output_dim, + kernel_structure = kernel_structure, + optimizer_options = eki_options_override, + ) elseif emulator_type == "VectorRFR-svd-diag" println("Running Vector RF model - using SVD and assuming diagonal variance") - n_features = 20 * Int(floor(5 * sqrt(N_ens * N_iter))) + n_features = 5 * Int(floor(5 * sqrt(N_ens * N_iter))) #20 * println("build RF with $(N_ens*N_iter) training points and $(n_features) random features.") - + kernel_structure = SeparableKernel(LowRankFactor(2), DiagonalFactor(1e-8)) mlt = VectorRandomFeatureInterface( n_features, input_dim, output_dim, - diagonalize_output = true, + 
kernel_structure = kernel_structure, optimizer_options = eki_options_override, ) @@ -231,8 +263,75 @@ using CalibrateEmulateSample.DataContainers end end - end + end # plots + + ### MCMC + + priorfile = "priors.jld2" + prior = load(priorfile)["prior"] + + ## + ### Sample: Markov Chain Monte Carlo + ### + # initial values + u0 = vec(mean(get_inputs(input_output_pairs), dims = 2)) + println("initial parameters: ", u0) + + # First let's run a short chain to determine a good step size + mcmc = MCMCWrapper(RWMHSampling(), truth, prior, emulator; init_params = u0) + new_step = optimize_stepsize(mcmc; init_stepsize = 0.1, N = 2000, discard_initial = 0) + + # Now begin the actual MCMC + println("Begin MCMC - with step size ", new_step) + chain = MarkovChainMonteCarlo.sample(mcmc, 100_000; stepsize = new_step, discard_initial = 2_000) + posterior = MarkovChainMonteCarlo.get_posterior(mcmc, chain) + + post_mean = mean(posterior) + post_cov = cov(posterior) + println("post_mean") + println(post_mean) + println("post_cov") + println(post_cov) + println("D util") + println(det(inv(post_cov))) + println(" ") + # plot posterior + truth_params = [log(0.7 / 0.3) log(7200)]' # parameter value (at truth) - unconstrained + + # Save data + save( + joinpath(data_save_directory, expname * "posterior.jld2"), + "posterior", + posterior, + "input_output_pairs", + input_output_pairs, + "truth_params", + truth_params, + ) + + + constrained_truth_params = transform_unconstrained_to_constrained(posterior, truth_params) + param_names = get_name(posterior) + + posterior_samples = vcat([get_distribution(posterior)[name] for name in param_names]...) #samples are columns + constrained_posterior_samples = + mapslices(x -> transform_unconstrained_to_constrained(posterior, x), posterior_samples, dims = 1) + + gr(dpi = 300, size = (300, 300)) + p = cornerplot(permutedims(constrained_posterior_samples, (2, 1)), label = param_names, compact = true) + plot!(p.subplots[1], [constrained_truth_params[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # vline on top histogram + plot!(p.subplots[3], [constrained_truth_params[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) # hline on right histogram + plot!(p.subplots[2], [constrained_truth_params[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # v & h line on scatter. + plot!(p.subplots[2], [constrained_truth_params[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) + figpath = joinpath(figure_save_directory, "plot_posterior_" * expname * ".png") + savefig(figpath) + + +end # main + +@time begin + main() end # for @time diff --git a/examples/GCM/sbatch_emulate_sample_jl b/examples/GCM/sbatch_emulate_sample_jl index efbdb391c..8807f0e9a 100644 --- a/examples/GCM/sbatch_emulate_sample_jl +++ b/examples/GCM/sbatch_emulate_sample_jl @@ -1,6 +1,6 @@ #!/bin/bash #Submit this script with: sbatch thefilename -#SBATCH --time=6:00:00 # walltime +#SBATCH --time=12:00:00 # walltime #SBATCH --ntasks-per-node=1 # number of processor cores (i.e. 
tasks) #SBATCH --cpus-per-task=28 #SBATCH --mem-per-cpu=6000 diff --git a/examples/Lorenz/GModel_common.jl b/examples/Lorenz/GModel_common.jl index d3a99de5c..1954cae29 100644 --- a/examples/Lorenz/GModel_common.jl +++ b/examples/Lorenz/GModel_common.jl @@ -128,7 +128,8 @@ end # tend: End of simulation Float64(1), nstep: function lorenz_solve(settings::LSettings, params::LParams) # Initialize - nstep = Int32(ceil((settings.tend - settings.t_start) / settings.dt)) + # nstep = Int32(ceil((settings.tend - settings.t_start) / settings.dt)) + nstep = Int32(ceil(settings.tend / settings.dt)) xn = zeros(Float64, settings.N, nstep) t = zeros(Float64, nstep) # Initial perturbation @@ -137,7 +138,7 @@ function lorenz_solve(settings::LSettings, params::LParams) Xbuffer = zeros(4, settings.N) # March forward in time for j in 1:nstep - t[j] = settings.t_start + settings.dt * j + t[j] = settings.dt * j #use view to update a slice # RK4! modifies first and last arguments if settings.dynamics == 1 diff --git a/examples/Lorenz/calibrate.jl b/examples/Lorenz/calibrate.jl index a799899a0..17e1b1242 100644 --- a/examples/Lorenz/calibrate.jl +++ b/examples/Lorenz/calibrate.jl @@ -15,6 +15,34 @@ using CalibrateEmulateSample const EKP = CalibrateEmulateSample.EnsembleKalmanProcesses const PD = EKP.ParameterDistributions + +function shrinkage_cov(sample_mat::AA) where {AA <: AbstractMatrix} + n_out, n_sample = size(sample_mat) + + # de-mean (as we will use the samples directly for calculation of β) + sample_mat_zeromean = sample_mat .- mean(sample_mat, dims = 2) + # Ledoit Wolf shrinkage to I + + # get sample covariance + Γ = cov(sample_mat_zeromean, dims = 2) + # estimate opt shrinkage + μ_shrink = 1 / n_out * tr(Γ) + δ_shrink = norm(Γ - μ_shrink * I)^2 / n_out # (scaled) frob norm of Γ_m + #once de-meaning, we need to correct the sample covariance with an n_sample -> n_sample-1 + β_shrink = sum([norm(c * c' - -Γ)^2 / n_out for c in eachcol(sample_mat_zeromean)]) / (n_sample - 1)^2 + + γ_shrink = min(β_shrink / δ_shrink, 1) # clipping is typically rare + # γμI + (1-γ)Γ + Γ .*= (1 - γ_shrink) + for i in 1:n_out + Γ[i, i] += γ_shrink * μ_shrink + end + + @info "Shrinkage scale: $(γ_shrink), (0 = none, 1 = revert to scaled Identity)\n shrinkage covariance condition number: $(cond(Γ))" + return Γ +end + + function main() rng_seed = 4137 @@ -63,10 +91,6 @@ function main() n_param = length(param_names) params_true = reshape(params_true, (n_param, 1)) - println(n_param) - println(params_true) - - ### ### Define the parameter priors ### @@ -97,7 +121,7 @@ function main() N = 36 dt = 1 / 64.0 # Start of integration - t_start = 800.0 + t_start = 100.0#800.0 # We now rescale all the timings etc, to be in nondim-time # Data collection length if dynamics == 1 @@ -110,7 +134,7 @@ function main() # Integration length Tfit = Ts_days / τc # Initial perturbation - Fp = rand(Normal(0.0, 0.01), N) + Fp = rand(rng, Normal(0.0, 0.01), N) kmax = 1 # Prescribe variance or use a number of forward passes to define true interval variability var_prescribe = false @@ -148,40 +172,61 @@ function main() μ = zeros(length(gt)) # Add noise for i in 1:n_samples - yt[:, i] = gt .+ rand(MvNormal(μ, Γy)) + yt[:, i] = gt .+ rand(rng, MvNormal(μ, Γy)) end - println(Γy) else println("Using truth values to compute covariance") n_samples = 100 - nthreads = Threads.nthreads() - yt = zeros(nthreads, length(gt), n_samples) - Threads.@threads for i in 1:n_samples - - #= - lorenz_settings_local = GModel.LSettings( - dynamics, - stats_type, - t_start + T * (i 
- 1), - T, - Ts, - Tfit, - Fp, - N, - dt, - t_start + T * (i - 1) + T, - kmax, - ω_fixed, - ω_true, - )=# - lorenz_settings_local = lorenz_settings - tid = Threads.threadid() - yt[tid, :, i] = GModel.run_G_ensemble(params_true, lorenz_settings_local) + yt = zeros(length(gt), n_samples) + for i in 1:n_samples + + # Fp changes IC, then run from 0 to t_start + T, + # then gathering stats over [t_start, t_start+T] + Fp = rand(rng, Normal(0.0, 0.01), N) + lorenz_settings_local = GModel.LSettings( + dynamics, + stats_type, + t_start, # data collection start start time + T, #data collection length + Ts, # integration_length + Tfit, # number of Ts to fit over + Fp, + N, + dt, + t_start + T, #sim end time + kmax, + ω_fixed, + ω_true, + ) + + #= lorenz_settings_local = GModel.LSettings( + dynamics, + stats_type, + t_start + T * (i - 1), #sim start time + T, #data collection length + Ts, # integration_length + Tfit, # number of Ts to fit over + Fp, + N, + dt, + t_start + T * (i - 1) + T , #sim end time + kmax, + ω_fixed, + ω_true, + )=# + yt[:, i] = GModel.run_G_ensemble(params_true, lorenz_settings_local) + end - yt = dropdims(sum(yt, dims = 1), dims = 1) + gr(dpi = 300, size = (400, 400)) + KK = plot(yt[1:end, 1:100], legend = false) + savefig(KK, joinpath(figure_save_directory, "data_samples.png")) + + # [1.] use shrinkage for regularization + Γy = shrinkage_cov(yt) - # add a little extra variance for regularization here + #= + # [2.] add fixed variance for regularization noise_sd = 0.1 Γy_diag = noise_sd .^ 2 * convert(Array, I(size(yt, 1))) μ = zeros(size(yt, 1)) @@ -192,8 +237,8 @@ function main() #Γy = convert(Array, Diagonal(dropdims(mean((yt.-mean(yt,dims=1)).^2,dims=1),dims=1))) # Covariance of truth data Γy = cov(yt, dims = 2) + =# - println(Γy) end diff --git a/examples/Lorenz/emulate_sample.jl b/examples/Lorenz/emulate_sample.jl index 7ea16c67c..a42cd5c5a 100644 --- a/examples/Lorenz/emulate_sample.jl +++ b/examples/Lorenz/emulate_sample.jl @@ -4,6 +4,7 @@ include(joinpath(@__DIR__, "..", "ci", "linkfig.jl")) # Import modules using Distributions # probability distributions and associated functions using LinearAlgebra +ENV["GKSwstype"] = "100" using StatsPlots using Plots using Random @@ -43,25 +44,22 @@ function main() "RF-vector-svd-nondiag", "RF-vector-nosvd-diag", "RF-vector-nosvd-nondiag", + "RF-vector-svd-nonsep", ] #### CHOOSE YOUR CASE: - mask = 2:7 - # One day we can use good heuristics here - # Currently set so that learnt hyperparameters stays relatively far inside the prior. (use "verbose" => true in optimizer overrides to see this info) - prior_scalings = [[0.0, 0.0], [1e-2, 0.0], [1e-1, 0.0], [1e-2, 1e-2], [1e-2, 1e-1], [1e-2, 1e-2], [1e-2, 1e-2]] - for (case, scaling) in zip(cases[mask], prior_scalings[mask]) + mask = 2:7 # e.g. 
1:8 or [7] + for (case) in cases[mask] + - #case = cases[7] println("case: ", case) - max_iter = 6 # number of EKP iterations to use data from is at most this + min_iter = 1 + max_iter = 5 # number of EKP iterations to use data from is at most this overrides = Dict( "verbose" => true, - "train_fraction" => 0.8, - "n_iteration" => 40, - "scheduler" => DataMisfitController(), - "prior_in_scale" => scaling[1], - "prior_out_scale" => scaling[2], + "scheduler" => DataMisfitController(terminate_at = 100.0), + "cov_sample_multiplier" => 1.0, + "n_iteration" => 20, ) # we do not want termination, as our priors have relatively little interpretation @@ -93,12 +91,12 @@ function main() ekiobj = load(data_save_file)["eki"] priors = load(data_save_file)["priors"] - truth_sample = load(data_save_file)["truth_sample"] truth_sample_mean = load(data_save_file)["truth_sample_mean"] + truth_sample = load(data_save_file)["truth_sample"] truth_params_constrained = load(data_save_file)["truth_input_constrained"] #true parameters in constrained space truth_params = transform_constrained_to_unconstrained(priors, truth_params_constrained) Γy = ekiobj.obs_noise_cov - println(Γy) + n_params = length(truth_params) # "input dim" output_dim = size(Γy, 1) @@ -108,6 +106,7 @@ function main() # Emulate-sample settings # choice of machine-learning tool + nugget = 0.001 if case == "GP" gppackage = Emulators.GPJL() pred_type = Emulators.YType() @@ -118,26 +117,43 @@ function main() noise_learn = false, ) elseif case ∈ ["RF-scalar", "RF-scalar-diagin"] - n_features = 300 - diagonalize_input = case == "RF-scalar-diagin" ? true : false + n_features = 100 + kernel_structure = + case == "RF-scalar-diagin" ? SeparableKernel(DiagonalFactor(nugget), OneDimFactor()) : + SeparableKernel(LowRankFactor(2, nugget), OneDimFactor()) mlt = ScalarRandomFeatureInterface( n_features, n_params, rng = rng, - diagonalize_input = diagonalize_input, + kernel_structure = kernel_structure, optimizer_options = overrides, ) elseif case ∈ ["RF-vector-svd-diag", "RF-vector-nosvd-diag", "RF-vector-svd-nondiag", "RF-vector-nosvd-nondiag"] # do we want to assume that the outputs are decorrelated in the machine-learning problem? - diagonalize_output = case ∈ ["RF-vector-svd-diag", "RF-vector-nosvd-diag"] ? true : false - n_features = 300 + kernel_structure = + case ∈ ["RF-vector-svd-diag", "RF-vector-nosvd-diag"] ? + SeparableKernel(LowRankFactor(2, nugget), DiagonalFactor(nugget)) : + SeparableKernel(LowRankFactor(2, nugget), LowRankFactor(3, nugget)) + n_features = 500 mlt = VectorRandomFeatureInterface( n_features, n_params, output_dim, rng = rng, - diagonalize_output = diagonalize_output, # assume outputs are decorrelated + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + elseif case ∈ ["RF-vector-svd-nonsep"] + kernel_structure = NonseparableKernel(LowRankFactor(3, nugget)) + n_features = 500 + + mlt = VectorRandomFeatureInterface( + n_features, + n_params, + output_dim, + rng = rng, + kernel_structure = kernel_structure, optimizer_options = overrides, ) end @@ -146,21 +162,41 @@ function main() # Use median over all data since all data are the same type truth_sample_norm = vcat(truth_sample...) norm_factor = get_standardizing_factors(truth_sample_norm) - println(size(norm_factor)) #norm_factor = vcat(norm_factor...) 
norm_factor = fill(norm_factor, size(truth_sample)) - println("Standardization factors") - println(norm_factor) # Get training points from the EKP iteration number in the second input term N_iter = min(max_iter, length(get_u(ekiobj)) - 1) # number of paired iterations taken from EKP - - input_output_pairs = Utilities.get_training_points(ekiobj, N_iter - 1) # 1:N-1 - - input_output_pairs_test = Utilities.get_training_points(ekiobj, N_iter:N_iter) # just "next" iteration used for testing + min_iter = min(max_iter, max(1, min_iter)) + input_output_pairs = Utilities.get_training_points(ekiobj, min_iter:(N_iter - 1)) + input_output_pairs_test = Utilities.get_training_points(ekiobj, N_iter:(length(get_u(ekiobj)) - 1)) # "next" iterations # Save data @save joinpath(data_save_directory, "input_output_pairs.jld2") input_output_pairs + # plot training points in constrained space + if case == cases[mask[1]] + gr(dpi = 300, size = (400, 400)) + inputs_unconstrained = get_inputs(input_output_pairs) + inputs_constrained = transform_unconstrained_to_constrained(priors, inputs_unconstrained) + p = plot( + title = "training points", + xlims = extrema(inputs_constrained[1, :]), + xaxis = "F", + yaxis = "A", + ylims = extrema(inputs_constrained[2, :]), + ) + scatter!(p, inputs_constrained[1, :], inputs_constrained[2, :], color = :magenta, label = false) + inputs_test_unconstrained = get_inputs(input_output_pairs_test) + inputs_test_constrained = transform_unconstrained_to_constrained(priors, inputs_test_unconstrained) + + scatter!(p, inputs_test_constrained[1, :], inputs_test_constrained[2, :], color = :black, label = false) + + vline!(p, [truth_params_constrained[1]], linestyle = :dash, linecolor = :red, label = false) + hline!(p, [truth_params_constrained[2]], linestyle = :dash, linecolor = :red, label = false) + savefig(p, joinpath(figure_save_directory, "training_points.pdf")) + savefig(p, joinpath(figure_save_directory, "training_points.png")) + end + standardize = false retained_svd_frac = 1.0 normalized = true @@ -190,11 +226,11 @@ function main() println("ML prediction on true parameters: ") println(vec(y_mean)) println("true data: ") - println(truth_sample_mean) # same, regardless of norm_factor - println(" ML predicted variance") - println(diag(y_var[1], 0)) + println(truth_sample) # what was used as truth + println(" ML predicted standard deviation") + println(sqrt.(diag(y_var[1], 0))) println("ML MSE (truth): ") - println(mean((truth_sample_mean - vec(y_mean)) .^ 2)) + println(mean((truth_sample - vec(y_mean)) .^ 2)) println("ML MSE (next ensemble): ") println(mean((get_outputs(input_output_pairs_test) - y_mean_test) .^ 2)) @@ -207,13 +243,13 @@ function main() println("initial parameters: ", u0) # First let's run a short chain to determine a good step size - truth_sample = truth_sample mcmc = MCMCWrapper(RWMHSampling(), truth_sample, priors, emulator; init_params = u0) new_step = optimize_stepsize(mcmc; init_stepsize = 0.1, N = 2000, discard_initial = 0) # Now begin the actual MCMC println("Begin MCMC - with step size ", new_step) chain = MarkovChainMonteCarlo.sample(mcmc, 100_000; stepsize = new_step, discard_initial = 2_000) + posterior = MarkovChainMonteCarlo.get_posterior(mcmc, chain) post_mean = mean(posterior) @@ -226,7 +262,6 @@ function main() println(det(inv(post_cov))) println(" ") - constrained_truth_params = transform_unconstrained_to_constrained(posterior, truth_params) param_names = get_name(posterior) posterior_samples = vcat([get_distribution(posterior)[name] for name in 
get_name(posterior)]...) #samples are columns @@ -235,10 +270,12 @@ function main() gr(dpi = 300, size = (300, 300)) p = cornerplot(permutedims(constrained_posterior_samples, (2, 1)), label = param_names, compact = true) - plot!(p.subplots[1], [constrained_truth_params[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # vline on top histogram - plot!(p.subplots[3], [constrained_truth_params[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) # hline on right histogram - plot!(p.subplots[2], [constrained_truth_params[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # v & h line on scatter. - plot!(p.subplots[2], [constrained_truth_params[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) + plot!(p.subplots[1], [truth_params_constrained[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # vline on top histogram + plot!(p.subplots[3], [truth_params_constrained[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) # hline on right histogram + plot!(p.subplots[2], [truth_params_constrained[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # v & h line on scatter. + plot!(p.subplots[2], [truth_params_constrained[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) + figpath = joinpath(figure_save_directory, "posterior_2d-" * case * ".pdf") + savefig(figpath) figpath = joinpath(figure_save_directory, "posterior_2d-" * case * ".png") savefig(figpath) diff --git a/src/Emulator.jl b/src/Emulator.jl index 8c6f7a2d8..985d59dee 100644 --- a/src/Emulator.jl +++ b/src/Emulator.jl @@ -132,7 +132,6 @@ function Emulator( svd_transform(training_outputs, obs_noise_cov, retained_svd_frac = retained_svd_frac) training_pairs = PairedDataContainer(training_inputs, decorrelated_training_outputs) - # [4.] build an emulator build_models!(machine_learning_tool, training_pairs) else @@ -141,7 +140,6 @@ function Emulator( end decomposition = nothing training_pairs = PairedDataContainer(training_inputs, training_outputs) - # [4.] build an emulator - providing the noise covariance as a Tikhonov regularizer build_models!(machine_learning_tool, training_pairs, regularization_matrix = obs_noise_cov) end @@ -258,13 +256,20 @@ function predict( # [4.] unstandardize return reverse_standardize(emulator, s_outputs, s_output_cov) else #... and want to stay in decorrelated standardized coordinates - diag_out_flag = get_diagonalize_output(get_machine_learning_tool(emulator)) - if diag_out_flag - ds_output_diagvar = vec([Diagonal(ds_output_covvec[j]) for j in 1:N_samples]) # extracts diagonal - if output_dim == 1 + + # special cases where you only return variances and not covariances, could be better acheived with dispatch... 
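For context on the decorrelated standardized coordinates referred to here: `svd_transform` (see the hunk just below) whitens the outputs with the SVD of `obs_noise_cov`. A standalone sketch with made-up numbers:

using LinearAlgebra

obs_noise_cov = [1.0 0.2; 0.2 0.5]  # hypothetical 2x2 output noise covariance
y = [0.3, -1.1]                     # one (centred) output sample

decomp = svd(obs_noise_cov)
sqrt_singular_values_inv = Diagonal(1.0 ./ sqrt.(decomp.S))
y_decorrelated = sqrt_singular_values_inv * decomp.Vt * y  # noise covariance is ~I in these coordinates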
+ kernel_structure = get_kernel_structure(get_machine_learning_tool(emulator)) + if nameof(typeof(kernel_structure)) == :SeparableKernel + output_cov_structure = get_output_cov_structure(kernel_structure) + if nameof(typeof(output_cov_structure)) == :DiagonalFactor + ds_output_diagvar = vec([Diagonal(ds_output_covvec[j]) for j in 1:N_samples]) # extracts diagonal + return ds_outputs, ds_output_diagvar + elseif nameof(typeof(output_cov_structure)) == :OneDimFactor ds_output_diagvar = ([ds_output_covvec[i][1, 1] for i in 1:N_samples], 1, :) # extracts value + return ds_outputs, ds_output_diagvar + else + return ds_outputs, ds_output_covvec end - return ds_outputs, ds_output_diagvar else return ds_outputs, ds_output_covvec end @@ -434,7 +439,7 @@ function svd_transform( decomposition = SVD(decomp.U[:, 1:k], decomp.S[1:k], decomp.Vt[1:k, :]) else transformed_data = sqrt_singular_values_inv * decomp.Vt * data - decomposition = svd(obs_noise_cov) + decomposition = decomp end return transformed_data, decomposition end diff --git a/src/RandomFeature.jl b/src/RandomFeature.jl index e04fde7eb..129d55f1b 100644 --- a/src/RandomFeature.jl +++ b/src/RandomFeature.jl @@ -8,15 +8,117 @@ using ..Utilities using StableRNGs using Random -export calculate_n_hyperparameters, build_default_prior +export calculate_n_hyperparameters, hyperparameters_from_flat, build_default_prior, cov_structure_from_string +export CovarianceStructureType, OneDimFactor, DiagonalFactor, CholeskyFactor, LowRankFactor, HierarchicalLowRankFactor +export SeparableKernel, NonseparableKernel +export get_input_cov_structure, get_output_cov_structure, get_cov_structure +export get_eps +export rank + + abstract type RandomFeatureInterface <: MachineLearningTool end +# Different types of threading abstract type MultithreadType end struct TullioThreading <: MultithreadType end struct EnsembleThreading <: MultithreadType end -#common functions + +# Different types of covariance representation in the prior +abstract type CovarianceStructureType end +struct OneDimFactor <: CovarianceStructureType end +struct DiagonalFactor{FT <: AbstractFloat} <: CovarianceStructureType + eps::FT +end +DiagonalFactor() = DiagonalFactor(Float64(1.0)) +get_eps(df::DiagonalFactor) = df.eps + +struct CholeskyFactor{FT <: AbstractFloat} <: CovarianceStructureType + eps::FT +end +CholeskyFactor() = CholeskyFactor(Float64(1.0)) +get_eps(cf::CholeskyFactor) = cf.eps + +struct LowRankFactor{FT <: AbstractFloat} <: CovarianceStructureType + rank::Int + eps::FT +end +LowRankFactor(r::Int) = LowRankFactor(r, Float64(1.0)) +get_eps(lrf::LowRankFactor) = lrf.eps + +struct HierarchicalLowRankFactor{FT <: AbstractFloat} <: CovarianceStructureType + #cst::CovarianceStructureType + rank::Int + eps::FT +end + +HierarchicalLowRankFactor(r::Int) = HierarchicalLowRankFactor(r, Float64(1.0)) +get_eps(hlrf::HierarchicalLowRankFactor) = hlrf.eps + +import LinearAlgebra: rank +rank(lrf::LowRankFactor) = lrf.rank +rank(hlrf::HierarchicalLowRankFactor) = hlrf.rank + +# To add a new cov type `T` one must define these 3 methods: +# hyperparameters_from_flat(x::V, d::Int, t::T) where {V <: AbstractVector} +# calculate_n_hyperparameters(d::Int, t::T) +# build_default_prior(name::SS, d::Int, t::T) where {SS <: AbstractString} +# and add a string id to cov_structure_from_string + +function cov_structure_from_string(s::S, d::Int) where {S <: AbstractString} + if s == "onedim" + return OneDimFactor() + elseif s == "diagonal" + return DiagonalFactor() + elseif s == "cholesky" + return 
CholeskyFactor() + elseif s == "lowrank" + return LowRankFactor(Int(ceil(sqrt(d)))) + elseif s == "hierlowrank" + return HierarchicalLowRankFactor(Int(ceil(sqrt(d)))) + else + throw( + ArgumentError( + "Recieved unknown `input_cov_structure` keyword $(s), \n please choose from [\"onedim\",\"diagonal\",\"cholesky\", \"lowrank\"] or provide a CovarianceStructureType", + ), + ) + end +end +cov_structure_from_string(cst::CST, args...; kwargs...) where {CST <: CovarianceStructureType} = cst + + +# Different types of kernel +abstract type KernelStructureType end + +struct SeparableKernel{CST1 <: CovarianceStructureType, CST2 <: CovarianceStructureType} <: KernelStructureType + input_cov_structure::CST1 + output_cov_structure::CST2 +end +get_input_cov_structure(kernel_structure::SeparableKernel) = kernel_structure.input_cov_structure +get_output_cov_structure(kernel_structure::SeparableKernel) = kernel_structure.output_cov_structure + +struct NonseparableKernel{CST <: CovarianceStructureType} <: KernelStructureType + cov_structure::CST +end +get_cov_structure(kernel_structure::NonseparableKernel) = kernel_structure.cov_structure + + +# calculate_n_hyperparameters +calculate_n_hyperparameters(d::Int, odf::OneDimFactor) = 0 +calculate_n_hyperparameters(d::Int, df::DiagonalFactor) = d +calculate_n_hyperparameters(d::Int, cf::CholeskyFactor) = Int(d * (d + 1) / 2) +calculate_n_hyperparameters(d::Int, lrf::LowRankFactor) = Int(rank(lrf) + d * rank(lrf)) +calculate_n_hyperparameters(d::Int, hlrf::HierarchicalLowRankFactor) = + Int(rank(hlrf) * (rank(hlrf) + 1) / 2 + d * rank(hlrf)) + +# build from flat +hyperparameters_from_flat(x::V, odf::OneDimFactor) where {V <: AbstractVector} = nothing + +function hyperparameters_from_flat(x::V, df::DiagonalFactor) where {V <: AbstractVector} + return convert(Matrix, Diagonal(x) + get_eps(df) * I) +end + """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -33,185 +135,234 @@ function flat_to_chol(x::V) where {V <: AbstractVector} return cholmat end -""" -$(DocStringExtensions.TYPEDSIGNATURES) +function hyperparameters_from_flat(x::V, cf::CholeskyFactor) where {V <: AbstractVector} + chol = flat_to_chol(x) + return chol * permutedims(chol, (2, 1)) + get_eps(cf) * I -Builds Covariance matrices from a flat array of hyperparameters -""" -function hyperparameters_from_flat( - x::VV, - input_dim::Int, - output_dim::Int, - diagonalize_input::Bool, - diagonalize_output::Bool, -) where {VV <: AbstractVector} - # if dim = 1 then parameters are a 1x1 matrix - # if we diagonalize then parameters are diagonal entries + constant - # if we don't diagonalize then parameters are a cholesky product + constant on diagonal. 
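To illustrate the extension point noted above ("To add a new cov type `T` one must define these 3 methods"), here is a hedged sketch of a hypothetical `ExpDiagonalFactor`, a diagonal covariance parameterized by log-scales. The type and its behaviour are purely illustrative; the method definitions would sit alongside those above in src/RandomFeature.jl, where `LinearAlgebra` and `constrained_gaussian` are already in scope:

# hypothetical new covariance structure: diagonal covariance parameterized by log-scales
struct ExpDiagonalFactor{FT <: AbstractFloat} <: CovarianceStructureType
    eps::FT
end
ExpDiagonalFactor() = ExpDiagonalFactor(1.0)
get_eps(edf::ExpDiagonalFactor) = edf.eps

# (1) number of hyperparameters in dimension d
calculate_n_hyperparameters(d::Int, edf::ExpDiagonalFactor) = d

# (2) build the covariance from a flat hyperparameter vector
function hyperparameters_from_flat(x::V, edf::ExpDiagonalFactor) where {V <: AbstractVector}
    return convert(Matrix, Diagonal(exp.(x)) + get_eps(edf) * I)
end

# (3) a default prior over the (unconstrained) log-scales
function build_default_prior(name::SS, n_hp::Int, edf::ExpDiagonalFactor) where {SS <: AbstractString}
    return constrained_gaussian("$(name)_expdiag", 0.0, 3.0, -Inf, Inf, repeats = n_hp)
end

# plus, optionally, a string id in cov_structure_from_string, e.g. an `elseif s == "expdiag"` branch returning ExpDiagonalFactor()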
- - #input space - if input_dim == 1 - in_max = 1 - U = x[in_max] * ones(1, 1) - elseif diagonalize_input - in_max = input_dim + 2 - U = convert(Matrix, x[in_max - 1] * (Diagonal(x[1:(in_max - 2)]) + x[in_max] * I)) - elseif !diagonalize_input - in_max = Int(input_dim * (input_dim + 1) / 2) + 2 - cholU = flat_to_chol(x[1:(in_max - 2)]) - U = x[in_max - 1] * (cholU * permutedims(cholU, (2, 1)) + x[in_max] * I) - end +end - # output_space - out_min = in_max + 1 - if output_dim == 1 - V = x[end] * ones(1, 1) - elseif diagonalize_output - V = convert(Matrix, x[end - 1] * (Diagonal(x[out_min:(end - 2)]) + x[end] * I)) - elseif !diagonalize_output - cholV = flat_to_chol(x[out_min:(end - 2)]) - V = x[end - 1] * (cholV * permutedims(cholV, (2, 1)) + x[end] * I) - end +function hyperparameters_from_flat(x::V, lrf::LowRankFactor) where {V <: AbstractVector} - # sometimes this process does not give a (numerically) symmetric matrix - U = 0.5 * (U + permutedims(U, (2, 1))) - V = 0.5 * (V + permutedims(V, (2, 1))) + r = rank(lrf) + d = Int(length(x) / r - 1) # l = r + d*r - return U, V + D = Diagonal(x[1:r]) # D acts like sing.vals.^2 + U = reshape(x[(r + 1):end], d, r) + qrU = qr(U) # qrU.Q is semi-orthogonal in first r columns, i.e. Q^T.Q = I + return qrU.Q[:, 1:r] * D * qrU.Q[:, 1:r]' + get_eps(lrf) * I end -function hyperparameters_from_flat(x::VV, input_dim::Int, diagonalize_input::Bool) where {VV <: AbstractVector} - output_dim = 1 - diagonalize_output = false - U, V = hyperparameters_from_flat(x, input_dim, output_dim, diagonalize_input, diagonalize_output) - # Note that the scalar setting has a slight inconsistency from the 1-D output vector case - # We correct here by rescaling U using the single hyperparameter in V - return V[1, 1] * U +function hyperparameters_from_flat(x::V, hlrf::HierarchicalLowRankFactor) where {V <: AbstractVector} + r = rank(hlrf) + d = Int(length(x) / r - (r + 1) / 2) # l = r^2 + d*r + + # Ambikasaran, O'Neil, Singh 2016 + L = flat_to_chol(x[1:Int(r * (r + 1) / 2)]) + U = reshape(x[Int(r * (r + 1) / 2 + 1):end], d, r) + K = L * permutedims(L, (2, 1)) + get_eps(hlrf) * I + + qrU = qr(U) + svdT = svd(I + qrU.R * K * qrU.R') # square so T = Q S Q^t + M = svdT.V * Diagonal(sqrt.(svdT.S)) # M M^t = Q sqrt(S) (Q sqrt(S))^t = Q sqrt(S) sqrt(S) Q^t = T + X = M - I + update_factor = I + qrU.Q * X * qrU.Q' + + return update_factor * update_factor' +end + + +build_default_prior(name::SS, n_hp::Int, odf::OneDimFactor) where {SS <: AbstractString} = nothing + +function build_default_prior(name::SS, n_hp::Int, df::DiagonalFactor) where {SS <: AbstractString} + return ParameterDistribution( + Dict( + "name" => "$(name)_diagonal", + "distribution" => VectorOfParameterized(repeat([Normal(0, 3)], n_hp)), + "constraint" => repeat([bounded_below(0.0)], n_hp), + ), + ) end +function build_default_prior(name::SS, n_hp::Int, df::CholeskyFactor) where {SS <: AbstractString} + return constrained_gaussian("$(name)_cholesky", 0.0, 5.0, -Inf, Inf, repeats = n_hp) +end + +function build_default_prior(name::SS, n_hp::Int, lrf::LowRankFactor) where {SS <: AbstractString} + r = rank(lrf) + d = Int(n_hp / r - 1) + D = ParameterDistribution( + Dict( + "name" => "$(name)_lowrank_diagonal", + "distribution" => VectorOfParameterized(repeat([Normal(0, 3)], r)), + "constraint" => repeat([bounded_below(0.0)], r), + ), + ) + U = constrained_gaussian("$(name)_lowrank_U", 0.0, 100.0, -Inf, Inf, repeats = Int(d * r)) + return combine_distributions([D, U]) +end + +function build_default_prior(name::SS, n_hp::Int, 
hlrf::HierarchicalLowRankFactor) where {SS <: AbstractString} + r = rank(hlrf) + d = Int(n_hp / r - (r + 1) / 2) + L = constrained_gaussian("$(name)_lowrank_Kchol", 0.0, 5.0, -Inf, Inf, repeats = Int(r * (r + 1) / 2)) + U = constrained_gaussian("$(name)_lowrank_U", 0.0, 100.0, -Inf, Inf, repeats = Int(d * r)) + return combine_distributions([L, U]) +end + +# combining input and output spaces: + """ $(DocStringExtensions.TYPEDSIGNATURES) -Builds a prior over the hyperparameters (i.e. the cholesky/diagaonal or individaul entries of the input/output covariances). -For example, the case where the input covariance ``U = γ_1 * (LL^T + γ_2 I)``, -we set priors for the entries of the lower triangular matrix ``L`` as normal, and constant scalings ``γ_i`` as log-normal to retain positivity. - -priors can be scaled by a constant factor by using the in_scale and out_scale keywords +Calculate number of hyperparameters required to create the default prior in the given input/output space dimensions (determined from the `CovarianceStructureType`) """ -function build_default_prior( +function calculate_n_hyperparameters( input_dim::Int, output_dim::Int, - diagonalize_input::Bool, - diagonalize_output::Bool; - in_scale = 1.0, - out_scale = 1.0, -) - n_hp_in = n_hyperparameters_cov(input_dim, diagonalize_input) - n_hp_out = n_hyperparameters_cov(output_dim, diagonalize_output) - - # aspect ratio for mean:sd for positive parameters in constrained_gaussian() - # 10 seems to give a decent range, this isn't very tuneable so the alternative would be to use the basic constructor - pos_asp_ratio = 10 - # if dim = 1 , positive constant - # elseif diagonalized, positive on diagonal and positive scalings - # else chol factor, and positive scalings - if input_dim > 1 - if diagonalize_input - param_diag = - constrained_gaussian("input_prior_diagonal", 1.0, pos_asp_ratio, 0.0, Inf, repeats = n_hp_in - 2) - param_scaling = - constrained_gaussian("input_prior_scaling", in_scale, in_scale * pos_asp_ratio, 0.0, Inf, repeats = 2) - input_prior = combine_distributions([param_diag, param_scaling]) - else - param_chol = constrained_gaussian("input_prior_cholesky", 0.0, 1.0, -Inf, Inf, repeats = n_hp_in - 2) - param_scaling = - constrained_gaussian("input_prior_scaling", in_scale, in_scale * pos_asp_ratio, 0.0, Inf, repeats = 2) - input_prior = combine_distributions([param_chol, param_scaling]) - end - else - input_prior = constrained_gaussian("input_prior", in_scale, in_scale, 0.0, Inf) + kernel_structure::SK, +) where {SK <: SeparableKernel} + + input_cov_structure = get_input_cov_structure(kernel_structure) + n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) + output_cov_structure = get_output_cov_structure(kernel_structure) + n_hp_out = calculate_n_hyperparameters(output_dim, output_cov_structure) + n_scalings = 1 + + # if both in and output dim are "OneDimFactors" we still learn 1 hp + if n_hp_in + n_hp_out == 0 + return 1 + n_scalings end - if output_dim > 1 - if diagonalize_output - param_diag = - constrained_gaussian("output_prior_diagonal", 1.0, pos_asp_ratio, 0.0, Inf, repeats = n_hp_out - 2) - param_scaling = constrained_gaussian( - "output_prior_scaling", - out_scale, - out_scale * pos_asp_ratio, - 0.0, - Inf, - repeats = 2, - ) - output_prior = combine_distributions([param_diag, param_scaling]) - else - param_chol = constrained_gaussian("output_prior_cholesky", 0.0, 1.0, -Inf, Inf, repeats = n_hp_out - 2) - param_scaling = constrained_gaussian( - "output_prior_scaling", - out_scale, - out_scale * 
pos_asp_ratio, - 0.0, - Inf, - repeats = 2, - ) - output_prior = combine_distributions([param_chol, param_scaling]) - end - else - output_prior = constrained_gaussian("output_prior", out_scale, 10 * out_scale, 0.0, Inf) - end - - return combine_distributions([input_prior, output_prior]) - + return n_hp_in + n_hp_out + n_scalings end -function build_default_prior(input_dim::Int, diagonalize_input::Bool; in_scale = 1.0) - #scalar case consistent with vector case +function calculate_n_hyperparameters(input_dim::Int, kernel_structure::KST) where {KST <: KernelStructureType} output_dim = 1 - diagonalize_output = false - out_scale = 1.0 - return build_default_prior( - input_dim, - output_dim, - diagonalize_input, - diagonalize_output, - in_scale = in_scale, - out_scale = out_scale, - ) + return calculate_n_hyperparameters(input_dim, output_dim, kernel_structure) end -""" -$(DocStringExtensions.TYPEDSIGNATURES) +function calculate_n_hyperparameters( + input_dim::Int, + output_dim::Int, + kernel_structure::NK, +) where {NK <: NonseparableKernel} + cov_structure = get_cov_structure(kernel_structure) + n_hp = calculate_n_hyperparameters(input_dim * output_dim, cov_structure) + n_scalings = 1 + # if both in and output dim are "OneDimFactors" we still learn 1 hp + if n_hp == 0 + return 1 + n_scalings + end -The number of hyperparameters required to characterize covariance (determined from `hyperparameters_from_flat`). -""" -function n_hyperparameters_cov(dim::Int, diagonalize::Bool) - n_hp = 0 - n_hp += diagonalize ? dim : Int(dim * (dim + 1) / 2) # inputs: diagonal or cholesky - n_hp += (dim > 1) ? 2 : 0 # add two more for constant diagonal in each case. - return n_hp + return n_hp + n_scalings end + + """ $(DocStringExtensions.TYPEDSIGNATURES) -Calculate number of hyperparameters required to create the default prior in the given input/output space (determined from `hyperparameters_from_flat`) +Builds a prior over the hyperparameters (i.e. the low-rank/cholesky/diagaonal or individaul entries of the input/output covariances). +For example, the case where the input covariance ``U = γ_1 * (LL^T + γ_2 I)``, +we set priors for the entries of the lower triangular matrix ``L`` as normal, and constant scalings ``γ_i`` as log-normal to retain positivity. 
""" -function calculate_n_hyperparameters(input_dim::Int, output_dim::Int, diagonalize_input::Bool, diagonalize_output::Bool) - n_hp_in = n_hyperparameters_cov(input_dim, diagonalize_input) - n_hp_out = n_hyperparameters_cov(output_dim, diagonalize_output) - return n_hp_in + n_hp_out +function build_default_prior(input_dim::Int, output_dim::Int, kernel_structure::SK) where {SK <: SeparableKernel} + input_cov_structure = get_input_cov_structure(kernel_structure) + output_cov_structure = get_output_cov_structure(kernel_structure) + + n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) + input_prior = build_default_prior("input", n_hp_in, input_cov_structure) + n_hp_out = calculate_n_hyperparameters(output_dim, output_cov_structure) + output_prior = build_default_prior("output", n_hp_out, output_cov_structure) + + scaling_kern = constrained_gaussian("sigma", 1, 5, 0, Inf) + + # We only use OneDimFactor values, default to input if both in and output dimension are one dimensional.Otherwise they are ignored + if isnothing(input_prior) && isnothing(output_prior) + onedim_prior = constrained_gaussian("onedim", 1.0, 1.0, 0.0, Inf) + return combine_distributions([onedim_prior, scaling_kern]) + elseif isnothing(input_prior) + return combine_distributions([output_prior, scaling_kern]) + elseif isnothing(output_prior) + return combine_distributions([input_prior, scaling_kern]) + else + return combine_distributions([input_prior, output_prior, scaling_kern]) + end +end + +function build_default_prior(input_dim::Int, kernel_structure::KST) where {KST <: KernelStructureType} + output_dim = 1 + return build_default_prior(input_dim, output_dim, kernel_structure) +end + +function build_default_prior(input_dim::Int, output_dim::Int, kernel_structure::NK) where {NK <: NonseparableKernel} + cov_structure = get_cov_structure(kernel_structure) + n_hp = calculate_n_hyperparameters(input_dim * output_dim, cov_structure) + if n_hp == 0 + pd_kern = constrained_gaussian("onedim", 1.0, 1.0, 0.0, Inf) + else + pd_kern = build_default_prior("full", n_hp, cov_structure) + end + + scaling_kern = constrained_gaussian("sigma", 1, 5, 0, Inf) + return combine_distributions([pd_kern, scaling_kern]) +end + +function hyperparameters_from_flat( + x::VV, + input_dim::Int, + output_dim::Int, + kernel_structure::SK, +) where {VV <: AbstractVector, SK <: SeparableKernel} + input_cov_structure = get_input_cov_structure(kernel_structure) + output_cov_structure = get_output_cov_structure(kernel_structure) + + n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) + U = hyperparameters_from_flat(x[1:n_hp_in], input_cov_structure) + V = hyperparameters_from_flat(x[(n_hp_in + 1):end], output_cov_structure) + + if isnothing(U) && isnothing(V) #i.e. 
both are from OneDimFactor cov structures + return x[1] * ones(1, 1), V + elseif isnothing(U) + return U, 0.5 * (V + permutedims(V, (2, 1))) + elseif isnothing(V) + return 0.5 * (U + permutedims(U, (2, 1))), V + else + # symmetrize (sometimes there are numerical artifacts) + U = 0.5 * (U + permutedims(U, (2, 1))) + V = 0.5 * (V + permutedims(V, (2, 1))) + return U, V + end end -function calculate_n_hyperparameters(input_dim::Int, diagonalize_input::Bool) - #scalar case consistent with vector case +function hyperparameters_from_flat( + x::VV, + input_dim::Int, + kernel_structure::SK, +) where {VV <: AbstractVector, SK <: SeparableKernel} output_dim = 1 - diagonalize_output = false - return calculate_n_hyperparameters(input_dim, output_dim, diagonalize_input, diagonalize_output) + U, V = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) + + return U end +function hyperparameters_from_flat( + x::VV, + input_dim::Int, + output_dim::Int, + kernel_structure::NK, +) where {VV <: AbstractVector, NK <: NonseparableKernel} + cov_structure = get_cov_structure(kernel_structure) + C = hyperparameters_from_flat(x, cov_structure) + if isnothing(C) + return x[1] * ones(1, 1) + else + return 0.5 * (C + permutedims(C, (2, 1))) + end + +end """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -252,7 +403,7 @@ function calculate_mean_cov_and_coeffs( otest = get_outputs(io_pairs)[:, (n_train + 1):end] input_dim = size(itrain, 1) output_dim = size(otrain, 1) - + n_test = size(itest, 2) # build and fit the RF rfm = RFM_from_hyperparameters( rfi, @@ -278,11 +429,15 @@ function calculate_mean_cov_and_coeffs( buffer, tullio_threading = thread_opt, ) + # sizes (output_dim x n_test), (output_dim x output_dim x n_test) - scaled_coeffs = sqrt(1 / n_features) * RF.Methods.get_coeffs(fitted_features) - # we add the additional complexity term - # TODO only cholesky and SVD available + ## TODO - the theory states that the following should be set: + # scaled_coeffs = sqrt(1 / (n_features)) * RF.Methods.get_coeffs(fitted_features) + # However the convergence is much improved with setting this to zero: + scaled_coeffs = 0 + + if decomp_type == "cholesky" chol_fac = RF.Methods.get_decomposition(RF.Methods.get_feature_factors(fitted_features)).L complexity = 2 * sum(log(chol_fac[i, i]) for i in 1:size(chol_fac, 1)) @@ -290,10 +445,47 @@ function calculate_mean_cov_and_coeffs( svd_singval = RF.Methods.get_decomposition(RF.Methods.get_feature_factors(fitted_features)).S complexity = sum(log, svd_singval) # note this is log(abs(det)) end + complexity = sqrt(abs(complexity)) #abs can introduce nonconvexity, + return scaled_coeffs, complexity end + + +""" +$(DocStringExtensions.TYPEDSIGNATURES) + +Calculate the empirical covariance, additionally applying a shrinkage operator (here the Ledoit Wolf 2004 shrinkage operation). 
Known to have better stability properties than Monte-Carlo for low sample sizes +""" +function shrinkage_cov(sample_mat::AA) where {AA <: AbstractMatrix} + n_out, n_sample = size(sample_mat) + + # de-mean (as we will use the samples directly for calculation of β) + sample_mat_zeromean = sample_mat .- mean(sample_mat, dims = 2) + # Ledoit Wolf shrinkage to I + + # get sample covariance + Γ = cov(sample_mat_zeromean, dims = 2) + # estimate opt shrinkage + μ_shrink = 1 / n_out * tr(Γ) + δ_shrink = norm(Γ - μ_shrink * I)^2 / n_out # (scaled) frob norm of Γ_m + #once de-meaning, we need to correct the sample covariance with an n_sample -> n_sample-1 + β_shrink = sum([norm(c * c' - -Γ)^2 / n_out for c in eachcol(sample_mat_zeromean)]) / (n_sample - 1)^2 + + γ_shrink = min(β_shrink / δ_shrink, 1) # clipping is typically rare + # γμI + (1-γ)Γ + Γ .*= (1 - γ_shrink) + for i in 1:n_out + Γ[i, i] += γ_shrink * μ_shrink + end + + @info "Shrinkage scale: $(γ_shrink), (0 = none, 1 = revert to scaled Identity)\n shrinkage covariance condition number: $(cond(Γ))" + return Γ +end + + + """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -377,14 +569,19 @@ function estimate_mean_and_coeffnorm_covariance( blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) end + sample_mat = vcat(blockmeans, coeffl2norm, complexity) + shrinkage = true + if shrinkage + Γ = shrinkage_cov(sample_mat) + else + Γ = cov(sample_mat, dims = 2) + end + if !isposdef(approx_σ2) println("approx_σ2 not posdef") approx_σ2 = posdef_correct(approx_σ2) end - Γ = cov(vcat(blockmeans, coeffl2norm, complexity), dims = 2) - - return Γ, approx_σ2 end @@ -579,16 +776,27 @@ function estimate_mean_and_coeffnorm_covariance( blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) end + + sample_mat = vcat(blockmeans, coeffl2norm, complexity) + shrinkage = true + if shrinkage + Γ = shrinkage_cov(sample_mat) + else + Γ = cov(sample_mat, dims = 2) + end + if !isposdef(approx_σ2) println("approx_σ2 not posdef") approx_σ2 = posdef_correct(approx_σ2) end - Γ = cov(vcat(blockmeans, coeffl2norm, complexity), dims = 2) return Γ, approx_σ2 end + + + function calculate_ensemble_mean_and_coeffnorm( rfi::RFI, rng::RNG, @@ -622,11 +830,6 @@ function calculate_ensemble_mean_and_coeffnorm( println("calculating " * string(N_ens * repeats) * " ensemble members...") nthreads = Threads.nthreads() - - means = zeros(output_dim, N_ens, n_test) - complexity = zeros(1, N_ens) - coeffl2norm = zeros(1, N_ens) - c_list = [zeros(1, N_ens) for i in 1:nthreads] cp_list = [zeros(1, N_ens) for i in 1:nthreads] m_list = [zeros(output_dim, N_ens, n_test) for i in 1:nthreads] diff --git a/src/ScalarRandomFeature.jl b/src/ScalarRandomFeature.jl index 0a3c0ae9c..a23586127 100644 --- a/src/ScalarRandomFeature.jl +++ b/src/ScalarRandomFeature.jl @@ -6,11 +6,12 @@ $(DocStringExtensions.TYPEDEF) Structure holding the Scalar Random Feature models. 
-# Fields +# Fields $(DocStringExtensions.TYPEDFIELDS) """ -struct ScalarRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG} <: RandomFeatureInterface +struct ScalarRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} <: + RandomFeatureInterface "vector of `RandomFeatureMethod`s, contains the feature structure, batch-sizes and regularization" rfms::Vector{RF.Methods.RandomFeatureMethod} "vector of `Fit`s, containing the matrix decomposition and coefficients of RF when fitted to data" @@ -23,8 +24,8 @@ struct ScalarRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG} <: input_dim::Int "choice of random number generator" rng::RNG - "Assume inputs are decorrelated for ML" - diagonalize_input::Bool + "Kernel structure type (e.g. Separable or Nonseparable)" + kernel_structure::KST "Random Feature decomposition, choose from \"svd\" or \"cholesky\" (default)" feature_decomposition::S "dictionary of options for hyperparameter optimizer" @@ -76,9 +77,9 @@ get_rng(srfi::ScalarRandomFeatureInterface) = srfi.rng """ $(DocStringExtensions.TYPEDSIGNATURES) -gets the diagonalize_input field +Gets the kernel_structure field """ -get_diagonalize_input(srfi::ScalarRandomFeatureInterface) = srfi.diagonalize_input +get_kernel_structure(srfi::ScalarRandomFeatureInterface) = srfi.kernel_structure """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -101,7 +102,7 @@ $(DocStringExtensions.TYPEDSIGNATURES) Constructs a `ScalarRandomFeatureInterface <: MachineLearningTool` interface for the `RandomFeatures.jl` package for multi-input and single- (or decorrelated-)output emulators. - `n_features` - the number of random features - `input_dim` - the dimension of the input space - - `diagonalize_input = false` - whether to assume independence in input space (Note, whens set `true`, this tool will approximate directly the behaviour of the scalar-valued GaussianProcess with ARD kernel) + - `kernel_structure` - a prescribed form of kernel structure - `batch_sizes = nothing` - Dictionary of batch sizes passed `RandomFeatures.jl` object (see definition there) - `rng = Random.GLOBAL_RNG` - random number generator - `feature_decomposition = "cholesky"` - choice of how to store decompositions of random features, `cholesky` or `svd` available @@ -110,7 +111,7 @@ Constructs a `ScalarRandomFeatureInterface <: MachineLearningTool` interface for - "prior_in_scale": use this to tune the input prior scale - "n_ensemble": number of ensemble members - "n_iteration": number of eki iterations - - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 1.0) + - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 0.0) - "scheduler": Learning rate Scheduler (a.k.a.
EKP timestepper) Default: DataMisfitController - "tikhonov": tikhonov regularization parameter if >0 - "inflation": additive inflation ∈ [0,1] with 0 being no inflation @@ -121,22 +122,23 @@ Constructs a `ScalarRandomFeatureInterface <: MachineLearningTool` interface for function ScalarRandomFeatureInterface( n_features::Int, input_dim::Int; - diagonalize_input::Bool = false, + kernel_structure::Union{KST, Nothing} = nothing, batch_sizes::Union{Dict{S, Int}, Nothing} = nothing, rng::RNG = Random.GLOBAL_RNG, feature_decomposition::S = "cholesky", optimizer_options::Union{Dict{S}, Nothing} = nothing, -) where {S <: AbstractString, RNG <: AbstractRNG} +) where {S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} # Initialize vector for GP models rfms = Vector{RF.Methods.RandomFeatureMethod}(undef, 0) fitted_features = Vector{RF.Methods.Fit}(undef, 0) - if !isnothing(optimizer_options) - prior_in_scale = "prior_in_scale" ∈ keys(optimizer_options) ? optimizer_options["prior_in_scale"] : 1.0 - else - prior_in_scale = 1.0 + if isnothing(kernel_structure) + kernel_structure = SeparableKernel(cov_structure_from_string("lowrank", input_dim), OneDimFactor()) end - prior = build_default_prior(input_dim, diagonalize_input, in_scale = prior_in_scale) + KSType = typeof(kernel_structure) + + prior = build_default_prior(input_dim, kernel_structure) + # default optimizer settings optimizer_opts = Dict( "prior" => prior, #the hyperparameter_prior @@ -164,23 +166,60 @@ function ScalarRandomFeatureInterface( end @info("hyperparameter optimization with EKI configured with $opt_tmp") - return ScalarRandomFeatureInterface{S, RNG}( + return ScalarRandomFeatureInterface{S, RNG, KSType}( rfms, fitted_features, batch_sizes, n_features, input_dim, rng, - diagonalize_input, + kernel_structure, feature_decomposition, optimizer_opts, ) end +function hyperparameter_distribution_from_flat( + x::VV, + input_dim::Int, + kernel_structure::SK, +) where {VV <: AbstractVector, SK <: SeparableKernel} + + M = zeros(input_dim) #scalar output + U = hyperparameters_from_flat(x, input_dim, kernel_structure) + if !isposdef(U) + println("U not posdef - correcting") + U = posdef_correct(U) + end + + dist = MvNormal(M, U) + pd = ParameterDistribution( + Dict( + "distribution" => Parameterized(dist), + "constraint" => repeat([no_constraint()], input_dim), + "name" => "xi", + ), + ) + + return pd +end + +function hyperparameter_distribution_from_flat( + x::VV, + input_dim::Int, + kernel_structure::NK, +) where {VV <: AbstractVector, NK <: NonseparableKernel} + throw( + ArgumentError( + "Scalar Kernels must be of type `Separable( *** , OneDimFactor())`, received $(kernel_structure)", + ), + ) +end + """ $(DocStringExtensions.TYPEDSIGNATURES) -Builds the random feature method from hyperparameters. We use cosine activation functions and a MatrixVariateNormal(M,U,V) distribution (from `Distributions.jl`) with mean M=0, and input covariance U built from cholesky factorization or diagonal components based on the diagonalize_input flag. +Builds the random feature method from hyperparameters. We use cosine activation functions and a MatrixVariateNormal(M,U,V) distribution (from `Distributions.jl`) with mean M=0, and input covariance U built using a `CovarianceStructureType`. 
""" function RFM_from_hyperparameters( srfi::ScalarRandomFeatureInterface, @@ -198,33 +237,22 @@ function RFM_from_hyperparameters( S <: AbstractString, MT <: MultithreadType, } - diagonalize_input = get_diagonalize_input(srfi) - M = zeros(input_dim) #scalar output - U = hyperparameters_from_flat(l, input_dim, diagonalize_input) - if !isposdef(U) - println("U not posdef - correcting") - U = posdef_correct(U) - end + xi_hp = l[1:(end - 1)] + sigma_hp = l[end] + kernel_structure = get_kernel_structure(srfi) + pd = hyperparameter_distribution_from_flat(xi_hp, input_dim, kernel_structure) - dist = MvNormal(M, U) - pd = ParameterDistribution( - Dict( - "distribution" => Parameterized(dist), - "constraint" => repeat([no_constraint()], input_dim), - "name" => "xi", - ), - ) feature_sampler = RF.Samplers.FeatureSampler(pd, rng = rng) # Learn hyperparameters for different feature types - - sf = RF.Features.ScalarFourierFeature(n_features, feature_sampler) + feature_parameters = Dict("sigma" => sigma_hp) + sff = RF.Features.ScalarFourierFeature(n_features, feature_sampler, feature_parameters = feature_parameters) thread_opt = isa(multithread_type, TullioThreading) # if we want to multithread with tullio if isnothing(batch_sizes) - return RF.Methods.RandomFeatureMethod(sf, regularization = regularization, tullio_threading = thread_opt) + return RF.Methods.RandomFeatureMethod(sff, regularization = regularization, tullio_threading = thread_opt) else return RF.Methods.RandomFeatureMethod( - sf, + sff, regularization = regularization, batch_sizes = batch_sizes, tullio_threading = thread_opt, @@ -254,7 +282,7 @@ RFM_from_hyperparameters( """ $(DocStringExtensions.TYPEDSIGNATURES) -Builds the random feature method from hyperparameters. We use cosine activation functions and a Multivariate Normal distribution (from `Distributions.jl`) with mean M=0, and input covariance U built from cholesky factorization or diagonal components based on the diagonalize_input flag. +Builds the random feature method from hyperparameters. We use cosine activation functions and a Multivariate Normal distribution (from `Distributions.jl`) with mean M=0, and input covariance U built with the `CovarianceStructureType`. """ function build_models!( srfi::ScalarRandomFeatureInterface, @@ -268,6 +296,10 @@ function build_models!( input_dim = size(input_values, 1) + kernel_structure = get_kernel_structure(srfi) + n_hp = calculate_n_hyperparameters(input_dim, kernel_structure) + + rfms = get_rfms(srfi) fitted_features = get_fitted_features(srfi) n_features = get_n_features(srfi) @@ -276,16 +308,19 @@ function build_models!( decomp_type = get_feature_decomposition(srfi) optimizer_options = get_optimizer_options(srfi) - #regularization = I = 1.0 in scalar case - regularization = I + # Optimize features with EKP for each output dim # [1.] Split data into test/train 80/20 train_fraction = optimizer_options["train_fraction"] n_train = Int(floor(train_fraction * n_data)) n_test = n_data - n_train - n_features_opt = min(n_features, Int(floor(2 * n_test))) #we take a specified number of features for optimization. + n_features_opt = min(n_features, Int(floor(10 * n_test)))#Int(floor(2 * n_test))) #we take a specified number of features for optimization. 
idx_shuffle = randperm(rng, n_data) + #regularization = I = 1.0 in scalar case + regularization = I + + @info ( "hyperparameter learning for $n_rfms models using $n_train training points, $n_test validation points and $n_features_opt features" ) @@ -311,11 +346,22 @@ function build_models!( end # [2.] Estimate covariance at mean value prior = optimizer_options["prior"] + + # where prior space has changed we need to rebuild the priors + if ndims(prior) > n_hp + + # comes from having a truncated output_dimension + # TODO not really a truncation here, resetting to default + @info "Original input space of dimension $(get_input_dim(srfi)) has been truncated to $(input_dim). \n Rebuilding prior... number of hyperparameters reduced from $(ndims(prior)) to $(n_hp)." + prior = build_default_prior(input_dim, kernel_structure) + + end + μ_hp = transform_unconstrained_to_constrained(prior, mean(prior)) cov_sample_multiplier = optimizer_options["cov_sample_multiplier"] n_cov_samples_min = n_test + 2 - n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 1.0))) + n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 0.0))) internal_Γ, approx_σ2 = estimate_mean_and_coeffnorm_covariance( srfi, @@ -331,9 +377,8 @@ function build_models!( decomp_type, multithread_type, ) - Γ = internal_Γ - Γ[1:n_test, 1:n_test] += approx_σ2 + Γ[1:n_test, 1:n_test] += regularization # + approx_σ2 Γ[(n_test + 1):end, (n_test + 1):end] += I # [3.] set up EKP optimization @@ -343,7 +388,8 @@ function build_models!( scheduler = optimizer_options["scheduler"] initial_params = construct_initial_ensemble(rng, prior, n_ensemble) - min_complexity = log(det(regularization)) + min_complexity = log(regularization.λ) + min_complexity = sqrt(abs(min_complexity)) data = vcat(get_outputs(io_pairs_opt)[(n_train + 1):end], 0.0, min_complexity) ekiobj = EKP.EnsembleKalmanProcess( initial_params, @@ -415,6 +461,7 @@ function build_models!( io_pairs_i = PairedDataContainer(input_values, reshape(output_values[i, :], 1, size(output_values, 2))) # Now, fit new RF model with the optimized hyperparameters + rfm_i = RFM_from_hyperparameters( srfi, rng, diff --git a/src/VectorRandomFeature.jl b/src/VectorRandomFeature.jl index bbc7807c5..5326d1678 100644 --- a/src/VectorRandomFeature.jl +++ b/src/VectorRandomFeature.jl @@ -8,8 +8,7 @@ export get_rfms, get_input_dim, get_output_dim, get_rng, - get_diagonalize_input, - get_diagonalize_output, + get_kernel_structure, get_optimizer_options """ @@ -21,7 +20,8 @@ Structure holding the Vector Random Feature models. $(DocStringExtensions.TYPEDFIELDS) """ -struct VectorRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG} <: RandomFeatureInterface +struct VectorRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} <: + RandomFeatureInterface "A vector of `RandomFeatureMethod`s, contains the feature structure, batch-sizes and regularization" rfms::Vector{RF.Methods.RandomFeatureMethod} "vector of `Fit`s, containing the matrix decomposition and coefficients of RF when fitted to data" @@ -38,10 +38,8 @@ struct VectorRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG} <: rng::RNG "regularization" regularization::Vector{Union{Matrix, UniformScaling, Diagonal}} - "Assume inputs are decorrelated for ML" - diagonalize_input::Bool - "Assume outputs are decorrelated for ML. Note: emulator(..., decorrelate=true) by default" - diagonalize_output::Bool + "Kernel structure type (e.g. 
Separable or Nonseparable)" + kernel_structure::KST "Random Feature decomposition, choose from \"svd\" or \"cholesky\" (default)" feature_decomposition::S "dictionary of options for hyperparameter optimizer" @@ -107,16 +105,9 @@ get_regularization(vrfi::VectorRandomFeatureInterface) = vrfi.regularization """ $(DocStringExtensions.TYPEDSIGNATURES) -Gets the diagonalize_input field +Gets the kernel_structure field """ -get_diagonalize_input(vrfi::VectorRandomFeatureInterface) = vrfi.diagonalize_input - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the diagonalize_output field -""" -get_diagonalize_output(vrfi::VectorRandomFeatureInterface) = vrfi.diagonalize_output +get_kernel_structure(vrfi::VectorRandomFeatureInterface) = vrfi.kernel_structure """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -139,8 +130,7 @@ Constructs a `VectorRandomFeatureInterface <: MachineLearningTool` interface for - `n_features` - the number of random features - `input_dim` - the dimension of the input space - `output_dim` - the dimension of the output space - - `diagonalize_input = false` - whether to assume independence in input space - - `diagonalize_output::Bool = false` - whether to assume independence in output space (even if set true, this is not the same as a list of scalar-valued models, as hyperparameters will be shared between models in this case) + - `kernel_structure` - - a prescribed form of kernel structure - `batch_sizes = nothing` - Dictionary of batch sizes passed `RandomFeatures.jl` object (see definition there) - `rng = Random.GLOBAL_RNG` - random number generator - `feature_decomposition = "cholesky"` - choice of how to store decompositions of random features, `cholesky` or `svd` available @@ -150,7 +140,7 @@ Constructs a `VectorRandomFeatureInterface <: MachineLearningTool` interface for - "n_ensemble": number of ensemble members - "n_iteration": number of eki iterations - "scheduler": Learning rate Scheduler (a.k.a. EKP timestepper) Default: DataMisfitController - - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 1.0) + - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 0.0) - "tikhonov": tikhonov regularization parameter if > 0 - "inflation": additive inflation ∈ [0,1] with 0 being no inflation - "train_fraction": e.g. 0.8 (default) means 80:20 train - test split @@ -162,35 +152,28 @@ function VectorRandomFeatureInterface( n_features::Int, input_dim::Int, output_dim::Int; - diagonalize_input::Bool = false, - diagonalize_output::Bool = false, + kernel_structure::Union{KST, Nothing} = nothing, batch_sizes::Union{Dict{S, Int}, Nothing} = nothing, rng::RNG = Random.GLOBAL_RNG, feature_decomposition::S = "cholesky", optimizer_options::Union{Dict{S}, Nothing} = nothing, -) where {S <: AbstractString, RNG <: AbstractRNG} +) where {S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} # Initialize vector for RF model rfms = Vector{RF.Methods.RandomFeatureMethod}(undef, 0) fitted_features = Vector{RF.Methods.Fit}(undef, 0) regularization = Vector{Union{Matrix, UniformScaling, Nothing}}(undef, 0) - #Optimization Defaults - if !isnothing(optimizer_options) - prior_in_scale = "prior_in_scale" ∈ keys(optimizer_options) ? optimizer_options["prior_in_scale"] : 1.0 - prior_out_scale = "prior_out_scale" ∈ keys(optimizer_options) ? 
optimizer_options["prior_out_scale"] : 1.0 - else - prior_in_scale = 1.0 - prior_out_scale = 1.0 + if isnothing(kernel_structure) + kernel_structure = SeparableKernel( + cov_structure_from_string("lowrank", input_dim), + cov_structure_from_string("lowrank", output_dim), + ) end - prior = build_default_prior( - input_dim, - output_dim, - diagonalize_input, - diagonalize_output, - in_scale = prior_in_scale, - out_scale = prior_out_scale, - ) + KSType = typeof(kernel_structure) + prior = build_default_prior(input_dim, output_dim, kernel_structure) + + #Optimization Defaults optimizer_opts = Dict( "prior" => prior, #the hyperparameter_prior (note scalings have already been applied) "n_ensemble" => max(ndims(prior) + 1, 10), #number of ensemble @@ -202,6 +185,7 @@ function VectorRandomFeatureInterface( "train_fraction" => 0.8, # 80:20 train - test split "multithread" => "ensemble", # instead of "tullio" "verbose" => false, # verbose optimizer statements + "localization" => EKP.Localizers.NoLocalization(), # localization / sample error correction for small ensembles ) if !isnothing(optimizer_options) @@ -218,7 +202,7 @@ function VectorRandomFeatureInterface( end @info("hyperparameter optimization with EKI configured with $opt_tmp") - return VectorRandomFeatureInterface{S, RNG}( + return VectorRandomFeatureInterface{S, RNG, KSType}( rfms, fitted_features, batch_sizes, @@ -227,17 +211,72 @@ function VectorRandomFeatureInterface( output_dim, rng, regularization, - diagonalize_input, - diagonalize_output, + kernel_structure, feature_decomposition, optimizer_opts, ) end + + +function hyperparameter_distribution_from_flat( + x::VV, + input_dim::Int, + output_dim::Int, + kernel_structure::SK, +) where {VV <: AbstractVector, SK <: SeparableKernel} + M = zeros(input_dim, output_dim) # n x p mean + + U, V = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) + + if !isposdef(U) + println("U not posdef - correcting") + U = posdef_correct(U) + end + if !isposdef(V) + println("V not posdef - correcting") + V = posdef_correct(V) + end + + dist = MatrixNormal(M, U, V) + pd = ParameterDistribution( + Dict( + "distribution" => Parameterized(dist), + "constraint" => repeat([no_constraint()], input_dim * output_dim), + "name" => "xi", + ), + ) + return pd +end + +function hyperparameter_distribution_from_flat( + x::VV, + input_dim::Int, + output_dim::Int, + kernel_structure::NK, +) where {VV <: AbstractVector, NK <: NonseparableKernel} + + C = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) + if !isposdef(C) + println("C not posdef - correcting") + C = posdef_correct(C) + end + dist = MvNormal(zeros(input_dim * output_dim), C) + pd = ParameterDistribution( + Dict( + "distribution" => Parameterized(dist), + "constraint" => repeat([no_constraint()], input_dim * output_dim), + "name" => "xi", + ), + ) + return pd +end + + """ $(DocStringExtensions.TYPEDSIGNATURES) -Builds the random feature method from hyperparameters. We use cosine activation functions and a Matrixvariate Normal distribution with mean M=0, and input(output) covariance U(V) built from cholesky factorization or diagonal components based on the diagonalize_input(diagonalize_output) flag. +Builds the random feature method from hyperparameters. We use cosine activation functions and a Matrixvariate Normal distribution with mean M=0, and input(output) covariance U(V) built with a `CovarianceStructureType`. 
""" function RFM_from_hyperparameters( vrfi::VectorRandomFeatureInterface, @@ -256,31 +295,22 @@ function RFM_from_hyperparameters( MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, MT <: MultithreadType, } - diagonalize_input = get_diagonalize_input(vrfi) - diagonalize_output = get_diagonalize_output(vrfi) - M = zeros(input_dim, output_dim) # n x p mean - U, V = hyperparameters_from_flat(l, input_dim, output_dim, diagonalize_input, diagonalize_output) + xi_hp = l[1:(end - 1)] + sigma_hp = l[end] - if !isposdef(U) - println("U not posdef - correcting") - U = posdef_correct(U) - end - if !isposdef(V) - println("V not posdef - correcting") - V = posdef_correct(V) - end + kernel_structure = get_kernel_structure(vrfi) + pd = hyperparameter_distribution_from_flat(xi_hp, input_dim, output_dim, kernel_structure) - dist = MatrixNormal(M, U, V) - pd = ParameterDistribution( - Dict( - "distribution" => Parameterized(dist), - "constraint" => repeat([no_constraint()], input_dim * output_dim), - "name" => "xi", - ), - ) feature_sampler = RF.Samplers.FeatureSampler(pd, output_dim, rng = rng) - vff = RF.Features.VectorFourierFeature(n_features, output_dim, feature_sampler) + + feature_parameters = Dict("sigma" => sigma_hp) + vff = RF.Features.VectorFourierFeature( + n_features, + output_dim, + feature_sampler, + feature_parameters = feature_parameters, + ) thread_opt = isa(multithread_type, TullioThreading) # if we want to multithread with tullio if isnothing(batch_sizes) @@ -314,10 +344,10 @@ function build_models!( input_dim = size(input_values, 1) output_dim = size(output_values, 1) - # there are less hyperparameters when we diagonalize - diagonalize_input = get_diagonalize_input(vrfi) - diagonalize_output = get_diagonalize_output(vrfi) - n_hp = calculate_n_hyperparameters(input_dim, output_dim, diagonalize_input, diagonalize_output) + + kernel_structure = get_kernel_structure(vrfi) + + n_hp = calculate_n_hyperparameters(input_dim, output_dim, kernel_structure) rfms = get_rfms(vrfi) fitted_features = get_fitted_features(vrfi) @@ -340,12 +370,13 @@ function build_models!( prior = optimizer_options["prior"] rng = get_rng(vrfi) + # where prior space has changed we need to rebuild the priors [TODO just build them here in the first place?] if ndims(prior) > n_hp + # comes from having a truncated output_dimension # TODO not really a truncation here, resetting to default - @info "truncating hyperparameter space to default in first $n_hp dimensions, due to truncation of output space" - prior = constrained_gaussian("cholesky_factors", 1.0, 1.0, 0.0, Inf, repeats = n_hp) - + @info "Original input-output space of dimension ($(get_input_dim(vrfi)), $(get_output_dim(vrfi))) has been truncated to ($(input_dim), $(output_dim)). \n Rebuilding prior... number of hyperparameters reduced from $(ndims(prior)) to $(n_hp)." + prior = build_default_prior(input_dim, output_dim, kernel_structure) end # Optimize feature cholesky factors with EKP @@ -353,12 +384,20 @@ function build_models!( train_fraction = optimizer_options["train_fraction"] n_train = Int(floor(train_fraction * n_data)) # 20% split n_test = n_data - n_train - if diagonalize_output - n_features_opt = min(n_features, Int(floor(4 * n_train))) #we take a specified number of features for optimization. 
MAGIC NUMBER + + # there are less hyperparameters when we diagonalize + if nameof(typeof(kernel_structure)) == :SeparableKernel + if nameof(typeof(get_output_cov_structure(kernel_structure))) == :DiagonalFactor + n_features_opt = min(n_features, Int(floor(4 * n_train))) #we take a specified number of features for optimization. MAGIC NUMBER + else + # note the n_features_opt should NOT exceed output_dim * n_train or the regularization is replaced with a diagonal approx. + n_features_opt = max(min(n_features, Int(floor(4 * sqrt(n_train * output_dim)))), Int(floor(1.9 * n_train)))#we take a specified number of features for optimization. MAGIC NUMBER + end else # note the n_features_opt should NOT exceed output_dim * n_train or the regularization is replaced with a diagonal approx. n_features_opt = max(min(n_features, Int(floor(4 * sqrt(n_train * output_dim)))), Int(floor(1.9 * n_train)))#we take a specified number of features for optimization. MAGIC NUMBER end + @info ( "hyperparameter learning using $n_train training points, $n_test validation points and $n_features_opt features" ) @@ -368,10 +407,11 @@ function build_models!( # in this setting, noise = I if regularization_matrix === nothing regularization = I + else reg_mat = regularization_matrix if !isposdef(regularization_matrix) - reg_mat = posdef_correct(regularization_matrix) + regularization = posdef_correct(regularization_matrix) println("RF regularization matrix is not positive definite, correcting") else @@ -382,7 +422,6 @@ function build_models!( end end - idx_shuffle = randperm(rng, n_data) io_pairs_opt = PairedDataContainer( @@ -394,12 +433,16 @@ function build_models!( μ_hp = transform_unconstrained_to_constrained(prior, mean(prior)) cov_sample_multiplier = optimizer_options["cov_sample_multiplier"] - if diagonalize_output - n_cov_samples_min = n_test + 2 # diagonal case + if nameof(typeof(kernel_structure)) == :SeparableKernel + if nameof(typeof(get_output_cov_structure(kernel_structure))) == :DiagonalFactor + n_cov_samples_min = n_test + 2 # diagonal case + else + n_cov_samples_min = (n_test * output_dim + 2) + end else - n_cov_samples_min = (n_test * output_dim + 2) # min required for + n_cov_samples_min = (n_test * output_dim + 2) end - n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 1.0))) + n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 0.0))) internal_Γ, approx_σ2 = estimate_mean_and_coeffnorm_covariance( vrfi, @@ -420,13 +463,15 @@ function build_models!( if tikhonov_opt_val == 0 # Build the covariance Γ = internal_Γ - Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += approx_σ2 + Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += regularization # + approx_σ2 Γ[(n_test * output_dim + 1):end, (n_test * output_dim + 1):end] += I #in diag case we have data logdet = λ^m, in non diag case we have logdet(Λ^) to match the different reg matrices. min_complexity = isa(regularization, UniformScaling) ? 
n_features_opt * log(regularization.λ) : n_features_opt / output_dim * 2 * sum(log.(diag(cholesky(regularization).L))) + min_complexity = sqrt(abs(min_complexity)) + data = vcat(reshape(get_outputs(io_pairs_opt)[:, (n_train + 1):end], :, 1), 0.0, min_complexity) #flatten data @@ -435,7 +480,7 @@ function build_models!( outsize = size(internal_Γ, 1) Γ = zeros(outsize + n_hp, outsize + n_hp) Γ[1:outsize, 1:outsize] = internal_Γ - Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += approx_σ2 + Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += approx_σ2 + regularization Γ[(n_test * output_dim + 1):outsize, (n_test * output_dim + 1):outsize] += I Γ[(outsize + 1):end, (outsize + 1):end] = tikhonov_opt_val .* cov(prior) @@ -444,6 +489,7 @@ function build_models!( min_complexity = isa(regularization, UniformScaling) ? n_features_opt * log(regularization.λ) : n_features_opt / output_dim * 2 * sum(log.(diag(cholesky(regularization).L))) + min_complexity = sqrt(abs(min_complexity)) data = vcat( reshape(get_outputs(io_pairs_opt)[:, (n_train + 1):end], :, 1), @@ -464,6 +510,7 @@ function build_models!( n_iteration = optimizer_options["n_iteration"] opt_verbose_flag = optimizer_options["verbose"] scheduler = optimizer_options["scheduler"] + localization = optimizer_options["localization"] if !isposdef(Γ) Γ = posdef_correct(Γ) end @@ -478,6 +525,7 @@ function build_models!( scheduler = scheduler, rng = rng, verbose = opt_verbose_flag, + localization_method = localization, ) err = zeros(n_iteration) @@ -527,6 +575,7 @@ function build_models!( # [5.] extract optimal hyperparameters hp_optimal = get_ϕ_mean_final(prior, ekiobj)[:] + # Now, fit new RF model with the optimized hyperparameters if opt_verbose_flag names = get_name(prior) @@ -538,6 +587,7 @@ function build_models!( pcic = [(pci_constrained[1][i], pci_constrained[2][i]) for i in 1:length(pci_constrained[1])] pcic_batched = [pcic[b][1] for b in batch(prior)] @info("EKI Optimization result:") + println( display( [ @@ -547,6 +597,7 @@ function build_models!( ), ) end + rfm_tmp = RFM_from_hyperparameters( vrfi, rng, @@ -596,7 +647,8 @@ function predict(vrfi::VectorRandomFeatureInterface, new_inputs::M) where {M <: N_samples = size(new_inputs, 2) # Predicts columns of inputs: input_dim × N_samples - μ, σ2 = RF.Methods.predict(rfm, ff, DataContainer(new_inputs), tullio_threading = tullio_threading) + μ, σ2 = RF.Methods.predict(rfm, ff, DataContainer(new_inputs), tullio_threading = "tullio") + # μ, σ2 = RF.Methods.predict(rfm, ff, DataContainer(new_inputs), tullio_threading = tullio_threading) # sizes (output_dim x n_test), (output_dim x output_dim x n_test) # add the noise contribution from the regularization # note this is because we are predicting the data here, not the latent function. @@ -608,6 +660,5 @@ function predict(vrfi::VectorRandomFeatureInterface, new_inputs::M) where {M <: σ2[:, :, i] = posdef_correct(σ2[:, :, i]) end end - return μ, σ2 end diff --git a/test/RandomFeature/runtests.jl b/test/RandomFeature/runtests.jl index b5c04d0f9..5a7a45b89 100644 --- a/test/RandomFeature/runtests.jl +++ b/test/RandomFeature/runtests.jl @@ -14,40 +14,137 @@ rng = Random.MersenneTwister(seed) @testset "RandomFeatures" begin + @testset "hyperparameter prior interface" begin + + # [1. 
] CovarianceStructures + # Costruction + eps = 1e-4 + r = 3 + odf = OneDimFactor() + @test typeof(odf) <: CovarianceStructureType + df = DiagonalFactor(eps) + @test typeof(df) <: CovarianceStructureType + @test get_eps(df) == eps + @test get_eps(DiagonalFactor()) == Float64(1.0) + cf = CholeskyFactor(eps) + @test typeof(cf) <: CovarianceStructureType + @test get_eps(cf) == eps + @test get_eps(CholeskyFactor()) == Float64(1.0) + lrf = LowRankFactor(r, eps) + @test typeof(lrf) <: CovarianceStructureType + @test rank(lrf) == r + @test get_eps(lrf) == eps + @test get_eps(LowRankFactor(r)) == Float64(1.0) + hlrf = HierarchicalLowRankFactor(r, eps) + @test typeof(hlrf) <: CovarianceStructureType + @test rank(hlrf) == r + @test get_eps(hlrf) == eps + @test get_eps(HierarchicalLowRankFactor(r)) == Float64(1.0) + + # calculate_n_hyperparameters + d = 6 + @test calculate_n_hyperparameters(d, odf) == 0 + @test calculate_n_hyperparameters(d, df) == d + @test calculate_n_hyperparameters(d, cf) == Int(d * (d + 1) / 2) + @test calculate_n_hyperparameters(d, lrf) == Int(r + d * r) + @test calculate_n_hyperparameters(d, hlrf) == Int(r * (r + 1) / 2 + d * r) + + # hyperparameters_from_flat - only check shape + @test isnothing(hyperparameters_from_flat([1], odf)) + for factor in (df, cf, lrf, hlrf) + x = ones(calculate_n_hyperparameters(d, factor)) + @test size(hyperparameters_from_flat(x, factor)) == (d, d) + end + + # build_default_prior - only check size of distribution + name = "test_name" + @test isnothing(build_default_prior(name, 0, odf)) + for factor in (df, cf, lrf, hlrf) + n_hp = calculate_n_hyperparameters(d, factor) + prior = build_default_prior(name, n_hp, factor) + @test ndims(prior) == n_hp + end + + # [2. ] Kernel Structures + d = 6 + p = 3 + c_in = lrf + c_out = cf + k_sep = SeparableKernel(c_in, c_out) + @test get_input_cov_structure(k_sep) == c_in + @test get_output_cov_structure(k_sep) == c_out + + k_nonsep = NonseparableKernel(c_in) + @test get_cov_structure(k_nonsep) == c_in + + + # calculate_n_hyperparameters + @test calculate_n_hyperparameters(d, p, k_sep) == + calculate_n_hyperparameters(d, c_in) + calculate_n_hyperparameters(p, c_out) + 1 + @test calculate_n_hyperparameters(d, p, k_nonsep) == calculate_n_hyperparameters(d * p, c_in) + 1 + k_sep1d = SeparableKernel(odf, odf) + k_nonsep1d = NonseparableKernel(odf) + @test calculate_n_hyperparameters(1, 1, k_sep1d) == 2 + @test calculate_n_hyperparameters(1, 1, k_nonsep1d) == 2 + + # hyper_parameters_from_flat: not applied with scaling hyperparameter + x = ones(calculate_n_hyperparameters(d, c_in) + calculate_n_hyperparameters(p, c_out)) + @test size(hyperparameters_from_flat(x, d, p, k_sep)[1]) == (d, d) + @test size(hyperparameters_from_flat(x, d, p, k_sep)[2]) == (p, p) + + x = ones(calculate_n_hyperparameters(d * p, c_in)) + @test size(hyperparameters_from_flat(x, d, p, k_nonsep)) == (d * p, d * p) + + x = [1] # in the 1D case, return(U,V) = (1x1 matrix, nothing) + @test size(hyperparameters_from_flat(x, 1, 1, k_sep1d)[1]) == (1, 1) + @test isnothing(hyperparameters_from_flat(x, 1, 1, k_sep1d)[2]) + @test size(hyperparameters_from_flat(x, 1, 1, k_nonsep1d)) == (1, 1) + + # build_default_prior + @test ndims(build_default_prior(d, p, k_sep)) == + ndims(build_default_prior("input", calculate_n_hyperparameters(d, c_in), c_in)) + + ndims(build_default_prior("output", calculate_n_hyperparameters(p, c_out), c_out)) + + 1 + @test ndims(build_default_prior(d, p, k_nonsep)) == + ndims(build_default_prior("full", calculate_n_hyperparameters(d 
+
+    @test ndims(build_default_prior(1, 1, k_sep1d)) == 2
+    @test ndims(build_default_prior(1, 1, k_nonsep1d)) == 2
+
+end
+
 @testset "ScalarRandomFeatureInterface" begin
     input_dim = 2
     n_features = 200
     batch_sizes = Dict("train" => 100, "test" => 100, "feature" => 100)
-    diagonalize_input = false
     #build interface
-    n_input_hp = Int(input_dim * (input_dim + 1) / 2) + 2 # see calculate_n_hyperparameters for details
-    n_output_hp = 1
+
     # prior built from:
-    # input: γ₁*(Cholesky_factor + γ₂ * I )
-    # output: γ₃
-    # where γᵢ positive
-    prior = build_default_prior(input_dim, diagonalize_input)
+    eps = 1e-8
+    kernel_structure = SeparableKernel(CholeskyFactor(eps), OneDimFactor()) # Cholesky factorized input, 1D output
+    prior = build_default_prior(input_dim, kernel_structure)
     optimizer_options = Dict(
         "prior" => prior,
         "n_ensemble" => max(ndims(prior) + 1, 10),
         "n_iteration" => 5,
+        "scheduler" => DataMisfitController(),
         "cov_sample_multiplier" => 10.0,
         "inflation" => 1e-4,
         "train_fraction" => 0.8,
         "multithread" => "ensemble",
         "verbose" => false,
-        "scheduler" => DataMisfitController(),
     )
     srfi = ScalarRandomFeatureInterface(
         n_features,
         input_dim,
+        kernel_structure = kernel_structure,
         batch_sizes = batch_sizes,
         rng = rng,
-        diagonalize_input = diagonalize_input,
         optimizer_options = optimizer_options,
     )
@@ -57,17 +154,19 @@ rng = Random.MersenneTwister(seed)
     @test get_n_features(srfi) == n_features
     @test get_input_dim(srfi) == input_dim
     @test get_rng(srfi) == rng
-    @test get_diagonalize_input(srfi) == diagonalize_input
+    @test get_kernel_structure(srfi) == kernel_structure
     @test get_optimizer_options(srfi) == optimizer_options

     # check defaults
     srfi2 = ScalarRandomFeatureInterface(n_features, input_dim)
     @test get_batch_sizes(srfi2) === nothing
     @test get_rng(srfi2) == Random.GLOBAL_RNG
-    @test get_diagonalize_input(srfi2) == false
-    # currently the "scheduler" doesn't always satisfy X() = X(), bug so we need to remove this for now
+    @test get_kernel_structure(srfi2) ==
+          SeparableKernel(cov_structure_from_string("lowrank", input_dim), OneDimFactor())
+
+    # currently the "scheduler" doesn't always satisfy X() == X() (a bug), so we need to remove this key for now
     for key in keys(optimizer_options)
-        if !(key == "scheduler")
+        if !(key ∈ ["scheduler", "prior", "n_ensemble"])
             @test get_optimizer_options(srfi2)[key] == optimizer_options[key] # we just set the defaults above
         end
     end
@@ -81,29 +180,25 @@ rng = Random.MersenneTwister(seed)
     output_dim = 3
     n_features = 200
     batch_sizes = Dict("train" => 100, "test" => 100, "feature" => 100)
-    diagonalize_input = true
-    diagonalize_output = false
-
-    n_input_hp = Int(input_dim) + 2
-    n_output_hp = Int(output_dim * (output_dim + 1) / 2) + 2
     # prior built from:
-    # input: γ₁*(Cholesky_factor_in + γ₂ * I)
-    # output: γ₃*(Cholesky_factor_out + γ₄ * I)
-    # where γᵢ positive
-    prior = build_default_prior(input_dim, output_dim, diagonalize_input, diagonalize_output)
+    eps = 1e-8
+    kernel_structure = SeparableKernel(DiagonalFactor(eps), CholeskyFactor(eps)) # Diagonal input, Cholesky factorized output
+    prior = build_default_prior(input_dim, output_dim, kernel_structure)
+
     optimizer_options = Dict(
         "prior" => prior,
         "n_ensemble" => max(ndims(prior) + 1, 10),
         "n_iteration" => 5,
-        "tikhonov" => 0,
+        "scheduler" => DataMisfitController(),
         "cov_sample_multiplier" => 10.0,
+        "tikhonov" => 0,
         "inflation" => 1e-4,
         "train_fraction" => 0.8,
         "multithread" => "ensemble",
         "verbose" => false,
-        "scheduler" => DataMisfitController(),
+        "localization" => EnsembleKalmanProcesses.Localizers.NoLocalization(),
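+        # (passed through to the EnsembleKalmanProcesses optimizer; NoLocalization turns localization off)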
     )
     #build interfaces
@@ -111,10 +206,9 @@ rng = Random.MersenneTwister(seed)
         n_features,
         input_dim,
         output_dim,
+        kernel_structure = kernel_structure,
         rng = rng,
         batch_sizes = batch_sizes,
-        diagonalize_input = diagonalize_input,
-        diagonalize_output = diagonalize_output,
         optimizer_options = optimizer_options,
     )
@@ -124,21 +218,22 @@ rng = Random.MersenneTwister(seed)
     @test get_n_features(vrfi) == n_features
     @test get_input_dim(vrfi) == input_dim
     @test get_output_dim(vrfi) == output_dim
+    @test get_kernel_structure(vrfi) == kernel_structure
     @test get_rng(vrfi) == rng
-    @test get_diagonalize_input(vrfi) == diagonalize_input
-    @test get_diagonalize_output(vrfi) == diagonalize_output
     @test get_optimizer_options(vrfi) == optimizer_options

     #check defaults
-    vrfi2 = VectorRandomFeatureInterface(n_features, input_dim, output_dim, diagonalize_input = diagonalize_input)
+    vrfi2 = VectorRandomFeatureInterface(n_features, input_dim, output_dim)

     @test get_batch_sizes(vrfi2) === nothing
     @test get_rng(vrfi2) == Random.GLOBAL_RNG
-    @test get_diagonalize_input(vrfi2) == true
-    @test get_diagonalize_output(vrfi2) == false
+    @test get_kernel_structure(vrfi2) == SeparableKernel(
+        cov_structure_from_string("lowrank", input_dim),
+        cov_structure_from_string("lowrank", output_dim),
+    )
+
     for key in keys(optimizer_options)
-        if !(key == "scheduler")
+        if !(key ∈ ["scheduler", "prior", "n_ensemble"])
             @test get_optimizer_options(vrfi2)[key] == optimizer_options[key] # we just set the defaults above
         end
     end
@@ -162,18 +257,14 @@ rng = Random.MersenneTwister(seed)

     # RF parameters
     n_features = 100
-    diagonalize_input = true
+    eps = 1e-8
+    scalar_ks = SeparableKernel(DiagonalFactor(eps), OneDimFactor()) # Diagonalize input (ARD-type kernel)
+    vector_ks = SeparableKernel(DiagonalFactor(eps), CholeskyFactor()) # Diagonalize input (ARD-type kernel), Cholesky output

     # Scalar RF options to mimic squared-exp ARD kernel
-    srfi = ScalarRandomFeatureInterface(n_features, input_dim, diagonalize_input = diagonalize_input, rng = rng)
+    srfi = ScalarRandomFeatureInterface(n_features, input_dim, kernel_structure = scalar_ks, rng = rng)

     # Vector RF options to mimic squared-exp ARD kernel (in 1D)
-    vrfi = VectorRandomFeatureInterface(
-        n_features,
-        input_dim,
-        output_dim,
-        diagonalize_input = diagonalize_input,
-        rng = rng,
-    )
+    vrfi = VectorRandomFeatureInterface(n_features, input_dim, output_dim, kernel_structure = vector_ks, rng = rng)

     # build emulators
     em_srfi = Emulator(srfi, iopairs, obs_noise_cov = obs_noise_cov)
@@ -231,28 +322,35 @@ rng = Random.MersenneTwister(seed)
     # 1) scalar + diag in
     # 2) scalar
     # 3) vector + diag out
-    # 4) vector
-
-    srfi_diagin = ScalarRandomFeatureInterface(n_features, input_dim, diagonalize_input = true, rng = rng)
-    srfi = ScalarRandomFeatureInterface(n_features, input_dim, diagonalize_input = false, rng = rng)
+    # 4) vector
+    # 5) vector nonseparable
+    eps = 1e-8
+    r = 1
+    scalar_diagin_ks = SeparableKernel(DiagonalFactor(eps), OneDimFactor())
+    scalar_ks = SeparableKernel(CholeskyFactor(eps), OneDimFactor())
+    vector_diagout_ks = SeparableKernel(CholeskyFactor(eps), DiagonalFactor(eps))
+    vector_ks = SeparableKernel(LowRankFactor(r, eps), HierarchicalLowRankFactor(r, eps))
+    vector_nonsep_ks = NonseparableKernel(LowRankFactor(r + 1, eps))
+
+    srfi_diagin =
+        ScalarRandomFeatureInterface(n_features, input_dim, kernel_structure = scalar_diagin_ks, rng = rng)
+    srfi = ScalarRandomFeatureInterface(n_features, input_dim, kernel_structure = scalar_ks, rng = rng)

     vrfi_diagout = VectorRandomFeatureInterface(
         n_features,
         input_dim,
         output_dim,
-        diagonalize_input = false,
-        diagonalize_output = true,
-        optimizer_options = Dict("train_fraction" => 0.7),
+        kernel_structure = vector_diagout_ks,
         rng = rng,
     )

-    vrfi = VectorRandomFeatureInterface(
+    vrfi = VectorRandomFeatureInterface(n_features, input_dim, output_dim, kernel_structure = vector_ks, rng = rng)
+
+    vrfi_nonsep = VectorRandomFeatureInterface(
         n_features,
         input_dim,
         output_dim,
-        diagonalize_input = false,
-        diagonalize_output = false,
-        optimizer_options = Dict("train_fraction" => 0.7),
+        kernel_structure = vector_nonsep_ks,
         rng = rng,
     )

@@ -266,6 +364,7 @@ rng = Random.MersenneTwister(seed)

     em_vrfi_svd = Emulator(vrfi, iopairs, obs_noise_cov = Σ)
     em_vrfi = Emulator(vrfi, iopairs, obs_noise_cov = Σ, decorrelate = false)
+    em_vrfi_nonsep = Emulator(vrfi_nonsep, iopairs, obs_noise_cov = Σ, decorrelate = false)

     #TODO truncated SVD option for vector (involves resizing prior)

@@ -281,21 +380,22 @@ rng = Random.MersenneTwister(seed)
     μ_vsd, σ²_vsd = Emulators.predict(em_vrfi_svd_diagout, new_inputs, transform_to_real = true)
     μ_vs, σ²_vs = Emulators.predict(em_vrfi_svd, new_inputs, transform_to_real = true)
     μ_v, σ²_v = Emulators.predict(em_vrfi, new_inputs)
+    μ_vns, σ²_vns = Emulators.predict(em_vrfi_nonsep, new_inputs)

     tol_μ = 0.1 * ntest * output_dim
     @test isapprox.(norm(μ_ssd - outx), 0, atol = tol_μ)
     @test isapprox.(norm(μ_ss - outx), 0, atol = tol_μ)
     @test isapprox.(norm(μ_vsd - outx), 0, atol = tol_μ)
     @test isapprox.(norm(μ_vs - outx), 0, atol = tol_μ)
-    @test isapprox.(norm(μ_v - outx), 0, atol = 5 * tol_μ) # a more approximate option so likely less good approx
+    @test isapprox.(norm(μ_v - outx), 0, atol = 5 * tol_μ) # more approximate option, so a looser tolerance is used
+    @test isapprox.(norm(μ_vns - outx), 0, atol = 5 * tol_μ) # more approximate option, so a looser tolerance is used

     # An example with the other threading option
     vrfi_tul = VectorRandomFeatureInterface(
         n_features,
         input_dim,
         output_dim,
-        diagonalize_input = false,
-        diagonalize_output = false,
+        kernel_structure = vector_ks,
         optimizer_options = Dict("train_fraction" => 0.7, "multithread" => "tullio"),
         rng = rng,
     )
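+    # (the "tullio" multithread option threads the Tullio-based feature computations,
+    #  instead of threading over ensemble members as the "ensemble" option used above does)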