From 9cb5e89fea944a16a1223c2899ad9cf9e3527a8b Mon Sep 17 00:00:00 2001
From: odunbar
Date: Tue, 4 Jul 2023 15:09:43 -0700
Subject: [PATCH] add Ledoit Wolf shrinkage estimator for cross-validation covariance

different priors
change prior again
add reg into cov-mat for optimization
example tweaks
add reg to scalar method and widen default prior
fix lorenz
fix lorenz
major refactor of RF prior cov
fix lorenz
format
refactored user interface
examples work with new interface
format
resolve merge
format
bug-fix n_hp calc
updates to prior - remove scalings
src for new kernel structure
updated Emulator and Lorenz example with Kernel interface
bugfix symbol checks
Cholesky factorization of K
remove unnecessary warning
needed branch in if-else
error message typo
better nugget term for LRF
sensible results for RF emulator exmaple
bypass JLD2 errors
make low-rank into more of a diagonal decomp
bugfix scatter bounds
added learnable scaling
Lorenz 63 example learning update
map gets l63 pdf
moved L63 into Emulator examples
remove duplicate svd
rebalance loss function (sqrt complexity, zero out scaled coeffs, remove approx_sig2 from noise), greatly improved robustness
example tweaks and fixes for plotting, convergence etc.
clima format
tests pass
clima format
change plots to makie, and small changes
change plots to makie, and small changes
format
docs fix
---
 docs/src/API/RandomFeatures.md | 2 +-
 examples/EDMF_data/Project.toml | 2 +
 examples/EDMF_data/plot_posterior.jl | 23 +-
 examples/EDMF_data/uq_for_edmf.jl | 127 ++++-
 examples/Emulator/GaussianProcess/plot_GP.jl | 3 +-
 examples/Emulator/L63/Project.toml | 9 +
 examples/Emulator/L63/emulate.jl | 207 +++++++
 .../scalar_optimize_and_plot_RF.jl | 397 ++++++-------
 .../vector_optimize_and_plot_RF.jl | 65 ++-
 examples/GCM/Project.toml | 1 +
 examples/GCM/emulate_sample_script.jl | 153 ++++-
 examples/GCM/sbatch_emulate_sample_jl | 2 +-
 examples/Lorenz/GModel_common.jl | 5 +-
 examples/Lorenz/calibrate.jl | 113 ++--
 examples/Lorenz/emulate_sample.jl | 111 ++--
 src/Emulator.jl | 21 +-
 src/RandomFeature.jl | 521 ++++++++++++------
 src/ScalarRandomFeature.jl | 137 +++--
 src/VectorRandomFeature.jl | 213 ++++---
 test/RandomFeature/runtests.jl | 212 +++++--
 20 files changed, 1642 insertions(+), 682 deletions(-)
 create mode 100644 examples/Emulator/L63/Project.toml
 create mode 100644 examples/Emulator/L63/emulate.jl

diff --git a/docs/src/API/RandomFeatures.md b/docs/src/API/RandomFeatures.md
index b5f5956bc..2baffd992 100644
--- a/docs/src/API/RandomFeatures.md
+++ b/docs/src/API/RandomFeatures.md
@@ -31,7 +31,7 @@ get_n_features
 get_input_dim
 get_output_dim
 get_rng
-get_diagonalize_input
+get_kernel_structure
 get_feature_decomposition
 get_optimizer_options
 optimize_hyperparameters!(::ScalarRandomFeatureInterface)
diff --git a/examples/EDMF_data/Project.toml b/examples/EDMF_data/Project.toml
index a865fa8be..2936bdfbe 100644
--- a/examples/EDMF_data/Project.toml
+++ b/examples/EDMF_data/Project.toml
@@ -1,9 +1,11 @@
 [deps]
+CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 CalibrateEmulateSample = "95e48a1f-0bec-4818-9538-3db4340308e3"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
+PairPlots = "43a3c2be-4208-490b-832a-a21dcd55d7da"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
diff --git
a/examples/EDMF_data/plot_posterior.jl b/examples/EDMF_data/plot_posterior.jl index 5173889fb..9ba36b414 100644 --- a/examples/EDMF_data/plot_posterior.jl +++ b/examples/EDMF_data/plot_posterior.jl @@ -1,5 +1,8 @@ # Import modules -using StatsPlots +ENV["GKSwstype"] = "100" +using Plots + +using CairoMakie, PairPlots using JLD2 using Dates @@ -11,11 +14,11 @@ using CalibrateEmulateSample.ParameterDistributions # 2-parameter calibration exp exp_name = "ent-det-calibration" -date_of_run = Date(2022, 7, 15) +date_of_run = Date(2023, 10, 5) # 5-parameter calibration exp #exp_name = "ent-det-tked-tkee-stab-calibration" -#date_of_run = Date(2022,7,14) +#date_of_run = Date(2023,10,4) # Output figure read/write directory figure_save_directory = joinpath(@__DIR__, "output", exp_name, string(date_of_run)) @@ -24,7 +27,7 @@ data_save_directory = joinpath(@__DIR__, "output", exp_name, string(date_of_run) # load posterior_filepath = joinpath(data_save_directory, "posterior.jld2") if !isfile(posterior_filepath) - LoadError(posterior_filepath * " not found. Please check experiment name and date") + throw(ArgumentError(posterior_filepath * " not found. Please check experiment name and date")) else println("Loading posterior distribution from: " * posterior_filepath) posterior = load(posterior_filepath)["posterior"] @@ -40,10 +43,10 @@ density_filepath = joinpath(figure_save_directory, "posterior_dist_comp.png") transformed_density_filepath = joinpath(figure_save_directory, "posterior_dist_phys.png") labels = get_name(posterior) -gr(dpi = 300, size = (nparam_plots * 300, nparam_plots * 300)) -# lower triangular marginal cornerplot -p = cornerplot(permutedims(posterior_samples, (2, 1)), label = labels, compact = true) -trans_p = cornerplot(permutedims(transformed_posterior_samples, (2, 1)), label = labels, compact = true) +data = (; [(Symbol(labels[i]), posterior_samples[i, :]) for i in 1:length(labels)]...) +transformed_data = (; [(Symbol(labels[i]), transformed_posterior_samples[i, :]) for i in 1:length(labels)]...) 
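# Annotation (not part of the patch): this hunk swaps the StatsPlots cornerplot for
# PairPlots.pairplot, which takes a table-like object such as the NamedTuples of posterior
# samples built above. A minimal standalone sketch of the same pattern, with made-up
# parameter names and sample counts, would be:
#
#   using CairoMakie, PairPlots
#   samples = (; entrainment_factor = randn(1000), detrainment_factor = randn(1000))
#   fig = pairplot(samples => (PairPlots.Scatter(),))  # lower-triangular scatter corner plot
#   save("corner_example.png", fig)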
-savefig(p, density_filepath) -savefig(trans_p, transformed_density_filepath) +p = pairplot(data => (PairPlots.Scatter(),)) +trans_p = pairplot(transformed_data => (PairPlots.Scatter(),)) +save(density_filepath, p) +save(transformed_density_filepath, trans_p) diff --git a/examples/EDMF_data/uq_for_edmf.jl b/examples/EDMF_data/uq_for_edmf.jl index c9da771d5..7c5a1ac0d 100644 --- a/examples/EDMF_data/uq_for_edmf.jl +++ b/examples/EDMF_data/uq_for_edmf.jl @@ -1,9 +1,10 @@ #include(joinpath(@__DIR__, "..", "ci", "linkfig.jl")) -PLOT_FLAG = true +PLOT_FLAG = false # Import modules using Distributions # probability distributions and associated functions using LinearAlgebra +ENV["GKSwstype"] = "100" using Plots using Random using JLD2 @@ -17,6 +18,7 @@ using CalibrateEmulateSample.MarkovChainMonteCarlo using CalibrateEmulateSample.ParameterDistributions using CalibrateEmulateSample.DataContainers using CalibrateEmulateSample.EnsembleKalmanProcesses +using CalibrateEmulateSample.EnsembleKalmanProcesses.Localizers using CalibrateEmulateSample.Utilities rng_seed = 42424242 @@ -126,6 +128,7 @@ function main() end # load and create prior distributions + #= prior_filepath = joinpath(exp_dir, "prior.jld2") if !isfile(prior_filepath) LoadError("prior file \"prior.jld2\" not found in directory \"" * exp_dir * "/\"") @@ -133,6 +136,47 @@ function main() prior_dict_raw = load(prior_filepath) #using JLD2 prior = prior_dict_raw["prior"] end + =# + # build prior if jld2 does not work + function get_prior_config(s::SS) where {SS <: AbstractString} + config = Dict() + if s == "ent-det-calibration" + config["constraints"] = + Dict("entrainment_factor" => [bounded(0.0, 1.0)], "detrainment_factor" => [bounded(0.0, 1.0)]) + config["prior_mean"] = Dict("entrainment_factor" => 0.13, "detrainment_factor" => 0.51) + config["unconstrained_σ"] = 1.0 + elseif s == "ent-det-tked-tkee-stab-calibration" + config["constraints"] = Dict( + "entrainment_factor" => [bounded(0.0, 1.0)], + "detrainment_factor" => [bounded(0.0, 1.0)], + "tke_ed_coeff" => [bounded(0.01, 1.0)], + "tke_diss_coeff" => [bounded(0.01, 1.0)], + "static_stab_coeff" => [bounded(0.01, 1.0)], + ) + config["prior_mean"] = Dict( + "entrainment_factor" => 0.13, + "detrainment_factor" => 0.51, + "tke_ed_coeff" => 0.14, + "tke_diss_coeff" => 0.22, + "static_stab_coeff" => 0.4, + ) + config["unconstrained_σ"] = 1.0 + else + throw(ArgumentError("prior for experiment $s not found, please implement in uq_for_edmf.jl")) + end + return config + end + prior_config = get_prior_config(exp_name) + means = prior_config["prior_mean"] + std = prior_config["unconstrained_σ"] + constraints = prior_config["constraints"] + + + prior = combine_distributions([ + ParameterDistribution( + Dict("name" => name, "distribution" => Parameterized(Normal(mean, std)), "constraint" => constraints[name]), + ) for (name, mean) in means + ]) # Option (ii) load EKP object # max_ekp_it = 10 # use highest available iteration file @@ -145,7 +189,7 @@ function main() # end # input_output_pairs = Utilities.get_training_points(ekpobj, max_ekp_it) - println("Completed calibration stage") + println("Completed calibration loading stage") println(" ") ############################################## # [3. 
] Build Emulator from calibration data # @@ -153,19 +197,82 @@ function main() println("Begin Emulation stage") # Create GP object - gppackage = Emulators.SKLJL() - pred_type = Emulators.YType() - gauss_proc = GaussianProcess( - gppackage; - kernel = nothing, # use default squared exponential kernel - prediction_type = pred_type, - noise_learn = false, + cases = [ + "GP", # diagonalize, train scalar GP, assume diag inputs + "RF-scalar", # diagonalize, train scalar RF, don't asume diag inputs + "RF-vector-svd-diag", + "RF-vector-svd-nondiag", + "RF-vector-svd-nonsep", + ] + case = cases[5] + + overrides = Dict( + "verbose" => true, + "train_fraction" => 0.95, + "scheduler" => DataMisfitController(terminate_at = 100), + "cov_sample_multiplier" => 0.5, + "n_iteration" => 5, + # "n_ensemble" => 20, + # "localization" => SEC(0.1), # localization / sample error correction for small ensembles ) + nugget = 0.01 + rng_seed = 99330 + rng = Random.MersenneTwister(rng_seed) + input_dim = size(get_inputs(input_output_pairs), 1) + output_dim = size(get_outputs(input_output_pairs), 1) + if case == "GP" + + gppackage = Emulators.SKLJL() + pred_type = Emulators.YType() + mlt = GaussianProcess( + gppackage; + kernel = nothing, # use default squared exponential kernel + prediction_type = pred_type, + noise_learn = false, + ) + elseif case ∈ ["RF-scalar"] + n_features = 100 + kernel_structure = SeparableKernel(CholeskyFactor(nugget), OneDimFactor()) + mlt = ScalarRandomFeatureInterface( + n_features, + input_dim, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + elseif case ∈ ["RF-vector-svd-diag", "RF-vector-svd-nondiag"] + # do we want to assume that the outputs are decorrelated in the machine-learning problem? + kernel_structure = + case ∈ ["RF-vector-svd-diag"] ? 
SeparableKernel(LowRankFactor(1, nugget), DiagonalFactor(nugget)) : + SeparableKernel(LowRankFactor(2, nugget), LowRankFactor(2, nugget)) + n_features = 500 + + mlt = VectorRandomFeatureInterface( + n_features, + input_dim, + output_dim, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + elseif case ∈ ["RF-vector-svd-nonsep"] + kernel_structure = NonseparableKernel(LowRankFactor(3, nugget)) + n_features = 500 + + mlt = VectorRandomFeatureInterface( + n_features, + input_dim, + output_dim, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + end # Fit an emulator to the data normalized = true - emulator = Emulator(gauss_proc, input_output_pairs; obs_noise_cov = truth_cov, normalize_inputs = normalized) + emulator = Emulator(mlt, input_output_pairs; obs_noise_cov = truth_cov, normalize_inputs = normalized) # Optimize the GP hyperparameters for better fit optimize_hyperparameters!(emulator) diff --git a/examples/Emulator/GaussianProcess/plot_GP.jl b/examples/Emulator/GaussianProcess/plot_GP.jl index a0ee9751f..721382ce7 100644 --- a/examples/Emulator/GaussianProcess/plot_GP.jl +++ b/examples/Emulator/GaussianProcess/plot_GP.jl @@ -11,6 +11,7 @@ using CalibrateEmulateSample.DataContainers plot_flag = true if plot_flag + ENV["GKSwstype"] = "100" using Plots gr(size = (1500, 700)) Plots.scalefontsizes(1.3) @@ -78,7 +79,7 @@ gaussian_process = GaussianProcess(gppackage, noise_learn = false) # The observables y are related to the parameters x by: # y = G(x1, x2) + η, # where G(x1, x2) := [sin(x1) + cos(x2), sin(x1) - cos(x2)], and η ~ N(0, Σ) -n = 100 # number of training points +n = 150 # number of training points p = 2 # input dim d = 2 # output dim diff --git a/examples/Emulator/L63/Project.toml b/examples/Emulator/L63/Project.toml new file mode 100644 index 000000000..e2c9baa08 --- /dev/null +++ b/examples/Emulator/L63/Project.toml @@ -0,0 +1,9 @@ +[deps] +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" +CalibrateEmulateSample = "95e48a1f-0bec-4818-9538-3db4340308e3" +ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/examples/Emulator/L63/emulate.jl b/examples/Emulator/L63/emulate.jl new file mode 100644 index 000000000..289314295 --- /dev/null +++ b/examples/Emulator/L63/emulate.jl @@ -0,0 +1,207 @@ +using OrdinaryDiffEq +using Random, Distributions, LinearAlgebra +ENV["GKSwstype"] = "100" +using CairoMakie, ColorSchemes #for plots +using JLD2 + +# CES +using CalibrateEmulateSample.Emulators +using CalibrateEmulateSample.ParameterDistributions +using CalibrateEmulateSample.EnsembleKalmanProcesses +using CalibrateEmulateSample.DataContainers + + +function lorenz(du, u, p, t) + du[1] = 10.0 * (u[2] - u[1]) + du[2] = u[1] * (28.0 - u[3]) - u[2] + du[3] = u[1] * u[2] - (8 / 3) * u[3] +end + +function main() + + # rng + rng = MersenneTwister(1232434) + + # Run L63 from 0 -> tmax + u0 = [1.0; 0.0; 0.0] + tmax = 20 + dt = 0.01 + tspan = (0.0, tmax) + prob = ODEProblem(lorenz, u0, tspan) + sol = solve(prob, Euler(), dt = dt) + + # Run L63 from end for test trajetory data + tmax_test = 100 + tspan_test = (0.0, tmax_test) + u0_test = sol.u[end] + prob_test = ODEProblem(lorenz, u0_test, tspan_test) + sol_test = solve(prob_test, Euler(), dt = dt) 
+ + # Run L63 from end for histogram matching data + tmax_hist = 1000 + tspan_hist = (0.0, tmax_hist) + u0_hist = sol_test.u[end] + prob_hist = ODEProblem(lorenz, u0_hist, tspan_hist) + sol_hist = solve(prob_hist, Euler(), dt = dt) + + # Create training pairs (with noise) from subsampling [burnin,tmax] + tburn = 10 + burnin = Int(floor(10 / dt)) + n_train_pts = 400 #600 + ind = shuffle!(rng, Vector(burnin:(tmax / dt - 1)))[1:n_train_pts] + n_tp = length(ind) + input = zeros(3, n_tp) + output = zeros(3, n_tp) + Γy = 1e-6 * I(3) + noise = rand(rng, MvNormal(zeros(3), Γy), n_tp) + for i in 1:n_tp + input[:, i] = sol.u[i] + output[:, i] = sol.u[i + 1] + noise[:, i] + end + iopairs = PairedDataContainer(input, output) + + # Emulate + cases = ["GP", "RF-scalar", "RF-scalar-diagin", "RF-vector-svd-nonsep"] + + case = cases[4] + decorrelate = true + nugget = Float64(1e-12) + overrides = Dict( + "verbose" => true, + "scheduler" => DataMisfitController(terminate_at = 1e4), + "cov_sample_multiplier" => 2.0, + "n_iteration" => 10, + ) + + # Build ML tools + if case == "GP" + gppackage = Emulators.GPJL() + pred_type = Emulators.YType() + mlt = GaussianProcess( + gppackage; + kernel = nothing, # use default squared exponential kernel + prediction_type = pred_type, + noise_learn = false, + ) + + elseif case ∈ ["RF-scalar", "RF-scalar-diagin"] + n_features = 10 * Int(floor(sqrt(3 * n_tp))) + kernel_structure = + case == "RF-scalar-diagin" ? SeparableKernel(DiagonalFactor(nugget), OneDimFactor()) : + SeparableKernel(LowRankFactor(3, nugget), OneDimFactor()) + mlt = ScalarRandomFeatureInterface( + n_features, + 3, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + elseif case ∈ ["RF-vector-svd-nonsep"] + kernel_structure = NonseparableKernel(LowRankFactor(9, nugget)) + n_features = 500 + + mlt = VectorRandomFeatureInterface( + n_features, + 3, + 3, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + end + + # Emulate + emulator = Emulator(mlt, iopairs; obs_noise_cov = Γy, decorrelate = decorrelate) + optimize_hyperparameters!(emulator) + + + # Predict with emulator + xspan_test = 0.0:dt:tmax_test + u_test = zeros(3, length(xspan_test)) + u_test[:, 1] = sol_test.u[1] + + for i in 1:(length(xspan_test) - 1) + rf_mean, _ = predict(emulator, u_test[:, i:i], transform_to_real = true) # 3x1 matrix + u_test[:, i + 1] = rf_mean + end + train_err = [0.0] + for i in 1:size(input, 2) + train_mean, _ = predict(emulator, input[:, i:i], transform_to_real = true) # 3x1 + train_err[1] += norm(train_mean - output[:, i]) + end + println("normalized L^2 error on training data:", 1 / size(input, 2) * train_err[1]) + + + + # plotting trace + f = Figure(resolution = (900, 450)) + axx = Axis(f[1, 1], xlabel = "time", ylabel = "x") + axy = Axis(f[2, 1], xlabel = "time", ylabel = "y") + axz = Axis(f[3, 1], xlabel = "time", ylabel = "z") + + xx = 0:dt:tmax_test + lines!(axx, xx, u_test[1, :], color = :blue) + lines!(axy, xx, u_test[2, :], color = :blue) + lines!(axz, xx, u_test[3, :], color = :blue) + + # run test data + solplot = zeros(3, length(xspan_test)) + for i in 1:length(xspan_test) + solplot[:, i] = sol_test.u[i] #noiseless + end + JLD2.save("l63_testdata.jld2", "solplot", solplot, "uplot", u_test) + + lines!(axx, xspan_test, solplot[1, :], color = :orange) + lines!(axy, xspan_test, solplot[2, :], color = :orange) + lines!(axz, xspan_test, solplot[3, :], color = :orange) + + current_figure() + # save + save("l63_test.png", f, px_per_unit = 3) + 
save("l63_test.pdf", f, pt_per_unit = 3) + + # plot attractor + f3 = Figure() + lines(f3[1, 1], u_test[1, :], u_test[3, :], color = :blue) + lines(f3[2, 1], solplot[1, :], solplot[3, :], color = :orange) + + # save + save("l63_attr.png", f3, px_per_unit = 3) + save("l63_attr.pdf", f3, pt_per_unit = 3) + + + # Predict with emulator for histogram + xspan_hist = 0.0:dt:tmax_hist + u_hist = zeros(3, length(xspan_hist)) + u_hist[:, 1] = sol_hist.u[1] # start at end of previous sim + + for i in 1:(length(xspan_hist) - 1) + rf_mean, _ = predict(emulator, u_hist[:, i:i], transform_to_real = true) # 3x1 matrix + u_hist[:, i + 1] = rf_mean + end + + solhist = zeros(3, length(xspan_hist)) + for i in 1:length(xspan_hist) + solhist[:, i] = sol_hist.u[i] #noiseless + end + JLD2.save("l63_histdata.jld2", "solhist", solhist, "uhist", u_hist) + + # plotting histograms + f2 = Figure() + hist(f2[1, 1], u_hist[1, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) + hist(f2[1, 2], u_hist[2, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) + hist(f2[1, 3], u_hist[3, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) + + hist!(f2[1, 1], solhist[1, :], bins = 50, normalization = :pdf, color = (:orange, 0.5)) + hist!(f2[1, 2], solhist[2, :], bins = 50, normalization = :pdf, color = (:orange, 0.5)) + hist!(f2[1, 3], solhist[3, :], bins = 50, normalization = :pdf, color = (:orange, 0.5)) + + # save + save("l63_pdf.png", f2, px_per_unit = 3) + save("l63_pdf.pdf", f2, pt_per_unit = 3) + + + +end + +main() diff --git a/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl b/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl index b379ff689..3ac0a64a9 100644 --- a/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl +++ b/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl @@ -10,17 +10,18 @@ using LinearAlgebra using CalibrateEmulateSample.Emulators using CalibrateEmulateSample.DataContainers using CalibrateEmulateSample.ParameterDistributions +using CalibrateEmulateSample.EnsembleKalmanProcesses + case = "scalar" println("running case $case") - -diagonalize_input = true - +kernel_structure = SeparableKernel(LowRankFactor(2, 1e-8), OneDimFactor()) # input and output(1D) cov structure in sep kernel plot_flag = true if plot_flag + ENV["GKSwstype"] = "100" using Plots gr(size = (1500, 700)) - Plots.scalefontsizes(1.3) + # Plots.scalefontsizes(1.3) # scales on recursive calls to include.. 
font = Plots.font("Helvetica", 18) fontdict = Dict(:guidefont => font, :xtickfont => font, :ytickfont => font, :legendfont => font) @@ -33,233 +34,247 @@ function meshgrid(vx::AbstractVector{T}, vy::AbstractVector{T}) where {T} return gx, gy end -rng_seed = 41 -Random.seed!(rng_seed) -output_directory = joinpath(@__DIR__, "output") -if !isdir(output_directory) - mkdir(output_directory) -end -#problem -n = 100 # number of training points -p = 2 # input dim -d = 2 # output dim -X = 2.0 * π * rand(p, n) -# G(x1, x2) -g1x = sin.(X[1, :]) .+ cos.(X[2, :]) -g2x = sin.(X[1, :]) .- cos.(X[2, :]) -gx = zeros(2, n) -gx[1, :] = g1x -gx[2, :] = g2x - -# Add noise η -μ = zeros(d) -Σ = 0.1 * [[0.8, 0.1] [0.1, 0.5]] # d x d -noise_samples = rand(MvNormal(μ, Σ), n) -# y = G(x) + η -Y = gx .+ noise_samples - -iopairs = PairedDataContainer(X, Y, data_are_columns = true) -@assert get_inputs(iopairs) == X -@assert get_outputs(iopairs) == Y - -#plot training data with and without noise -if plot_flag - p1 = plot( - X[1, :], - X[2, :], - g1x, - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zguidefontrotation = 90, - ) - - figpath = joinpath(output_directory, "RF_observed_y1nonoise.png") - savefig(figpath) - - p2 = plot( - X[1, :], - X[2, :], - g2x, - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zguidefontrotation = 90, - ) - figpath = joinpath(output_directory, "RF_observed_y2nonoise.png") - savefig(figpath) - - p3 = plot( - X[1, :], - X[2, :], - Y[1, :], - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zguidefontrotation = 90, - ) - figpath = joinpath(output_directory, "RF_observed_y1.png") - savefig(figpath) - - p4 = plot( - X[1, :], - X[2, :], - Y[2, :], - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zguidefontrotation = 90, - ) - figpath = joinpath(output_directory, "RF_observed_y2.png") - savefig(figpath) +function main() -end + rng_seed = 41 + Random.seed!(rng_seed) + output_directory = joinpath(@__DIR__, "output") + if !isdir(output_directory) + mkdir(output_directory) + end -# setup random features -n_features = 180 -optimizer_options = Dict("n_iteration" => 20, "prior_in_scale" => 0.1, "verbose" => true) #"Max" iterations (may do less) -srfi = ScalarRandomFeatureInterface( - n_features, - p, - diagonalize_input = diagonalize_input, - optimizer_options = optimizer_options, -) -emulator = Emulator(srfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) -println("build RF with $n training points and $(n_features) random features.") -optimize_hyperparameters!(emulator) # although RF already optimized + #problem + n = 150 # number of training points + p = 2 # input dim + d = 2 # output dim + X = 2.0 * π * rand(p, n) + # G(x1, x2) + g1x = sin.(X[1, :]) .+ cos.(X[2, :]) + g2x = sin.(X[1, :]) .- cos.(X[2, :]) + gx = zeros(2, n) + gx[1, :] = g1x + gx[2, :] = g2x -# Plot mean and variance of the predicted observables y1 and y2 -# For this, we generate test points on a x1-x2 grid. 
-n_pts = 200 -x1 = range(0.0, stop = 4.0 / 5.0 * 2 * π, length = n_pts) -x2 = range(0.0, stop = 4.0 / 5.0 * 2 * π, length = n_pts) -X1, X2 = meshgrid(x1, x2) -# Input for predict has to be of size N_samples x input_dim -inputs = permutedims(hcat(X1[:], X2[:]), (2, 1)) + # Add noise η + μ = zeros(d) + Σ = 0.1 * [[0.8, 0.1] [0.1, 0.5]] # d x d + noise_samples = rand(MvNormal(μ, Σ), n) + # y = G(x) + η + Y = gx .+ noise_samples -rf_mean, rf_cov = predict(emulator, inputs, transform_to_real = true) -println("end predictions at ", n_pts * n_pts, " points") + iopairs = PairedDataContainer(X, Y, data_are_columns = true) + @assert get_inputs(iopairs) == X + @assert get_outputs(iopairs) == Y + #plot training data with and without noise + if plot_flag + p1 = plot( + X[1, :], + X[2, :], + g1x, + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zguidefontrotation = 90, + ) -#plot predictions -for y_i in 1:d - rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) - rf_var = permutedims(vcat([x' for x in rf_var_temp]...), (2, 1)) # 2 x 40000 + figpath = joinpath(output_directory, "RF_observed_y1nonoise.png") + savefig(figpath) - mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) # 2 x 40000 - if plot_flag - p5 = plot( - x1, - x2, - mean_grid, + p2 = plot( + X[1, :], + X[2, :], + g2x, st = :surface, camera = (30, 60), c = :cividis, xlabel = "x1", ylabel = "x2", - zlabel = "mean of y" * string(y_i), zguidefontrotation = 90, ) - end - var_grid = reshape(rf_var[y_i, :], n_pts, n_pts) - if plot_flag - p6 = plot( - x1, - x2, - var_grid, + figpath = joinpath(output_directory, "RF_observed_y2nonoise.png") + savefig(figpath) + + p3 = plot( + X[1, :], + X[2, :], + Y[1, :], st = :surface, camera = (30, 60), c = :cividis, xlabel = "x1", ylabel = "x2", - zlabel = "var of y" * string(y_i), zguidefontrotation = 90, ) + figpath = joinpath(output_directory, "RF_observed_y1.png") + savefig(figpath) - plot(p5, p6, layout = (1, 2), legend = false) + p4 = plot( + X[1, :], + X[2, :], + Y[2, :], + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zguidefontrotation = 90, + ) + figpath = joinpath(output_directory, "RF_observed_y2.png") + savefig(figpath) - savefig(joinpath(output_directory, "RF_" * case * "_y" * string(y_i) * "_predictions.png")) end -end - -# Plot the true components of G(x1, x2) -g1_true = sin.(inputs[1, :]) .+ cos.(inputs[2, :]) -g1_true_grid = reshape(g1_true, n_pts, n_pts) -if plot_flag - p7 = plot( - x1, - x2, - g1_true_grid, - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zlabel = "sin(x1) + cos(x2)", - zguidefontrotation = 90, - ) - savefig(joinpath(output_directory, "RF_" * case * "_true_g1.png")) -end -g2_true = sin.(inputs[1, :]) .- cos.(inputs[2, :]) -g2_true_grid = reshape(g2_true, n_pts, n_pts) -if plot_flag - p8 = plot( - x1, - x2, - g2_true_grid, - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zlabel = "sin(x1) - cos(x2)", - zguidefontrotation = 90, + # setup random features + n_features = 400 + optimizer_options = Dict( + "n_iteration" => 20, + "n_ensemble" => 20, + "verbose" => true, + "scheduler" => DataMisfitController(terminate_at = 100.0), + ) #"Max" iterations (may do less) + + + srfi = ScalarRandomFeatureInterface( + n_features, + p, + kernel_structure = kernel_structure, + optimizer_options = optimizer_options, ) - g_true_grids = [g1_true_grid, g2_true_grid] - - savefig(joinpath(output_directory, "RF_" * case * 
"_true_g2.png")) - -end - -# Plot the difference between the truth and the mean of the predictions -for y_i in 1:d - - # Reshape rf_cov to size N_samples x output_dim - rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) - rf_var = permutedims(vcat([x' for x in rf_var_temp]...), (2, 1)) # 40000 x 2 - - mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) - var_grid = reshape(rf_var[y_i, :], n_pts, n_pts) - # Compute and plot 1/variance * (truth - prediction)^2 + emulator = Emulator(srfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) + println("build RF with $n training points and $(n_features) random features.") + + optimize_hyperparameters!(emulator) # although RF already optimized + + # Plot mean and variance of the predicted observables y1 and y2 + # For this, we generate test points on a x1-x2 grid. + n_pts = 200 + x1 = range(0.0, stop = 4.0 / 5.0 * 2 * π, length = n_pts) + x2 = range(0.0, stop = 4.0 / 5.0 * 2 * π, length = n_pts) + X1, X2 = meshgrid(x1, x2) + # Input for predict has to be of size N_samples x input_dim + inputs = permutedims(hcat(X1[:], X2[:]), (2, 1)) + + rf_mean, rf_cov = predict(emulator, inputs, transform_to_real = true) + println("end predictions at ", n_pts * n_pts, " points") + + + #plot predictions + for y_i in 1:d + rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) + rf_var = permutedims(reduce(vcat, [x' for x in rf_var_temp]), (2, 1)) # 2 x 40000 + + mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) # 2 x 40000 + if plot_flag + p5 = plot( + x1, + x2, + mean_grid, + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zlabel = "mean of y" * string(y_i), + zguidefontrotation = 90, + ) + end + var_grid = reshape(rf_var[y_i, :], n_pts, n_pts) + if plot_flag + p6 = plot( + x1, + x2, + var_grid, + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zlabel = "var of y" * string(y_i), + zguidefontrotation = 90, + ) + + plot(p5, p6, layout = (1, 2), legend = false) + + savefig(joinpath(output_directory, "RF_" * case * "_y" * string(y_i) * "_predictions.png")) + end + end + # Plot the true components of G(x1, x2) + g1_true = sin.(inputs[1, :]) .+ cos.(inputs[2, :]) + g1_true_grid = reshape(g1_true, n_pts, n_pts) if plot_flag - zlabel = "1/var * (true_y" * string(y_i) * " - predicted_y" * string(y_i) * ")^2" + p7 = plot( + x1, + x2, + g1_true_grid, + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zlabel = "sin(x1) + cos(x2)", + zguidefontrotation = 90, + ) + savefig(joinpath(output_directory, "RF_" * case * "_true_g1.png")) + end - p9 = plot( + g2_true = sin.(inputs[1, :]) .- cos.(inputs[2, :]) + g2_true_grid = reshape(g2_true, n_pts, n_pts) + if plot_flag + p8 = plot( x1, x2, - sqrt.(1.0 ./ var_grid .* (g_true_grids[y_i] .- mean_grid) .^ 2), + g2_true_grid, st = :surface, camera = (30, 60), - c = :magma, - zlabel = zlabel, + c = :cividis, xlabel = "x1", ylabel = "x2", + zlabel = "sin(x1) - cos(x2)", zguidefontrotation = 90, ) + g_true_grids = [g1_true_grid, g2_true_grid] - savefig(joinpath(output_directory, "RF_" * case * "_y" * string(y_i) * "_difference_truth_prediction.png")) + savefig(joinpath(output_directory, "RF_" * case * "_true_g2.png")) + + end + + # Plot the difference between the truth and the mean of the predictions + for y_i in 1:d + + # Reshape rf_cov to size N_samples x output_dim + rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) + rf_var = permutedims(vcat([x' for x in 
rf_var_temp]...), (2, 1)) # 40000 x 2 + + mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) + var_grid = reshape(rf_var[y_i, :], n_pts, n_pts) + # Compute and plot 1/variance * (truth - prediction)^2 + + if plot_flag + zlabel = "1/var * (true_y" * string(y_i) * " - predicted_y" * string(y_i) * ")^2" + + p9 = plot( + x1, + x2, + sqrt.(1.0 ./ var_grid .* (g_true_grids[y_i] .- mean_grid) .^ 2), + st = :surface, + camera = (30, 60), + c = :magma, + zlabel = zlabel, + xlabel = "x1", + ylabel = "x2", + zguidefontrotation = 90, + ) + + savefig(joinpath(output_directory, "RF_" * case * "_y" * string(y_i) * "_difference_truth_prediction.png")) + end end end + +main() diff --git a/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl b/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl index 08d045c44..12cbc17a4 100644 --- a/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl +++ b/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl @@ -10,11 +10,13 @@ using LinearAlgebra using CalibrateEmulateSample.Emulators using CalibrateEmulateSample.DataContainers using CalibrateEmulateSample.ParameterDistributions +using CalibrateEmulateSample.EnsembleKalmanProcesses plot_flag = true if plot_flag + ENV["GKSwstype"] = "100" using Plots gr(size = (1500, 700)) - Plots.scalefontsizes(1.3) + # Plots.scalefontsizes(1.3) font = Plots.font("Helvetica", 18) fontdict = Dict(:guidefont => font, :xtickfont => font, :ytickfont => font, :legendfont => font) @@ -38,8 +40,9 @@ function main() mkdir(output_directory) end - cases = ["svd-diag", "svd-nondiag", "nosvd-diag", "nosvd-nondiag"] - case_mask = 1:4 # which cases to do + cases = ["svd-diag", "svd-nondiag", "nosvd-diag", "nosvd-nondiag", "svd-nonsep", "nosvd-nonsep"] + case_mask = 1:6 # which cases to do + nugget = 1e-12 #problem n = 150 # number of training points @@ -128,43 +131,67 @@ function main() for case in cases[case_mask] println("running case $case") - - - # setup random features - n_features = 200 - if case ∈ ["svd-diag", "svd-nondiag"] - optimizer_options = - Dict("n_iteration" => 20, "prior_in_scale" => 0.01, "prior_out_scale" => 1.0, "verbose" => true) #"Max" iterations (may do less) - else # without svd - optimizer_options = - Dict("n_iteration" => 20, "prior_in_scale" => 0.01, "prior_out_scale" => 0.01, "verbose" => true) #"Max" iterations (may do less) - end + n_features = 400 + optimizer_options = + Dict("n_iteration" => 5, "verbose" => true, "scheduler" => DataMisfitController(on_terminate = "continue")) #"Max" iterations (may do less) if case == "svd-diag" vrfi = VectorRandomFeatureInterface( n_features, p, d, - diagonalize_output = true, + kernel_structure = SeparableKernel(LowRankFactor(2, nugget), DiagonalFactor()), optimizer_options = optimizer_options, ) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) elseif case == "svd-nondiag" - vrfi = VectorRandomFeatureInterface(n_features, p, d, optimizer_options = optimizer_options) + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + kernel_structure = SeparableKernel(LowRankFactor(2, nugget), LowRankFactor(2, nugget)), + optimizer_options = optimizer_options, + ) + emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) + elseif case == "svd-nonsep" + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + kernel_structure = NonseparableKernel(LowRankFactor(4, nugget)), + optimizer_options = optimizer_options, + ) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = 
true) + elseif case == "nosvd-diag" vrfi = VectorRandomFeatureInterface( n_features, p, d, - diagonalize_output = true, + kernel_structure = SeparableKernel(LowRankFactor(2, nugget), DiagonalFactor()), # roughly normalize output by noise optimizer_options = optimizer_options, ) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true, decorrelate = false) elseif case == "nosvd-nondiag" - vrfi = VectorRandomFeatureInterface(n_features, p, d, optimizer_options = optimizer_options) + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + kernel_structure = SeparableKernel(LowRankFactor(2, nugget), LowRankFactor(2, nugget)), # roughly normalize output by noise + optimizer_options = optimizer_options, + ) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true, decorrelate = false) + elseif case == "nosvd-nonsep" + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + kernel_structure = NonseparableKernel(LowRankFactor(4, nugget)), + optimizer_options = optimizer_options, + ) + emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true, decorrelate = false) + end println("build RF with $n training points and $(n_features) random features.") @@ -187,7 +214,7 @@ function main() for y_i in 1:d rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) - rf_var = permutedims(vcat([x' for x in rf_var_temp]...), (2, 1)) # 2 x 40000 + rf_var = permutedims(reduce(vcat, [x' for x in rf_var_temp]), (2, 1)) # 2 x 40000 mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) # 2 x 40000 if plot_flag diff --git a/examples/GCM/Project.toml b/examples/GCM/Project.toml index aba51ad3c..73fa40b73 100644 --- a/examples/GCM/Project.toml +++ b/examples/GCM/Project.toml @@ -10,6 +10,7 @@ PyPlot = "d330b81b-6aea-500a-939a-2ce795aea3ee" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd" [compat] FiniteDiff = "~2.10" diff --git a/examples/GCM/emulate_sample_script.jl b/examples/GCM/emulate_sample_script.jl index 0c8607554..4c0f9ca9c 100644 --- a/examples/GCM/emulate_sample_script.jl +++ b/examples/GCM/emulate_sample_script.jl @@ -1,38 +1,37 @@ # Script to run Emulation and Sampling on data from GCM # Import modules -using Distributions # probability distributions and associated functions +using Distributions using LinearAlgebra +ENV["GKSwstype"] = "100" using Plots +using Plots.PlotMeasures # for mm +using StatsPlots using Random using JLD2 # CES using CalibrateEmulateSample.Emulators using CalibrateEmulateSample.MarkovChainMonteCarlo using CalibrateEmulateSample.ParameterDistributions +using CalibrateEmulateSample.EnsembleKalmanProcesses using CalibrateEmulateSample.DataContainers +using CalibrateEmulateSample.EnsembleKalmanProcesses.Localizers -@time begin +function main() rng_seed = 2413798 Random.seed!(rng_seed) - # expname = "vrf-nondiag-logdet_newprior" - #emulator_type ∈ ["GPR","ScalarRFR","VectorRFR-svd-diag","VectorRFR-svd-nondiag", "VectorRFR-nondiag"] - # emulator_type = "GPR" - # expname = "gpr" - - # emulator_type = "ScalarRFR" - # expname = "srf" + # CHOOSE YOUR CASE + case = 6 - # emulator_type = "VectorRFR-svd-diag" - # expname = "vrf-svd-diag" + emulator_types = + ["GPR", "ScalarRFR", "VectorRFR-svd-diag", "VectorRFR-svd-nondiag", "VectorRFR-nondiag", "VectorRFR-svd-nonsep"] - # emulator_type = "VectorRFR-svd-nondiag" - # expname = "vrf-svd-nondiag" + expnames = 
["gpr", "srf", "vrf-svd-diag", "vrf-svd-nondiag", "vrf-nondiag_standardized", "vrf-svd-nonsep"] - emulator_type = "VectorRFR-nondiag" - expname = "vrf-nondiag_standardized" + emulator_type = emulator_types[case] + expname = expnames[case] # Output figure save directory example_directory = @__DIR__ @@ -54,11 +53,11 @@ using CalibrateEmulateSample.DataContainers obs_noise_cov = load(datafile)["obs_noise_cov"] # 96 x 96 #take only first 400 points - iter_mask = 1:4 - #data_mask = 1:32 + iter_mask = [1, 2, 4] + data_mask = 1:96 # data_mask= 33:64 # data_mask= 65:96 - data_mask = 1:96 + #data_mask = 1:96 #data_mask = [5*i for i = 1:Int(floor(96/5))] inputs = inputs[:, :, iter_mask] @@ -66,8 +65,6 @@ using CalibrateEmulateSample.DataContainers obs_noise_cov = obs_noise_cov[data_mask, data_mask] truth = truth[data_mask] - # priorfile = "priors.jld2" - # prior = load(priorfile)["prior"] # derived quantities N_ens, input_dim, N_iter = size(inputs) @@ -79,7 +76,19 @@ using CalibrateEmulateSample.DataContainers normalized = true # setup random features - eki_options_override = Dict("tikhonov" => 0, "multithread" => "ensemble") #faster than tullio multithread for training + eki_options_override = Dict( + "verbose" => true, + "tikhonov" => 0.0, + "scheduler" => DataMisfitController(on_terminate = "continue"), + "n_iteration" => 10, + "multithread" => "ensemble", + "train_fraction" => 0.95, + "inflation" => 0.0, + "cov_sample_multiplier" => 0.5, + "n_ensemble" => 50, + "localization" => SEC(1.0), + ) + if emulator_type == "VectorRFR-svd-nondiag" || emulator_type == "VectorRFR-nondiag" @@ -89,23 +98,46 @@ using CalibrateEmulateSample.DataContainers println("Running Vector RF model - without SVD and assuming non-diagonal variance ") end - n_features = 80 * Int(floor(5 * sqrt(N_ens * N_iter))) + n_features = 20 * Int(floor(5 * sqrt(N_ens * N_iter))) #80 * println("build RF with $(N_ens*N_iter) training points and $(n_features) random features.") + kernel_structure = SeparableKernel(LowRankFactor(2), LowRankFactor(3)) + mlt = VectorRandomFeatureInterface( + n_features, + input_dim, + output_dim, + kernel_structure = kernel_structure, + optimizer_options = eki_options_override, + ) + elseif emulator_type == "VectorRFR-svd-nonsep" || emulator_type == "VectorRFR-nonsep" + if emulator_type == "VectorRFR-svd-nondiag" + println("Running Vector RF model with nonseparable kernel - using SVD") + elseif emulator_type == "VectorRFR-nonsep" + println("Running Vector RF model with nonseparable kernel - without SVD") + end - mlt = VectorRandomFeatureInterface(n_features, input_dim, output_dim, optimizer_options = eki_options_override) + n_features = 20 * Int(floor(5 * sqrt(N_ens * N_iter))) #80 * + println("build RF with $(N_ens*N_iter) training points and $(n_features) random features.") + kernel_structure = NonseparableKernel(LowRankFactor(5)) + mlt = VectorRandomFeatureInterface( + n_features, + input_dim, + output_dim, + kernel_structure = kernel_structure, + optimizer_options = eki_options_override, + ) elseif emulator_type == "VectorRFR-svd-diag" println("Running Vector RF model - using SVD and assuming diagonal variance") - n_features = 20 * Int(floor(5 * sqrt(N_ens * N_iter))) + n_features = 5 * Int(floor(5 * sqrt(N_ens * N_iter))) #20 * println("build RF with $(N_ens*N_iter) training points and $(n_features) random features.") - + kernel_structure = SeparableKernel(LowRankFactor(2), DiagonalFactor(1e-8)) mlt = VectorRandomFeatureInterface( n_features, input_dim, output_dim, - diagonalize_output = true, + 
kernel_structure = kernel_structure, optimizer_options = eki_options_override, ) @@ -231,8 +263,75 @@ using CalibrateEmulateSample.DataContainers end end - end + end # plots + + ### MCMC + + priorfile = "priors.jld2" + prior = load(priorfile)["prior"] + + ## + ### Sample: Markov Chain Monte Carlo + ### + # initial values + u0 = vec(mean(get_inputs(input_output_pairs), dims = 2)) + println("initial parameters: ", u0) + + # First let's run a short chain to determine a good step size + mcmc = MCMCWrapper(RWMHSampling(), truth, prior, emulator; init_params = u0) + new_step = optimize_stepsize(mcmc; init_stepsize = 0.1, N = 2000, discard_initial = 0) + + # Now begin the actual MCMC + println("Begin MCMC - with step size ", new_step) + chain = MarkovChainMonteCarlo.sample(mcmc, 100_000; stepsize = new_step, discard_initial = 2_000) + posterior = MarkovChainMonteCarlo.get_posterior(mcmc, chain) + + post_mean = mean(posterior) + post_cov = cov(posterior) + println("post_mean") + println(post_mean) + println("post_cov") + println(post_cov) + println("D util") + println(det(inv(post_cov))) + println(" ") + # plot posterior + truth_params = [log(0.7 / 0.3) log(7200)]' # parameter value (at truth) - unconstrained + + # Save data + save( + joinpath(data_save_directory, expname * "posterior.jld2"), + "posterior", + posterior, + "input_output_pairs", + input_output_pairs, + "truth_params", + truth_params, + ) + + + constrained_truth_params = transform_unconstrained_to_constrained(posterior, truth_params) + param_names = get_name(posterior) + + posterior_samples = vcat([get_distribution(posterior)[name] for name in param_names]...) #samples are columns + constrained_posterior_samples = + mapslices(x -> transform_unconstrained_to_constrained(posterior, x), posterior_samples, dims = 1) + + gr(dpi = 300, size = (300, 300)) + p = cornerplot(permutedims(constrained_posterior_samples, (2, 1)), label = param_names, compact = true) + plot!(p.subplots[1], [constrained_truth_params[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # vline on top histogram + plot!(p.subplots[3], [constrained_truth_params[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) # hline on right histogram + plot!(p.subplots[2], [constrained_truth_params[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # v & h line on scatter. + plot!(p.subplots[2], [constrained_truth_params[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) + figpath = joinpath(figure_save_directory, "plot_posterior_" * expname * ".png") + savefig(figpath) + + +end # main + +@time begin + main() end # for @time diff --git a/examples/GCM/sbatch_emulate_sample_jl b/examples/GCM/sbatch_emulate_sample_jl index efbdb391c..8807f0e9a 100644 --- a/examples/GCM/sbatch_emulate_sample_jl +++ b/examples/GCM/sbatch_emulate_sample_jl @@ -1,6 +1,6 @@ #!/bin/bash #Submit this script with: sbatch thefilename -#SBATCH --time=6:00:00 # walltime +#SBATCH --time=12:00:00 # walltime #SBATCH --ntasks-per-node=1 # number of processor cores (i.e. 
tasks) #SBATCH --cpus-per-task=28 #SBATCH --mem-per-cpu=6000 diff --git a/examples/Lorenz/GModel_common.jl b/examples/Lorenz/GModel_common.jl index d3a99de5c..1954cae29 100644 --- a/examples/Lorenz/GModel_common.jl +++ b/examples/Lorenz/GModel_common.jl @@ -128,7 +128,8 @@ end # tend: End of simulation Float64(1), nstep: function lorenz_solve(settings::LSettings, params::LParams) # Initialize - nstep = Int32(ceil((settings.tend - settings.t_start) / settings.dt)) + # nstep = Int32(ceil((settings.tend - settings.t_start) / settings.dt)) + nstep = Int32(ceil(settings.tend / settings.dt)) xn = zeros(Float64, settings.N, nstep) t = zeros(Float64, nstep) # Initial perturbation @@ -137,7 +138,7 @@ function lorenz_solve(settings::LSettings, params::LParams) Xbuffer = zeros(4, settings.N) # March forward in time for j in 1:nstep - t[j] = settings.t_start + settings.dt * j + t[j] = settings.dt * j #use view to update a slice # RK4! modifies first and last arguments if settings.dynamics == 1 diff --git a/examples/Lorenz/calibrate.jl b/examples/Lorenz/calibrate.jl index a799899a0..17e1b1242 100644 --- a/examples/Lorenz/calibrate.jl +++ b/examples/Lorenz/calibrate.jl @@ -15,6 +15,34 @@ using CalibrateEmulateSample const EKP = CalibrateEmulateSample.EnsembleKalmanProcesses const PD = EKP.ParameterDistributions + +function shrinkage_cov(sample_mat::AA) where {AA <: AbstractMatrix} + n_out, n_sample = size(sample_mat) + + # de-mean (as we will use the samples directly for calculation of β) + sample_mat_zeromean = sample_mat .- mean(sample_mat, dims = 2) + # Ledoit Wolf shrinkage to I + + # get sample covariance + Γ = cov(sample_mat_zeromean, dims = 2) + # estimate opt shrinkage + μ_shrink = 1 / n_out * tr(Γ) + δ_shrink = norm(Γ - μ_shrink * I)^2 / n_out # (scaled) frob norm of Γ_m + #once de-meaning, we need to correct the sample covariance with an n_sample -> n_sample-1 + β_shrink = sum([norm(c * c' - -Γ)^2 / n_out for c in eachcol(sample_mat_zeromean)]) / (n_sample - 1)^2 + + γ_shrink = min(β_shrink / δ_shrink, 1) # clipping is typically rare + # γμI + (1-γ)Γ + Γ .*= (1 - γ_shrink) + for i in 1:n_out + Γ[i, i] += γ_shrink * μ_shrink + end + + @info "Shrinkage scale: $(γ_shrink), (0 = none, 1 = revert to scaled Identity)\n shrinkage covariance condition number: $(cond(Γ))" + return Γ +end + + function main() rng_seed = 4137 @@ -63,10 +91,6 @@ function main() n_param = length(param_names) params_true = reshape(params_true, (n_param, 1)) - println(n_param) - println(params_true) - - ### ### Define the parameter priors ### @@ -97,7 +121,7 @@ function main() N = 36 dt = 1 / 64.0 # Start of integration - t_start = 800.0 + t_start = 100.0#800.0 # We now rescale all the timings etc, to be in nondim-time # Data collection length if dynamics == 1 @@ -110,7 +134,7 @@ function main() # Integration length Tfit = Ts_days / τc # Initial perturbation - Fp = rand(Normal(0.0, 0.01), N) + Fp = rand(rng, Normal(0.0, 0.01), N) kmax = 1 # Prescribe variance or use a number of forward passes to define true interval variability var_prescribe = false @@ -148,40 +172,61 @@ function main() μ = zeros(length(gt)) # Add noise for i in 1:n_samples - yt[:, i] = gt .+ rand(MvNormal(μ, Γy)) + yt[:, i] = gt .+ rand(rng, MvNormal(μ, Γy)) end - println(Γy) else println("Using truth values to compute covariance") n_samples = 100 - nthreads = Threads.nthreads() - yt = zeros(nthreads, length(gt), n_samples) - Threads.@threads for i in 1:n_samples - - #= - lorenz_settings_local = GModel.LSettings( - dynamics, - stats_type, - t_start + T * (i 
- 1), - T, - Ts, - Tfit, - Fp, - N, - dt, - t_start + T * (i - 1) + T, - kmax, - ω_fixed, - ω_true, - )=# - lorenz_settings_local = lorenz_settings - tid = Threads.threadid() - yt[tid, :, i] = GModel.run_G_ensemble(params_true, lorenz_settings_local) + yt = zeros(length(gt), n_samples) + for i in 1:n_samples + + # Fp changes IC, then run from 0 to t_start + T, + # then gathering stats over [t_start, t_start+T] + Fp = rand(rng, Normal(0.0, 0.01), N) + lorenz_settings_local = GModel.LSettings( + dynamics, + stats_type, + t_start, # data collection start start time + T, #data collection length + Ts, # integration_length + Tfit, # number of Ts to fit over + Fp, + N, + dt, + t_start + T, #sim end time + kmax, + ω_fixed, + ω_true, + ) + + #= lorenz_settings_local = GModel.LSettings( + dynamics, + stats_type, + t_start + T * (i - 1), #sim start time + T, #data collection length + Ts, # integration_length + Tfit, # number of Ts to fit over + Fp, + N, + dt, + t_start + T * (i - 1) + T , #sim end time + kmax, + ω_fixed, + ω_true, + )=# + yt[:, i] = GModel.run_G_ensemble(params_true, lorenz_settings_local) + end - yt = dropdims(sum(yt, dims = 1), dims = 1) + gr(dpi = 300, size = (400, 400)) + KK = plot(yt[1:end, 1:100], legend = false) + savefig(KK, joinpath(figure_save_directory, "data_samples.png")) + + # [1.] use shrinkage for regularization + Γy = shrinkage_cov(yt) - # add a little extra variance for regularization here + #= + # [2.] add fixed variance for regularization noise_sd = 0.1 Γy_diag = noise_sd .^ 2 * convert(Array, I(size(yt, 1))) μ = zeros(size(yt, 1)) @@ -192,8 +237,8 @@ function main() #Γy = convert(Array, Diagonal(dropdims(mean((yt.-mean(yt,dims=1)).^2,dims=1),dims=1))) # Covariance of truth data Γy = cov(yt, dims = 2) + =# - println(Γy) end diff --git a/examples/Lorenz/emulate_sample.jl b/examples/Lorenz/emulate_sample.jl index 7ea16c67c..a42cd5c5a 100644 --- a/examples/Lorenz/emulate_sample.jl +++ b/examples/Lorenz/emulate_sample.jl @@ -4,6 +4,7 @@ include(joinpath(@__DIR__, "..", "ci", "linkfig.jl")) # Import modules using Distributions # probability distributions and associated functions using LinearAlgebra +ENV["GKSwstype"] = "100" using StatsPlots using Plots using Random @@ -43,25 +44,22 @@ function main() "RF-vector-svd-nondiag", "RF-vector-nosvd-diag", "RF-vector-nosvd-nondiag", + "RF-vector-svd-nonsep", ] #### CHOOSE YOUR CASE: - mask = 2:7 - # One day we can use good heuristics here - # Currently set so that learnt hyperparameters stays relatively far inside the prior. (use "verbose" => true in optimizer overrides to see this info) - prior_scalings = [[0.0, 0.0], [1e-2, 0.0], [1e-1, 0.0], [1e-2, 1e-2], [1e-2, 1e-1], [1e-2, 1e-2], [1e-2, 1e-2]] - for (case, scaling) in zip(cases[mask], prior_scalings[mask]) + mask = 2:7 # e.g. 
1:8 or [7] + for (case) in cases[mask] + - #case = cases[7] println("case: ", case) - max_iter = 6 # number of EKP iterations to use data from is at most this + min_iter = 1 + max_iter = 5 # number of EKP iterations to use data from is at most this overrides = Dict( "verbose" => true, - "train_fraction" => 0.8, - "n_iteration" => 40, - "scheduler" => DataMisfitController(), - "prior_in_scale" => scaling[1], - "prior_out_scale" => scaling[2], + "scheduler" => DataMisfitController(terminate_at = 100.0), + "cov_sample_multiplier" => 1.0, + "n_iteration" => 20, ) # we do not want termination, as our priors have relatively little interpretation @@ -93,12 +91,12 @@ function main() ekiobj = load(data_save_file)["eki"] priors = load(data_save_file)["priors"] - truth_sample = load(data_save_file)["truth_sample"] truth_sample_mean = load(data_save_file)["truth_sample_mean"] + truth_sample = load(data_save_file)["truth_sample"] truth_params_constrained = load(data_save_file)["truth_input_constrained"] #true parameters in constrained space truth_params = transform_constrained_to_unconstrained(priors, truth_params_constrained) Γy = ekiobj.obs_noise_cov - println(Γy) + n_params = length(truth_params) # "input dim" output_dim = size(Γy, 1) @@ -108,6 +106,7 @@ function main() # Emulate-sample settings # choice of machine-learning tool + nugget = 0.001 if case == "GP" gppackage = Emulators.GPJL() pred_type = Emulators.YType() @@ -118,26 +117,43 @@ function main() noise_learn = false, ) elseif case ∈ ["RF-scalar", "RF-scalar-diagin"] - n_features = 300 - diagonalize_input = case == "RF-scalar-diagin" ? true : false + n_features = 100 + kernel_structure = + case == "RF-scalar-diagin" ? SeparableKernel(DiagonalFactor(nugget), OneDimFactor()) : + SeparableKernel(LowRankFactor(2, nugget), OneDimFactor()) mlt = ScalarRandomFeatureInterface( n_features, n_params, rng = rng, - diagonalize_input = diagonalize_input, + kernel_structure = kernel_structure, optimizer_options = overrides, ) elseif case ∈ ["RF-vector-svd-diag", "RF-vector-nosvd-diag", "RF-vector-svd-nondiag", "RF-vector-nosvd-nondiag"] # do we want to assume that the outputs are decorrelated in the machine-learning problem? - diagonalize_output = case ∈ ["RF-vector-svd-diag", "RF-vector-nosvd-diag"] ? true : false - n_features = 300 + kernel_structure = + case ∈ ["RF-vector-svd-diag", "RF-vector-nosvd-diag"] ? + SeparableKernel(LowRankFactor(2, nugget), DiagonalFactor(nugget)) : + SeparableKernel(LowRankFactor(2, nugget), LowRankFactor(3, nugget)) + n_features = 500 mlt = VectorRandomFeatureInterface( n_features, n_params, output_dim, rng = rng, - diagonalize_output = diagonalize_output, # assume outputs are decorrelated + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + elseif case ∈ ["RF-vector-svd-nonsep"] + kernel_structure = NonseparableKernel(LowRankFactor(3, nugget)) + n_features = 500 + + mlt = VectorRandomFeatureInterface( + n_features, + n_params, + output_dim, + rng = rng, + kernel_structure = kernel_structure, optimizer_options = overrides, ) end @@ -146,21 +162,41 @@ function main() # Use median over all data since all data are the same type truth_sample_norm = vcat(truth_sample...) norm_factor = get_standardizing_factors(truth_sample_norm) - println(size(norm_factor)) #norm_factor = vcat(norm_factor...) 
norm_factor = fill(norm_factor, size(truth_sample)) - println("Standardization factors") - println(norm_factor) # Get training points from the EKP iteration number in the second input term N_iter = min(max_iter, length(get_u(ekiobj)) - 1) # number of paired iterations taken from EKP - - input_output_pairs = Utilities.get_training_points(ekiobj, N_iter - 1) # 1:N-1 - - input_output_pairs_test = Utilities.get_training_points(ekiobj, N_iter:N_iter) # just "next" iteration used for testing + min_iter = min(max_iter, max(1, min_iter)) + input_output_pairs = Utilities.get_training_points(ekiobj, min_iter:(N_iter - 1)) + input_output_pairs_test = Utilities.get_training_points(ekiobj, N_iter:(length(get_u(ekiobj)) - 1)) # "next" iterations # Save data @save joinpath(data_save_directory, "input_output_pairs.jld2") input_output_pairs + # plot training points in constrained space + if case == cases[mask[1]] + gr(dpi = 300, size = (400, 400)) + inputs_unconstrained = get_inputs(input_output_pairs) + inputs_constrained = transform_unconstrained_to_constrained(priors, inputs_unconstrained) + p = plot( + title = "training points", + xlims = extrema(inputs_constrained[1, :]), + xaxis = "F", + yaxis = "A", + ylims = extrema(inputs_constrained[2, :]), + ) + scatter!(p, inputs_constrained[1, :], inputs_constrained[2, :], color = :magenta, label = false) + inputs_test_unconstrained = get_inputs(input_output_pairs_test) + inputs_test_constrained = transform_unconstrained_to_constrained(priors, inputs_test_unconstrained) + + scatter!(p, inputs_test_constrained[1, :], inputs_test_constrained[2, :], color = :black, label = false) + + vline!(p, [truth_params_constrained[1]], linestyle = :dash, linecolor = :red, label = false) + hline!(p, [truth_params_constrained[2]], linestyle = :dash, linecolor = :red, label = false) + savefig(p, joinpath(figure_save_directory, "training_points.pdf")) + savefig(p, joinpath(figure_save_directory, "training_points.png")) + end + standardize = false retained_svd_frac = 1.0 normalized = true @@ -190,11 +226,11 @@ function main() println("ML prediction on true parameters: ") println(vec(y_mean)) println("true data: ") - println(truth_sample_mean) # same, regardless of norm_factor - println(" ML predicted variance") - println(diag(y_var[1], 0)) + println(truth_sample) # what was used as truth + println(" ML predicted standard deviation") + println(sqrt.(diag(y_var[1], 0))) println("ML MSE (truth): ") - println(mean((truth_sample_mean - vec(y_mean)) .^ 2)) + println(mean((truth_sample - vec(y_mean)) .^ 2)) println("ML MSE (next ensemble): ") println(mean((get_outputs(input_output_pairs_test) - y_mean_test) .^ 2)) @@ -207,13 +243,13 @@ function main() println("initial parameters: ", u0) # First let's run a short chain to determine a good step size - truth_sample = truth_sample mcmc = MCMCWrapper(RWMHSampling(), truth_sample, priors, emulator; init_params = u0) new_step = optimize_stepsize(mcmc; init_stepsize = 0.1, N = 2000, discard_initial = 0) # Now begin the actual MCMC println("Begin MCMC - with step size ", new_step) chain = MarkovChainMonteCarlo.sample(mcmc, 100_000; stepsize = new_step, discard_initial = 2_000) + posterior = MarkovChainMonteCarlo.get_posterior(mcmc, chain) post_mean = mean(posterior) @@ -226,7 +262,6 @@ function main() println(det(inv(post_cov))) println(" ") - constrained_truth_params = transform_unconstrained_to_constrained(posterior, truth_params) param_names = get_name(posterior) posterior_samples = vcat([get_distribution(posterior)[name] for name in 
get_name(posterior)]...) #samples are columns @@ -235,10 +270,12 @@ function main() gr(dpi = 300, size = (300, 300)) p = cornerplot(permutedims(constrained_posterior_samples, (2, 1)), label = param_names, compact = true) - plot!(p.subplots[1], [constrained_truth_params[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # vline on top histogram - plot!(p.subplots[3], [constrained_truth_params[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) # hline on right histogram - plot!(p.subplots[2], [constrained_truth_params[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # v & h line on scatter. - plot!(p.subplots[2], [constrained_truth_params[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) + plot!(p.subplots[1], [truth_params_constrained[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # vline on top histogram + plot!(p.subplots[3], [truth_params_constrained[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) # hline on right histogram + plot!(p.subplots[2], [truth_params_constrained[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # v & h line on scatter. + plot!(p.subplots[2], [truth_params_constrained[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) + figpath = joinpath(figure_save_directory, "posterior_2d-" * case * ".pdf") + savefig(figpath) figpath = joinpath(figure_save_directory, "posterior_2d-" * case * ".png") savefig(figpath) diff --git a/src/Emulator.jl b/src/Emulator.jl index 8c6f7a2d8..985d59dee 100644 --- a/src/Emulator.jl +++ b/src/Emulator.jl @@ -132,7 +132,6 @@ function Emulator( svd_transform(training_outputs, obs_noise_cov, retained_svd_frac = retained_svd_frac) training_pairs = PairedDataContainer(training_inputs, decorrelated_training_outputs) - # [4.] build an emulator build_models!(machine_learning_tool, training_pairs) else @@ -141,7 +140,6 @@ function Emulator( end decomposition = nothing training_pairs = PairedDataContainer(training_inputs, training_outputs) - # [4.] build an emulator - providing the noise covariance as a Tikhonov regularizer build_models!(machine_learning_tool, training_pairs, regularization_matrix = obs_noise_cov) end @@ -258,13 +256,20 @@ function predict( # [4.] unstandardize return reverse_standardize(emulator, s_outputs, s_output_cov) else #... and want to stay in decorrelated standardized coordinates - diag_out_flag = get_diagonalize_output(get_machine_learning_tool(emulator)) - if diag_out_flag - ds_output_diagvar = vec([Diagonal(ds_output_covvec[j]) for j in 1:N_samples]) # extracts diagonal - if output_dim == 1 + + # special cases where you only return variances and not covariances, could be better acheived with dispatch... 
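For context on the decorrelated standardized coordinates referred to here: `svd_transform` (see the hunk just below) whitens the outputs with the SVD of `obs_noise_cov`. A standalone sketch with made-up numbers:

using LinearAlgebra

obs_noise_cov = [1.0 0.2; 0.2 0.5]  # hypothetical 2x2 output noise covariance
y = [0.3, -1.1]                     # one (centred) output sample

decomp = svd(obs_noise_cov)
sqrt_singular_values_inv = Diagonal(1.0 ./ sqrt.(decomp.S))
y_decorrelated = sqrt_singular_values_inv * decomp.Vt * y  # noise covariance is ~I in these coordinates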
+ kernel_structure = get_kernel_structure(get_machine_learning_tool(emulator)) + if nameof(typeof(kernel_structure)) == :SeparableKernel + output_cov_structure = get_output_cov_structure(kernel_structure) + if nameof(typeof(output_cov_structure)) == :DiagonalFactor + ds_output_diagvar = vec([Diagonal(ds_output_covvec[j]) for j in 1:N_samples]) # extracts diagonal + return ds_outputs, ds_output_diagvar + elseif nameof(typeof(output_cov_structure)) == :OneDimFactor ds_output_diagvar = ([ds_output_covvec[i][1, 1] for i in 1:N_samples], 1, :) # extracts value + return ds_outputs, ds_output_diagvar + else + return ds_outputs, ds_output_covvec end - return ds_outputs, ds_output_diagvar else return ds_outputs, ds_output_covvec end @@ -434,7 +439,7 @@ function svd_transform( decomposition = SVD(decomp.U[:, 1:k], decomp.S[1:k], decomp.Vt[1:k, :]) else transformed_data = sqrt_singular_values_inv * decomp.Vt * data - decomposition = svd(obs_noise_cov) + decomposition = decomp end return transformed_data, decomposition end diff --git a/src/RandomFeature.jl b/src/RandomFeature.jl index e04fde7eb..129d55f1b 100644 --- a/src/RandomFeature.jl +++ b/src/RandomFeature.jl @@ -8,15 +8,117 @@ using ..Utilities using StableRNGs using Random -export calculate_n_hyperparameters, build_default_prior +export calculate_n_hyperparameters, hyperparameters_from_flat, build_default_prior, cov_structure_from_string +export CovarianceStructureType, OneDimFactor, DiagonalFactor, CholeskyFactor, LowRankFactor, HierarchicalLowRankFactor +export SeparableKernel, NonseparableKernel +export get_input_cov_structure, get_output_cov_structure, get_cov_structure +export get_eps +export rank + + abstract type RandomFeatureInterface <: MachineLearningTool end +# Different types of threading abstract type MultithreadType end struct TullioThreading <: MultithreadType end struct EnsembleThreading <: MultithreadType end -#common functions + +# Different types of covariance representation in the prior +abstract type CovarianceStructureType end +struct OneDimFactor <: CovarianceStructureType end +struct DiagonalFactor{FT <: AbstractFloat} <: CovarianceStructureType + eps::FT +end +DiagonalFactor() = DiagonalFactor(Float64(1.0)) +get_eps(df::DiagonalFactor) = df.eps + +struct CholeskyFactor{FT <: AbstractFloat} <: CovarianceStructureType + eps::FT +end +CholeskyFactor() = CholeskyFactor(Float64(1.0)) +get_eps(cf::CholeskyFactor) = cf.eps + +struct LowRankFactor{FT <: AbstractFloat} <: CovarianceStructureType + rank::Int + eps::FT +end +LowRankFactor(r::Int) = LowRankFactor(r, Float64(1.0)) +get_eps(lrf::LowRankFactor) = lrf.eps + +struct HierarchicalLowRankFactor{FT <: AbstractFloat} <: CovarianceStructureType + #cst::CovarianceStructureType + rank::Int + eps::FT +end + +HierarchicalLowRankFactor(r::Int) = HierarchicalLowRankFactor(r, Float64(1.0)) +get_eps(hlrf::HierarchicalLowRankFactor) = hlrf.eps + +import LinearAlgebra: rank +rank(lrf::LowRankFactor) = lrf.rank +rank(hlrf::HierarchicalLowRankFactor) = hlrf.rank + +# To add a new cov type `T` one must define these 3 methods: +# hyperparameters_from_flat(x::V, d::Int, t::T) where {V <: AbstractVector} +# calculate_n_hyperparameters(d::Int, t::T) +# build_default_prior(name::SS, d::Int, t::T) where {SS <: AbstractString} +# and add a string id to cov_structure_from_string + +function cov_structure_from_string(s::S, d::Int) where {S <: AbstractString} + if s == "onedim" + return OneDimFactor() + elseif s == "diagonal" + return DiagonalFactor() + elseif s == "cholesky" + return 
CholeskyFactor() + elseif s == "lowrank" + return LowRankFactor(Int(ceil(sqrt(d)))) + elseif s == "hierlowrank" + return HierarchicalLowRankFactor(Int(ceil(sqrt(d)))) + else + throw( + ArgumentError( + "Recieved unknown `input_cov_structure` keyword $(s), \n please choose from [\"onedim\",\"diagonal\",\"cholesky\", \"lowrank\"] or provide a CovarianceStructureType", + ), + ) + end +end +cov_structure_from_string(cst::CST, args...; kwargs...) where {CST <: CovarianceStructureType} = cst + + +# Different types of kernel +abstract type KernelStructureType end + +struct SeparableKernel{CST1 <: CovarianceStructureType, CST2 <: CovarianceStructureType} <: KernelStructureType + input_cov_structure::CST1 + output_cov_structure::CST2 +end +get_input_cov_structure(kernel_structure::SeparableKernel) = kernel_structure.input_cov_structure +get_output_cov_structure(kernel_structure::SeparableKernel) = kernel_structure.output_cov_structure + +struct NonseparableKernel{CST <: CovarianceStructureType} <: KernelStructureType + cov_structure::CST +end +get_cov_structure(kernel_structure::NonseparableKernel) = kernel_structure.cov_structure + + +# calculate_n_hyperparameters +calculate_n_hyperparameters(d::Int, odf::OneDimFactor) = 0 +calculate_n_hyperparameters(d::Int, df::DiagonalFactor) = d +calculate_n_hyperparameters(d::Int, cf::CholeskyFactor) = Int(d * (d + 1) / 2) +calculate_n_hyperparameters(d::Int, lrf::LowRankFactor) = Int(rank(lrf) + d * rank(lrf)) +calculate_n_hyperparameters(d::Int, hlrf::HierarchicalLowRankFactor) = + Int(rank(hlrf) * (rank(hlrf) + 1) / 2 + d * rank(hlrf)) + +# build from flat +hyperparameters_from_flat(x::V, odf::OneDimFactor) where {V <: AbstractVector} = nothing + +function hyperparameters_from_flat(x::V, df::DiagonalFactor) where {V <: AbstractVector} + return convert(Matrix, Diagonal(x) + get_eps(df) * I) +end + """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -33,185 +135,234 @@ function flat_to_chol(x::V) where {V <: AbstractVector} return cholmat end -""" -$(DocStringExtensions.TYPEDSIGNATURES) +function hyperparameters_from_flat(x::V, cf::CholeskyFactor) where {V <: AbstractVector} + chol = flat_to_chol(x) + return chol * permutedims(chol, (2, 1)) + get_eps(cf) * I -Builds Covariance matrices from a flat array of hyperparameters -""" -function hyperparameters_from_flat( - x::VV, - input_dim::Int, - output_dim::Int, - diagonalize_input::Bool, - diagonalize_output::Bool, -) where {VV <: AbstractVector} - # if dim = 1 then parameters are a 1x1 matrix - # if we diagonalize then parameters are diagonal entries + constant - # if we don't diagonalize then parameters are a cholesky product + constant on diagonal. 
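To illustrate the extension point noted above ("To add a new cov type `T` one must define these 3 methods"), here is a hedged sketch of a hypothetical `ExpDiagonalFactor`, a diagonal covariance parameterized by log-scales. The type and its behaviour are purely illustrative; the method definitions would sit alongside those above in src/RandomFeature.jl, where `LinearAlgebra` and `constrained_gaussian` are already in scope:

# hypothetical new covariance structure: diagonal covariance parameterized by log-scales
struct ExpDiagonalFactor{FT <: AbstractFloat} <: CovarianceStructureType
    eps::FT
end
ExpDiagonalFactor() = ExpDiagonalFactor(1.0)
get_eps(edf::ExpDiagonalFactor) = edf.eps

# (1) number of hyperparameters in dimension d
calculate_n_hyperparameters(d::Int, edf::ExpDiagonalFactor) = d

# (2) build the covariance from a flat hyperparameter vector
function hyperparameters_from_flat(x::V, edf::ExpDiagonalFactor) where {V <: AbstractVector}
    return convert(Matrix, Diagonal(exp.(x)) + get_eps(edf) * I)
end

# (3) a default prior over the (unconstrained) log-scales
function build_default_prior(name::SS, n_hp::Int, edf::ExpDiagonalFactor) where {SS <: AbstractString}
    return constrained_gaussian("$(name)_expdiag", 0.0, 3.0, -Inf, Inf, repeats = n_hp)
end

# plus, optionally, a string id in cov_structure_from_string, e.g. an `elseif s == "expdiag"` branch returning ExpDiagonalFactor()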
- - #input space - if input_dim == 1 - in_max = 1 - U = x[in_max] * ones(1, 1) - elseif diagonalize_input - in_max = input_dim + 2 - U = convert(Matrix, x[in_max - 1] * (Diagonal(x[1:(in_max - 2)]) + x[in_max] * I)) - elseif !diagonalize_input - in_max = Int(input_dim * (input_dim + 1) / 2) + 2 - cholU = flat_to_chol(x[1:(in_max - 2)]) - U = x[in_max - 1] * (cholU * permutedims(cholU, (2, 1)) + x[in_max] * I) - end +end - # output_space - out_min = in_max + 1 - if output_dim == 1 - V = x[end] * ones(1, 1) - elseif diagonalize_output - V = convert(Matrix, x[end - 1] * (Diagonal(x[out_min:(end - 2)]) + x[end] * I)) - elseif !diagonalize_output - cholV = flat_to_chol(x[out_min:(end - 2)]) - V = x[end - 1] * (cholV * permutedims(cholV, (2, 1)) + x[end] * I) - end +function hyperparameters_from_flat(x::V, lrf::LowRankFactor) where {V <: AbstractVector} - # sometimes this process does not give a (numerically) symmetric matrix - U = 0.5 * (U + permutedims(U, (2, 1))) - V = 0.5 * (V + permutedims(V, (2, 1))) + r = rank(lrf) + d = Int(length(x) / r - 1) # l = r + d*r - return U, V + D = Diagonal(x[1:r]) # D acts like sing.vals.^2 + U = reshape(x[(r + 1):end], d, r) + qrU = qr(U) # qrU.Q is semi-orthogonal in first r columns, i.e. Q^T.Q = I + return qrU.Q[:, 1:r] * D * qrU.Q[:, 1:r]' + get_eps(lrf) * I end -function hyperparameters_from_flat(x::VV, input_dim::Int, diagonalize_input::Bool) where {VV <: AbstractVector} - output_dim = 1 - diagonalize_output = false - U, V = hyperparameters_from_flat(x, input_dim, output_dim, diagonalize_input, diagonalize_output) - # Note that the scalar setting has a slight inconsistency from the 1-D output vector case - # We correct here by rescaling U using the single hyperparameter in V - return V[1, 1] * U +function hyperparameters_from_flat(x::V, hlrf::HierarchicalLowRankFactor) where {V <: AbstractVector} + r = rank(hlrf) + d = Int(length(x) / r - (r + 1) / 2) # l = r^2 + d*r + + # Ambikasaran, O'Neil, Singh 2016 + L = flat_to_chol(x[1:Int(r * (r + 1) / 2)]) + U = reshape(x[Int(r * (r + 1) / 2 + 1):end], d, r) + K = L * permutedims(L, (2, 1)) + get_eps(hlrf) * I + + qrU = qr(U) + svdT = svd(I + qrU.R * K * qrU.R') # square so T = Q S Q^t + M = svdT.V * Diagonal(sqrt.(svdT.S)) # M M^t = Q sqrt(S) (Q sqrt(S))^t = Q sqrt(S) sqrt(S) Q^t = T + X = M - I + update_factor = I + qrU.Q * X * qrU.Q' + + return update_factor * update_factor' +end + + +build_default_prior(name::SS, n_hp::Int, odf::OneDimFactor) where {SS <: AbstractString} = nothing + +function build_default_prior(name::SS, n_hp::Int, df::DiagonalFactor) where {SS <: AbstractString} + return ParameterDistribution( + Dict( + "name" => "$(name)_diagonal", + "distribution" => VectorOfParameterized(repeat([Normal(0, 3)], n_hp)), + "constraint" => repeat([bounded_below(0.0)], n_hp), + ), + ) end +function build_default_prior(name::SS, n_hp::Int, df::CholeskyFactor) where {SS <: AbstractString} + return constrained_gaussian("$(name)_cholesky", 0.0, 5.0, -Inf, Inf, repeats = n_hp) +end + +function build_default_prior(name::SS, n_hp::Int, lrf::LowRankFactor) where {SS <: AbstractString} + r = rank(lrf) + d = Int(n_hp / r - 1) + D = ParameterDistribution( + Dict( + "name" => "$(name)_lowrank_diagonal", + "distribution" => VectorOfParameterized(repeat([Normal(0, 3)], r)), + "constraint" => repeat([bounded_below(0.0)], r), + ), + ) + U = constrained_gaussian("$(name)_lowrank_U", 0.0, 100.0, -Inf, Inf, repeats = Int(d * r)) + return combine_distributions([D, U]) +end + +function build_default_prior(name::SS, n_hp::Int, 
hlrf::HierarchicalLowRankFactor) where {SS <: AbstractString} + r = rank(hlrf) + d = Int(n_hp / r - (r + 1) / 2) + L = constrained_gaussian("$(name)_lowrank_Kchol", 0.0, 5.0, -Inf, Inf, repeats = Int(r * (r + 1) / 2)) + U = constrained_gaussian("$(name)_lowrank_U", 0.0, 100.0, -Inf, Inf, repeats = Int(d * r)) + return combine_distributions([L, U]) +end + +# combining input and output spaces: + """ $(DocStringExtensions.TYPEDSIGNATURES) -Builds a prior over the hyperparameters (i.e. the cholesky/diagaonal or individaul entries of the input/output covariances). -For example, the case where the input covariance ``U = γ_1 * (LL^T + γ_2 I)``, -we set priors for the entries of the lower triangular matrix ``L`` as normal, and constant scalings ``γ_i`` as log-normal to retain positivity. - -priors can be scaled by a constant factor by using the in_scale and out_scale keywords +Calculate number of hyperparameters required to create the default prior in the given input/output space dimensions (determined from the `CovarianceStructureType`) """ -function build_default_prior( +function calculate_n_hyperparameters( input_dim::Int, output_dim::Int, - diagonalize_input::Bool, - diagonalize_output::Bool; - in_scale = 1.0, - out_scale = 1.0, -) - n_hp_in = n_hyperparameters_cov(input_dim, diagonalize_input) - n_hp_out = n_hyperparameters_cov(output_dim, diagonalize_output) - - # aspect ratio for mean:sd for positive parameters in constrained_gaussian() - # 10 seems to give a decent range, this isn't very tuneable so the alternative would be to use the basic constructor - pos_asp_ratio = 10 - # if dim = 1 , positive constant - # elseif diagonalized, positive on diagonal and positive scalings - # else chol factor, and positive scalings - if input_dim > 1 - if diagonalize_input - param_diag = - constrained_gaussian("input_prior_diagonal", 1.0, pos_asp_ratio, 0.0, Inf, repeats = n_hp_in - 2) - param_scaling = - constrained_gaussian("input_prior_scaling", in_scale, in_scale * pos_asp_ratio, 0.0, Inf, repeats = 2) - input_prior = combine_distributions([param_diag, param_scaling]) - else - param_chol = constrained_gaussian("input_prior_cholesky", 0.0, 1.0, -Inf, Inf, repeats = n_hp_in - 2) - param_scaling = - constrained_gaussian("input_prior_scaling", in_scale, in_scale * pos_asp_ratio, 0.0, Inf, repeats = 2) - input_prior = combine_distributions([param_chol, param_scaling]) - end - else - input_prior = constrained_gaussian("input_prior", in_scale, in_scale, 0.0, Inf) + kernel_structure::SK, +) where {SK <: SeparableKernel} + + input_cov_structure = get_input_cov_structure(kernel_structure) + n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) + output_cov_structure = get_output_cov_structure(kernel_structure) + n_hp_out = calculate_n_hyperparameters(output_dim, output_cov_structure) + n_scalings = 1 + + # if both in and output dim are "OneDimFactors" we still learn 1 hp + if n_hp_in + n_hp_out == 0 + return 1 + n_scalings end - if output_dim > 1 - if diagonalize_output - param_diag = - constrained_gaussian("output_prior_diagonal", 1.0, pos_asp_ratio, 0.0, Inf, repeats = n_hp_out - 2) - param_scaling = constrained_gaussian( - "output_prior_scaling", - out_scale, - out_scale * pos_asp_ratio, - 0.0, - Inf, - repeats = 2, - ) - output_prior = combine_distributions([param_diag, param_scaling]) - else - param_chol = constrained_gaussian("output_prior_cholesky", 0.0, 1.0, -Inf, Inf, repeats = n_hp_out - 2) - param_scaling = constrained_gaussian( - "output_prior_scaling", - out_scale, - out_scale * 
pos_asp_ratio, - 0.0, - Inf, - repeats = 2, - ) - output_prior = combine_distributions([param_chol, param_scaling]) - end - else - output_prior = constrained_gaussian("output_prior", out_scale, 10 * out_scale, 0.0, Inf) - end - - return combine_distributions([input_prior, output_prior]) - + return n_hp_in + n_hp_out + n_scalings end -function build_default_prior(input_dim::Int, diagonalize_input::Bool; in_scale = 1.0) - #scalar case consistent with vector case +function calculate_n_hyperparameters(input_dim::Int, kernel_structure::KST) where {KST <: KernelStructureType} output_dim = 1 - diagonalize_output = false - out_scale = 1.0 - return build_default_prior( - input_dim, - output_dim, - diagonalize_input, - diagonalize_output, - in_scale = in_scale, - out_scale = out_scale, - ) + return calculate_n_hyperparameters(input_dim, output_dim, kernel_structure) end -""" -$(DocStringExtensions.TYPEDSIGNATURES) +function calculate_n_hyperparameters( + input_dim::Int, + output_dim::Int, + kernel_structure::NK, +) where {NK <: NonseparableKernel} + cov_structure = get_cov_structure(kernel_structure) + n_hp = calculate_n_hyperparameters(input_dim * output_dim, cov_structure) + n_scalings = 1 + # if both in and output dim are "OneDimFactors" we still learn 1 hp + if n_hp == 0 + return 1 + n_scalings + end -The number of hyperparameters required to characterize covariance (determined from `hyperparameters_from_flat`). -""" -function n_hyperparameters_cov(dim::Int, diagonalize::Bool) - n_hp = 0 - n_hp += diagonalize ? dim : Int(dim * (dim + 1) / 2) # inputs: diagonal or cholesky - n_hp += (dim > 1) ? 2 : 0 # add two more for constant diagonal in each case. - return n_hp + return n_hp + n_scalings end + + """ $(DocStringExtensions.TYPEDSIGNATURES) -Calculate number of hyperparameters required to create the default prior in the given input/output space (determined from `hyperparameters_from_flat`) +Builds a prior over the hyperparameters (i.e. the low-rank/cholesky/diagaonal or individaul entries of the input/output covariances). +For example, the case where the input covariance ``U = γ_1 * (LL^T + γ_2 I)``, +we set priors for the entries of the lower triangular matrix ``L`` as normal, and constant scalings ``γ_i`` as log-normal to retain positivity. 
""" -function calculate_n_hyperparameters(input_dim::Int, output_dim::Int, diagonalize_input::Bool, diagonalize_output::Bool) - n_hp_in = n_hyperparameters_cov(input_dim, diagonalize_input) - n_hp_out = n_hyperparameters_cov(output_dim, diagonalize_output) - return n_hp_in + n_hp_out +function build_default_prior(input_dim::Int, output_dim::Int, kernel_structure::SK) where {SK <: SeparableKernel} + input_cov_structure = get_input_cov_structure(kernel_structure) + output_cov_structure = get_output_cov_structure(kernel_structure) + + n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) + input_prior = build_default_prior("input", n_hp_in, input_cov_structure) + n_hp_out = calculate_n_hyperparameters(output_dim, output_cov_structure) + output_prior = build_default_prior("output", n_hp_out, output_cov_structure) + + scaling_kern = constrained_gaussian("sigma", 1, 5, 0, Inf) + + # We only use OneDimFactor values, default to input if both in and output dimension are one dimensional.Otherwise they are ignored + if isnothing(input_prior) && isnothing(output_prior) + onedim_prior = constrained_gaussian("onedim", 1.0, 1.0, 0.0, Inf) + return combine_distributions([onedim_prior, scaling_kern]) + elseif isnothing(input_prior) + return combine_distributions([output_prior, scaling_kern]) + elseif isnothing(output_prior) + return combine_distributions([input_prior, scaling_kern]) + else + return combine_distributions([input_prior, output_prior, scaling_kern]) + end +end + +function build_default_prior(input_dim::Int, kernel_structure::KST) where {KST <: KernelStructureType} + output_dim = 1 + return build_default_prior(input_dim, output_dim, kernel_structure) +end + +function build_default_prior(input_dim::Int, output_dim::Int, kernel_structure::NK) where {NK <: NonseparableKernel} + cov_structure = get_cov_structure(kernel_structure) + n_hp = calculate_n_hyperparameters(input_dim * output_dim, cov_structure) + if n_hp == 0 + pd_kern = constrained_gaussian("onedim", 1.0, 1.0, 0.0, Inf) + else + pd_kern = build_default_prior("full", n_hp, cov_structure) + end + + scaling_kern = constrained_gaussian("sigma", 1, 5, 0, Inf) + return combine_distributions([pd_kern, scaling_kern]) +end + +function hyperparameters_from_flat( + x::VV, + input_dim::Int, + output_dim::Int, + kernel_structure::SK, +) where {VV <: AbstractVector, SK <: SeparableKernel} + input_cov_structure = get_input_cov_structure(kernel_structure) + output_cov_structure = get_output_cov_structure(kernel_structure) + + n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) + U = hyperparameters_from_flat(x[1:n_hp_in], input_cov_structure) + V = hyperparameters_from_flat(x[(n_hp_in + 1):end], output_cov_structure) + + if isnothing(U) && isnothing(V) #i.e. 
both are from OneDimFactor cov structures + return x[1] * ones(1, 1), V + elseif isnothing(U) + return U, 0.5 * (V + permutedims(V, (2, 1))) + elseif isnothing(V) + return 0.5 * (U + permutedims(U, (2, 1))), V + else + # symmetrize (sometimes there are numerical artifacts) + U = 0.5 * (U + permutedims(U, (2, 1))) + V = 0.5 * (V + permutedims(V, (2, 1))) + return U, V + end end -function calculate_n_hyperparameters(input_dim::Int, diagonalize_input::Bool) - #scalar case consistent with vector case +function hyperparameters_from_flat( + x::VV, + input_dim::Int, + kernel_structure::SK, +) where {VV <: AbstractVector, SK <: SeparableKernel} output_dim = 1 - diagonalize_output = false - return calculate_n_hyperparameters(input_dim, output_dim, diagonalize_input, diagonalize_output) + U, V = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) + + return U end +function hyperparameters_from_flat( + x::VV, + input_dim::Int, + output_dim::Int, + kernel_structure::NK, +) where {VV <: AbstractVector, NK <: NonseparableKernel} + cov_structure = get_cov_structure(kernel_structure) + C = hyperparameters_from_flat(x, cov_structure) + if isnothing(C) + return x[1] * ones(1, 1) + else + return 0.5 * (C + permutedims(C, (2, 1))) + end + +end """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -252,7 +403,7 @@ function calculate_mean_cov_and_coeffs( otest = get_outputs(io_pairs)[:, (n_train + 1):end] input_dim = size(itrain, 1) output_dim = size(otrain, 1) - + n_test = size(itest, 2) # build and fit the RF rfm = RFM_from_hyperparameters( rfi, @@ -278,11 +429,15 @@ function calculate_mean_cov_and_coeffs( buffer, tullio_threading = thread_opt, ) + # sizes (output_dim x n_test), (output_dim x output_dim x n_test) - scaled_coeffs = sqrt(1 / n_features) * RF.Methods.get_coeffs(fitted_features) - # we add the additional complexity term - # TODO only cholesky and SVD available + ## TODO - the theory states that the following should be set: + # scaled_coeffs = sqrt(1 / (n_features)) * RF.Methods.get_coeffs(fitted_features) + # However the convergence is much improved with setting this to zero: + scaled_coeffs = 0 + + if decomp_type == "cholesky" chol_fac = RF.Methods.get_decomposition(RF.Methods.get_feature_factors(fitted_features)).L complexity = 2 * sum(log(chol_fac[i, i]) for i in 1:size(chol_fac, 1)) @@ -290,10 +445,47 @@ function calculate_mean_cov_and_coeffs( svd_singval = RF.Methods.get_decomposition(RF.Methods.get_feature_factors(fitted_features)).S complexity = sum(log, svd_singval) # note this is log(abs(det)) end + complexity = sqrt(abs(complexity)) #abs can introduce nonconvexity, + return scaled_coeffs, complexity end + + +""" +$(DocStringExtensions.TYPEDSIGNATURES) + +Calculate the empirical covariance, additionally applying a shrinkage operator (here the Ledoit Wolf 2004 shrinkage operation). 
Known to have better stability properties than Monte-Carlo for low sample sizes +""" +function shrinkage_cov(sample_mat::AA) where {AA <: AbstractMatrix} + n_out, n_sample = size(sample_mat) + + # de-mean (as we will use the samples directly for calculation of β) + sample_mat_zeromean = sample_mat .- mean(sample_mat, dims = 2) + # Ledoit Wolf shrinkage to I + + # get sample covariance + Γ = cov(sample_mat_zeromean, dims = 2) + # estimate opt shrinkage + μ_shrink = 1 / n_out * tr(Γ) + δ_shrink = norm(Γ - μ_shrink * I)^2 / n_out # (scaled) frob norm of Γ_m + #once de-meaning, we need to correct the sample covariance with an n_sample -> n_sample-1 + β_shrink = sum([norm(c * c' - -Γ)^2 / n_out for c in eachcol(sample_mat_zeromean)]) / (n_sample - 1)^2 + + γ_shrink = min(β_shrink / δ_shrink, 1) # clipping is typically rare + # γμI + (1-γ)Γ + Γ .*= (1 - γ_shrink) + for i in 1:n_out + Γ[i, i] += γ_shrink * μ_shrink + end + + @info "Shrinkage scale: $(γ_shrink), (0 = none, 1 = revert to scaled Identity)\n shrinkage covariance condition number: $(cond(Γ))" + return Γ +end + + + """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -377,14 +569,19 @@ function estimate_mean_and_coeffnorm_covariance( blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) end + sample_mat = vcat(blockmeans, coeffl2norm, complexity) + shrinkage = true + if shrinkage + Γ = shrinkage_cov(sample_mat) + else + Γ = cov(sample_mat, dims = 2) + end + if !isposdef(approx_σ2) println("approx_σ2 not posdef") approx_σ2 = posdef_correct(approx_σ2) end - Γ = cov(vcat(blockmeans, coeffl2norm, complexity), dims = 2) - - return Γ, approx_σ2 end @@ -579,16 +776,27 @@ function estimate_mean_and_coeffnorm_covariance( blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) end + + sample_mat = vcat(blockmeans, coeffl2norm, complexity) + shrinkage = true + if shrinkage + Γ = shrinkage_cov(sample_mat) + else + Γ = cov(sample_mat, dims = 2) + end + if !isposdef(approx_σ2) println("approx_σ2 not posdef") approx_σ2 = posdef_correct(approx_σ2) end - Γ = cov(vcat(blockmeans, coeffl2norm, complexity), dims = 2) return Γ, approx_σ2 end + + + function calculate_ensemble_mean_and_coeffnorm( rfi::RFI, rng::RNG, @@ -622,11 +830,6 @@ function calculate_ensemble_mean_and_coeffnorm( println("calculating " * string(N_ens * repeats) * " ensemble members...") nthreads = Threads.nthreads() - - means = zeros(output_dim, N_ens, n_test) - complexity = zeros(1, N_ens) - coeffl2norm = zeros(1, N_ens) - c_list = [zeros(1, N_ens) for i in 1:nthreads] cp_list = [zeros(1, N_ens) for i in 1:nthreads] m_list = [zeros(output_dim, N_ens, n_test) for i in 1:nthreads] diff --git a/src/ScalarRandomFeature.jl b/src/ScalarRandomFeature.jl index 0a3c0ae9c..a23586127 100644 --- a/src/ScalarRandomFeature.jl +++ b/src/ScalarRandomFeature.jl @@ -6,11 +6,12 @@ $(DocStringExtensions.TYPEDEF) Structure holding the Scalar Random Feature models. 
-# Fields +# Fields $(DocStringExtensions.TYPEDFIELDS) """ -struct ScalarRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG} <: RandomFeatureInterface +struct ScalarRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} <: + RandomFeatureInterface "vector of `RandomFeatureMethod`s, contains the feature structure, batch-sizes and regularization" rfms::Vector{RF.Methods.RandomFeatureMethod} "vector of `Fit`s, containing the matrix decomposition and coefficients of RF when fitted to data" @@ -23,8 +24,8 @@ struct ScalarRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG} <: input_dim::Int "choice of random number generator" rng::RNG - "Assume inputs are decorrelated for ML" - diagonalize_input::Bool + "Kernel structure type (e.g. Separable or Nonseparable)" + kernel_structure::KST "Random Feature decomposition, choose from \"svd\" or \"cholesky\" (default)" feature_decomposition::S "dictionary of options for hyperparameter optimizer" @@ -76,9 +77,9 @@ get_rng(srfi::ScalarRandomFeatureInterface) = srfi.rng """ $(DocStringExtensions.TYPEDSIGNATURES) -gets the diagonalize_input field +Gets the kernel_structure field """ -get_diagonalize_input(srfi::ScalarRandomFeatureInterface) = srfi.diagonalize_input +get_kernel_structure(srfi::ScalarRandomFeatureInterface) = srfi.kernel_structure """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -101,7 +102,7 @@ $(DocStringExtensions.TYPEDSIGNATURES) Constructs a `ScalarRandomFeatureInterface <: MachineLearningTool` interface for the `RandomFeatures.jl` package for multi-input and single- (or decorrelated-)output emulators. - `n_features` - the number of random features - `input_dim` - the dimension of the input space - - `diagonalize_input = false` - whether to assume independence in input space (Note, whens set `true`, this tool will approximate directly the behaviour of the scalar-valued GaussianProcess with ARD kernel) + - `kernel_structure` - a prescribed form of kernel structure - `batch_sizes = nothing` - Dictionary of batch sizes passed `RandomFeatures.jl` object (see definition there) - `rng = Random.GLOBAL_RNG` - random number generator - `feature_decomposition = "cholesky"` - choice of how to store decompositions of random features, `cholesky` or `svd` available @@ -110,7 +111,7 @@ Constructs a `ScalarRandomFeatureInterface <: MachineLearningTool` interface for - "prior_in_scale": use this to tune the input prior scale - "n_ensemble": number of ensemble members - "n_iteration": number of eki iterations - - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 1.0) + - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 0.0) - "scheduler": Learning rate Scheduler (a.k.a.
EKP timestepper) Default: DataMisfitController - "tikhonov": tikhonov regularization parameter if >0 - "inflation": additive inflation ∈ [0,1] with 0 being no inflation @@ -121,22 +122,23 @@ Constructs a `ScalarRandomFeatureInterface <: MachineLearningTool` interface for function ScalarRandomFeatureInterface( n_features::Int, input_dim::Int; - diagonalize_input::Bool = false, + kernel_structure::Union{KST, Nothing} = nothing, batch_sizes::Union{Dict{S, Int}, Nothing} = nothing, rng::RNG = Random.GLOBAL_RNG, feature_decomposition::S = "cholesky", optimizer_options::Union{Dict{S}, Nothing} = nothing, -) where {S <: AbstractString, RNG <: AbstractRNG} +) where {S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} # Initialize vector for GP models rfms = Vector{RF.Methods.RandomFeatureMethod}(undef, 0) fitted_features = Vector{RF.Methods.Fit}(undef, 0) - if !isnothing(optimizer_options) - prior_in_scale = "prior_in_scale" ∈ keys(optimizer_options) ? optimizer_options["prior_in_scale"] : 1.0 - else - prior_in_scale = 1.0 + if isnothing(kernel_structure) + kernel_structure = SeparableKernel(cov_structure_from_string("lowrank", input_dim), OneDimFactor()) end - prior = build_default_prior(input_dim, diagonalize_input, in_scale = prior_in_scale) + KSType = typeof(kernel_structure) + + prior = build_default_prior(input_dim, kernel_structure) + # default optimizer settings optimizer_opts = Dict( "prior" => prior, #the hyperparameter_prior @@ -164,23 +166,60 @@ function ScalarRandomFeatureInterface( end @info("hyperparameter optimization with EKI configured with $opt_tmp") - return ScalarRandomFeatureInterface{S, RNG}( + return ScalarRandomFeatureInterface{S, RNG, KSType}( rfms, fitted_features, batch_sizes, n_features, input_dim, rng, - diagonalize_input, + kernel_structure, feature_decomposition, optimizer_opts, ) end +function hyperparameter_distribution_from_flat( + x::VV, + input_dim::Int, + kernel_structure::SK, +) where {VV <: AbstractVector, SK <: SeparableKernel} + + M = zeros(input_dim) #scalar output + U = hyperparameters_from_flat(x, input_dim, kernel_structure) + if !isposdef(U) + println("U not posdef - correcting") + U = posdef_correct(U) + end + + dist = MvNormal(M, U) + pd = ParameterDistribution( + Dict( + "distribution" => Parameterized(dist), + "constraint" => repeat([no_constraint()], input_dim), + "name" => "xi", + ), + ) + + return pd +end + +function hyperparameter_distribution_from_flat( + x::VV, + input_dim::Int, + kernel_structure::NK, +) where {VV <: AbstractVector, NK <: NonseparableKernel} + throw( + ArgumentError( + "Scalar Kernels must be of type `Separable( *** , OneDimFactor())`, received $(kernel_structure)", + ), + ) +end + """ $(DocStringExtensions.TYPEDSIGNATURES) -Builds the random feature method from hyperparameters. We use cosine activation functions and a MatrixVariateNormal(M,U,V) distribution (from `Distributions.jl`) with mean M=0, and input covariance U built from cholesky factorization or diagonal components based on the diagonalize_input flag. +Builds the random feature method from hyperparameters. We use cosine activation functions and a MatrixVariateNormal(M,U,V) distribution (from `Distributions.jl`) with mean M=0, and input covariance U built using a `CovarianceStructureType`. 
""" function RFM_from_hyperparameters( srfi::ScalarRandomFeatureInterface, @@ -198,33 +237,22 @@ function RFM_from_hyperparameters( S <: AbstractString, MT <: MultithreadType, } - diagonalize_input = get_diagonalize_input(srfi) - M = zeros(input_dim) #scalar output - U = hyperparameters_from_flat(l, input_dim, diagonalize_input) - if !isposdef(U) - println("U not posdef - correcting") - U = posdef_correct(U) - end + xi_hp = l[1:(end - 1)] + sigma_hp = l[end] + kernel_structure = get_kernel_structure(srfi) + pd = hyperparameter_distribution_from_flat(xi_hp, input_dim, kernel_structure) - dist = MvNormal(M, U) - pd = ParameterDistribution( - Dict( - "distribution" => Parameterized(dist), - "constraint" => repeat([no_constraint()], input_dim), - "name" => "xi", - ), - ) feature_sampler = RF.Samplers.FeatureSampler(pd, rng = rng) # Learn hyperparameters for different feature types - - sf = RF.Features.ScalarFourierFeature(n_features, feature_sampler) + feature_parameters = Dict("sigma" => sigma_hp) + sff = RF.Features.ScalarFourierFeature(n_features, feature_sampler, feature_parameters = feature_parameters) thread_opt = isa(multithread_type, TullioThreading) # if we want to multithread with tullio if isnothing(batch_sizes) - return RF.Methods.RandomFeatureMethod(sf, regularization = regularization, tullio_threading = thread_opt) + return RF.Methods.RandomFeatureMethod(sff, regularization = regularization, tullio_threading = thread_opt) else return RF.Methods.RandomFeatureMethod( - sf, + sff, regularization = regularization, batch_sizes = batch_sizes, tullio_threading = thread_opt, @@ -254,7 +282,7 @@ RFM_from_hyperparameters( """ $(DocStringExtensions.TYPEDSIGNATURES) -Builds the random feature method from hyperparameters. We use cosine activation functions and a Multivariate Normal distribution (from `Distributions.jl`) with mean M=0, and input covariance U built from cholesky factorization or diagonal components based on the diagonalize_input flag. +Builds the random feature method from hyperparameters. We use cosine activation functions and a Multivariate Normal distribution (from `Distributions.jl`) with mean M=0, and input covariance U built with the `CovarianceStructureType`. """ function build_models!( srfi::ScalarRandomFeatureInterface, @@ -268,6 +296,10 @@ function build_models!( input_dim = size(input_values, 1) + kernel_structure = get_kernel_structure(srfi) + n_hp = calculate_n_hyperparameters(input_dim, kernel_structure) + + rfms = get_rfms(srfi) fitted_features = get_fitted_features(srfi) n_features = get_n_features(srfi) @@ -276,16 +308,19 @@ function build_models!( decomp_type = get_feature_decomposition(srfi) optimizer_options = get_optimizer_options(srfi) - #regularization = I = 1.0 in scalar case - regularization = I + # Optimize features with EKP for each output dim # [1.] Split data into test/train 80/20 train_fraction = optimizer_options["train_fraction"] n_train = Int(floor(train_fraction * n_data)) n_test = n_data - n_train - n_features_opt = min(n_features, Int(floor(2 * n_test))) #we take a specified number of features for optimization. + n_features_opt = min(n_features, Int(floor(10 * n_test)))#Int(floor(2 * n_test))) #we take a specified number of features for optimization. 
idx_shuffle = randperm(rng, n_data) + #regularization = I = 1.0 in scalar case + regularization = I + + @info ( "hyperparameter learning for $n_rfms models using $n_train training points, $n_test validation points and $n_features_opt features" ) @@ -311,11 +346,22 @@ function build_models!( end # [2.] Estimate covariance at mean value prior = optimizer_options["prior"] + + # where prior space has changed we need to rebuild the priors + if ndims(prior) > n_hp + + # comes from having a truncated output_dimension + # TODO not really a truncation here, resetting to default + @info "Original input space of dimension $(get_input_dim(srfi)) has been truncated to $(input_dim). \n Rebuilding prior... number of hyperparameters reduced from $(ndims(prior)) to $(n_hp)." + prior = build_default_prior(input_dim, kernel_structure) + + end + μ_hp = transform_unconstrained_to_constrained(prior, mean(prior)) cov_sample_multiplier = optimizer_options["cov_sample_multiplier"] n_cov_samples_min = n_test + 2 - n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 1.0))) + n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 0.0))) internal_Γ, approx_σ2 = estimate_mean_and_coeffnorm_covariance( srfi, @@ -331,9 +377,8 @@ function build_models!( decomp_type, multithread_type, ) - Γ = internal_Γ - Γ[1:n_test, 1:n_test] += approx_σ2 + Γ[1:n_test, 1:n_test] += regularization # + approx_σ2 Γ[(n_test + 1):end, (n_test + 1):end] += I # [3.] set up EKP optimization @@ -343,7 +388,8 @@ function build_models!( scheduler = optimizer_options["scheduler"] initial_params = construct_initial_ensemble(rng, prior, n_ensemble) - min_complexity = log(det(regularization)) + min_complexity = log(regularization.λ) + min_complexity = sqrt(abs(min_complexity)) data = vcat(get_outputs(io_pairs_opt)[(n_train + 1):end], 0.0, min_complexity) ekiobj = EKP.EnsembleKalmanProcess( initial_params, @@ -415,6 +461,7 @@ function build_models!( io_pairs_i = PairedDataContainer(input_values, reshape(output_values[i, :], 1, size(output_values, 2))) # Now, fit new RF model with the optimized hyperparameters + rfm_i = RFM_from_hyperparameters( srfi, rng, diff --git a/src/VectorRandomFeature.jl b/src/VectorRandomFeature.jl index bbc7807c5..5326d1678 100644 --- a/src/VectorRandomFeature.jl +++ b/src/VectorRandomFeature.jl @@ -8,8 +8,7 @@ export get_rfms, get_input_dim, get_output_dim, get_rng, - get_diagonalize_input, - get_diagonalize_output, + get_kernel_structure, get_optimizer_options """ @@ -21,7 +20,8 @@ Structure holding the Vector Random Feature models. $(DocStringExtensions.TYPEDFIELDS) """ -struct VectorRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG} <: RandomFeatureInterface +struct VectorRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} <: + RandomFeatureInterface "A vector of `RandomFeatureMethod`s, contains the feature structure, batch-sizes and regularization" rfms::Vector{RF.Methods.RandomFeatureMethod} "vector of `Fit`s, containing the matrix decomposition and coefficients of RF when fitted to data" @@ -38,10 +38,8 @@ struct VectorRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG} <: rng::RNG "regularization" regularization::Vector{Union{Matrix, UniformScaling, Diagonal}} - "Assume inputs are decorrelated for ML" - diagonalize_input::Bool - "Assume outputs are decorrelated for ML. Note: emulator(..., decorrelate=true) by default" - diagonalize_output::Bool + "Kernel structure type (e.g. 
Separable or Nonseparable)" + kernel_structure::KST "Random Feature decomposition, choose from \"svd\" or \"cholesky\" (default)" feature_decomposition::S "dictionary of options for hyperparameter optimizer" @@ -107,16 +105,9 @@ get_regularization(vrfi::VectorRandomFeatureInterface) = vrfi.regularization """ $(DocStringExtensions.TYPEDSIGNATURES) -Gets the diagonalize_input field +Gets the kernel_structure field """ -get_diagonalize_input(vrfi::VectorRandomFeatureInterface) = vrfi.diagonalize_input - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the diagonalize_output field -""" -get_diagonalize_output(vrfi::VectorRandomFeatureInterface) = vrfi.diagonalize_output +get_kernel_structure(vrfi::VectorRandomFeatureInterface) = vrfi.kernel_structure """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -139,8 +130,7 @@ Constructs a `VectorRandomFeatureInterface <: MachineLearningTool` interface for - `n_features` - the number of random features - `input_dim` - the dimension of the input space - `output_dim` - the dimension of the output space - - `diagonalize_input = false` - whether to assume independence in input space - - `diagonalize_output::Bool = false` - whether to assume independence in output space (even if set true, this is not the same as a list of scalar-valued models, as hyperparameters will be shared between models in this case) + - `kernel_structure` - - a prescribed form of kernel structure - `batch_sizes = nothing` - Dictionary of batch sizes passed `RandomFeatures.jl` object (see definition there) - `rng = Random.GLOBAL_RNG` - random number generator - `feature_decomposition = "cholesky"` - choice of how to store decompositions of random features, `cholesky` or `svd` available @@ -150,7 +140,7 @@ Constructs a `VectorRandomFeatureInterface <: MachineLearningTool` interface for - "n_ensemble": number of ensemble members - "n_iteration": number of eki iterations - "scheduler": Learning rate Scheduler (a.k.a. EKP timestepper) Default: DataMisfitController - - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 1.0) + - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 0.0) - "tikhonov": tikhonov regularization parameter if > 0 - "inflation": additive inflation ∈ [0,1] with 0 being no inflation - "train_fraction": e.g. 0.8 (default) means 80:20 train - test split @@ -162,35 +152,28 @@ function VectorRandomFeatureInterface( n_features::Int, input_dim::Int, output_dim::Int; - diagonalize_input::Bool = false, - diagonalize_output::Bool = false, + kernel_structure::Union{KST, Nothing} = nothing, batch_sizes::Union{Dict{S, Int}, Nothing} = nothing, rng::RNG = Random.GLOBAL_RNG, feature_decomposition::S = "cholesky", optimizer_options::Union{Dict{S}, Nothing} = nothing, -) where {S <: AbstractString, RNG <: AbstractRNG} +) where {S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} # Initialize vector for RF model rfms = Vector{RF.Methods.RandomFeatureMethod}(undef, 0) fitted_features = Vector{RF.Methods.Fit}(undef, 0) regularization = Vector{Union{Matrix, UniformScaling, Nothing}}(undef, 0) - #Optimization Defaults - if !isnothing(optimizer_options) - prior_in_scale = "prior_in_scale" ∈ keys(optimizer_options) ? optimizer_options["prior_in_scale"] : 1.0 - prior_out_scale = "prior_out_scale" ∈ keys(optimizer_options) ? 
optimizer_options["prior_out_scale"] : 1.0 - else - prior_in_scale = 1.0 - prior_out_scale = 1.0 + if isnothing(kernel_structure) + kernel_structure = SeparableKernel( + cov_structure_from_string("lowrank", input_dim), + cov_structure_from_string("lowrank", output_dim), + ) end - prior = build_default_prior( - input_dim, - output_dim, - diagonalize_input, - diagonalize_output, - in_scale = prior_in_scale, - out_scale = prior_out_scale, - ) + KSType = typeof(kernel_structure) + prior = build_default_prior(input_dim, output_dim, kernel_structure) + + #Optimization Defaults optimizer_opts = Dict( "prior" => prior, #the hyperparameter_prior (note scalings have already been applied) "n_ensemble" => max(ndims(prior) + 1, 10), #number of ensemble @@ -202,6 +185,7 @@ function VectorRandomFeatureInterface( "train_fraction" => 0.8, # 80:20 train - test split "multithread" => "ensemble", # instead of "tullio" "verbose" => false, # verbose optimizer statements + "localization" => EKP.Localizers.NoLocalization(), # localization / sample error correction for small ensembles ) if !isnothing(optimizer_options) @@ -218,7 +202,7 @@ function VectorRandomFeatureInterface( end @info("hyperparameter optimization with EKI configured with $opt_tmp") - return VectorRandomFeatureInterface{S, RNG}( + return VectorRandomFeatureInterface{S, RNG, KSType}( rfms, fitted_features, batch_sizes, @@ -227,17 +211,72 @@ function VectorRandomFeatureInterface( output_dim, rng, regularization, - diagonalize_input, - diagonalize_output, + kernel_structure, feature_decomposition, optimizer_opts, ) end + + +function hyperparameter_distribution_from_flat( + x::VV, + input_dim::Int, + output_dim::Int, + kernel_structure::SK, +) where {VV <: AbstractVector, SK <: SeparableKernel} + M = zeros(input_dim, output_dim) # n x p mean + + U, V = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) + + if !isposdef(U) + println("U not posdef - correcting") + U = posdef_correct(U) + end + if !isposdef(V) + println("V not posdef - correcting") + V = posdef_correct(V) + end + + dist = MatrixNormal(M, U, V) + pd = ParameterDistribution( + Dict( + "distribution" => Parameterized(dist), + "constraint" => repeat([no_constraint()], input_dim * output_dim), + "name" => "xi", + ), + ) + return pd +end + +function hyperparameter_distribution_from_flat( + x::VV, + input_dim::Int, + output_dim::Int, + kernel_structure::NK, +) where {VV <: AbstractVector, NK <: NonseparableKernel} + + C = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) + if !isposdef(C) + println("C not posdef - correcting") + C = posdef_correct(C) + end + dist = MvNormal(zeros(input_dim * output_dim), C) + pd = ParameterDistribution( + Dict( + "distribution" => Parameterized(dist), + "constraint" => repeat([no_constraint()], input_dim * output_dim), + "name" => "xi", + ), + ) + return pd +end + + """ $(DocStringExtensions.TYPEDSIGNATURES) -Builds the random feature method from hyperparameters. We use cosine activation functions and a Matrixvariate Normal distribution with mean M=0, and input(output) covariance U(V) built from cholesky factorization or diagonal components based on the diagonalize_input(diagonalize_output) flag. +Builds the random feature method from hyperparameters. We use cosine activation functions and a Matrixvariate Normal distribution with mean M=0, and input(output) covariance U(V) built with a `CovarianceStructureType`. 
""" function RFM_from_hyperparameters( vrfi::VectorRandomFeatureInterface, @@ -256,31 +295,22 @@ function RFM_from_hyperparameters( MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, MT <: MultithreadType, } - diagonalize_input = get_diagonalize_input(vrfi) - diagonalize_output = get_diagonalize_output(vrfi) - M = zeros(input_dim, output_dim) # n x p mean - U, V = hyperparameters_from_flat(l, input_dim, output_dim, diagonalize_input, diagonalize_output) + xi_hp = l[1:(end - 1)] + sigma_hp = l[end] - if !isposdef(U) - println("U not posdef - correcting") - U = posdef_correct(U) - end - if !isposdef(V) - println("V not posdef - correcting") - V = posdef_correct(V) - end + kernel_structure = get_kernel_structure(vrfi) + pd = hyperparameter_distribution_from_flat(xi_hp, input_dim, output_dim, kernel_structure) - dist = MatrixNormal(M, U, V) - pd = ParameterDistribution( - Dict( - "distribution" => Parameterized(dist), - "constraint" => repeat([no_constraint()], input_dim * output_dim), - "name" => "xi", - ), - ) feature_sampler = RF.Samplers.FeatureSampler(pd, output_dim, rng = rng) - vff = RF.Features.VectorFourierFeature(n_features, output_dim, feature_sampler) + + feature_parameters = Dict("sigma" => sigma_hp) + vff = RF.Features.VectorFourierFeature( + n_features, + output_dim, + feature_sampler, + feature_parameters = feature_parameters, + ) thread_opt = isa(multithread_type, TullioThreading) # if we want to multithread with tullio if isnothing(batch_sizes) @@ -314,10 +344,10 @@ function build_models!( input_dim = size(input_values, 1) output_dim = size(output_values, 1) - # there are less hyperparameters when we diagonalize - diagonalize_input = get_diagonalize_input(vrfi) - diagonalize_output = get_diagonalize_output(vrfi) - n_hp = calculate_n_hyperparameters(input_dim, output_dim, diagonalize_input, diagonalize_output) + + kernel_structure = get_kernel_structure(vrfi) + + n_hp = calculate_n_hyperparameters(input_dim, output_dim, kernel_structure) rfms = get_rfms(vrfi) fitted_features = get_fitted_features(vrfi) @@ -340,12 +370,13 @@ function build_models!( prior = optimizer_options["prior"] rng = get_rng(vrfi) + # where prior space has changed we need to rebuild the priors [TODO just build them here in the first place?] if ndims(prior) > n_hp + # comes from having a truncated output_dimension # TODO not really a truncation here, resetting to default - @info "truncating hyperparameter space to default in first $n_hp dimensions, due to truncation of output space" - prior = constrained_gaussian("cholesky_factors", 1.0, 1.0, 0.0, Inf, repeats = n_hp) - + @info "Original input-output space of dimension ($(get_input_dim(vrfi)), $(get_output_dim(vrfi))) has been truncated to ($(input_dim), $(output_dim)). \n Rebuilding prior... number of hyperparameters reduced from $(ndims(prior)) to $(n_hp)." + prior = build_default_prior(input_dim, output_dim, kernel_structure) end # Optimize feature cholesky factors with EKP @@ -353,12 +384,20 @@ function build_models!( train_fraction = optimizer_options["train_fraction"] n_train = Int(floor(train_fraction * n_data)) # 20% split n_test = n_data - n_train - if diagonalize_output - n_features_opt = min(n_features, Int(floor(4 * n_train))) #we take a specified number of features for optimization. 
MAGIC NUMBER + + # there are less hyperparameters when we diagonalize + if nameof(typeof(kernel_structure)) == :SeparableKernel + if nameof(typeof(get_output_cov_structure(kernel_structure))) == :DiagonalFactor + n_features_opt = min(n_features, Int(floor(4 * n_train))) #we take a specified number of features for optimization. MAGIC NUMBER + else + # note the n_features_opt should NOT exceed output_dim * n_train or the regularization is replaced with a diagonal approx. + n_features_opt = max(min(n_features, Int(floor(4 * sqrt(n_train * output_dim)))), Int(floor(1.9 * n_train)))#we take a specified number of features for optimization. MAGIC NUMBER + end else # note the n_features_opt should NOT exceed output_dim * n_train or the regularization is replaced with a diagonal approx. n_features_opt = max(min(n_features, Int(floor(4 * sqrt(n_train * output_dim)))), Int(floor(1.9 * n_train)))#we take a specified number of features for optimization. MAGIC NUMBER end + @info ( "hyperparameter learning using $n_train training points, $n_test validation points and $n_features_opt features" ) @@ -368,10 +407,11 @@ function build_models!( # in this setting, noise = I if regularization_matrix === nothing regularization = I + else reg_mat = regularization_matrix if !isposdef(regularization_matrix) - reg_mat = posdef_correct(regularization_matrix) + regularization = posdef_correct(regularization_matrix) println("RF regularization matrix is not positive definite, correcting") else @@ -382,7 +422,6 @@ function build_models!( end end - idx_shuffle = randperm(rng, n_data) io_pairs_opt = PairedDataContainer( @@ -394,12 +433,16 @@ function build_models!( μ_hp = transform_unconstrained_to_constrained(prior, mean(prior)) cov_sample_multiplier = optimizer_options["cov_sample_multiplier"] - if diagonalize_output - n_cov_samples_min = n_test + 2 # diagonal case + if nameof(typeof(kernel_structure)) == :SeparableKernel + if nameof(typeof(get_output_cov_structure(kernel_structure))) == :DiagonalFactor + n_cov_samples_min = n_test + 2 # diagonal case + else + n_cov_samples_min = (n_test * output_dim + 2) + end else - n_cov_samples_min = (n_test * output_dim + 2) # min required for + n_cov_samples_min = (n_test * output_dim + 2) end - n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 1.0))) + n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 0.0))) internal_Γ, approx_σ2 = estimate_mean_and_coeffnorm_covariance( vrfi, @@ -420,13 +463,15 @@ function build_models!( if tikhonov_opt_val == 0 # Build the covariance Γ = internal_Γ - Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += approx_σ2 + Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += regularization # + approx_σ2 Γ[(n_test * output_dim + 1):end, (n_test * output_dim + 1):end] += I #in diag case we have data logdet = λ^m, in non diag case we have logdet(Λ^) to match the different reg matrices. min_complexity = isa(regularization, UniformScaling) ? 
n_features_opt * log(regularization.λ) : n_features_opt / output_dim * 2 * sum(log.(diag(cholesky(regularization).L))) + min_complexity = sqrt(abs(min_complexity)) + data = vcat(reshape(get_outputs(io_pairs_opt)[:, (n_train + 1):end], :, 1), 0.0, min_complexity) #flatten data @@ -435,7 +480,7 @@ function build_models!( outsize = size(internal_Γ, 1) Γ = zeros(outsize + n_hp, outsize + n_hp) Γ[1:outsize, 1:outsize] = internal_Γ - Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += approx_σ2 + Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += approx_σ2 + regularization Γ[(n_test * output_dim + 1):outsize, (n_test * output_dim + 1):outsize] += I Γ[(outsize + 1):end, (outsize + 1):end] = tikhonov_opt_val .* cov(prior) @@ -444,6 +489,7 @@ function build_models!( min_complexity = isa(regularization, UniformScaling) ? n_features_opt * log(regularization.λ) : n_features_opt / output_dim * 2 * sum(log.(diag(cholesky(regularization).L))) + min_complexity = sqrt(abs(min_complexity)) data = vcat( reshape(get_outputs(io_pairs_opt)[:, (n_train + 1):end], :, 1), @@ -464,6 +510,7 @@ function build_models!( n_iteration = optimizer_options["n_iteration"] opt_verbose_flag = optimizer_options["verbose"] scheduler = optimizer_options["scheduler"] + localization = optimizer_options["localization"] if !isposdef(Γ) Γ = posdef_correct(Γ) end @@ -478,6 +525,7 @@ function build_models!( scheduler = scheduler, rng = rng, verbose = opt_verbose_flag, + localization_method = localization, ) err = zeros(n_iteration) @@ -527,6 +575,7 @@ function build_models!( # [5.] extract optimal hyperparameters hp_optimal = get_ϕ_mean_final(prior, ekiobj)[:] + # Now, fit new RF model with the optimized hyperparameters if opt_verbose_flag names = get_name(prior) @@ -538,6 +587,7 @@ function build_models!( pcic = [(pci_constrained[1][i], pci_constrained[2][i]) for i in 1:length(pci_constrained[1])] pcic_batched = [pcic[b][1] for b in batch(prior)] @info("EKI Optimization result:") + println( display( [ @@ -547,6 +597,7 @@ function build_models!( ), ) end + rfm_tmp = RFM_from_hyperparameters( vrfi, rng, @@ -596,7 +647,8 @@ function predict(vrfi::VectorRandomFeatureInterface, new_inputs::M) where {M <: N_samples = size(new_inputs, 2) # Predicts columns of inputs: input_dim × N_samples - μ, σ2 = RF.Methods.predict(rfm, ff, DataContainer(new_inputs), tullio_threading = tullio_threading) + μ, σ2 = RF.Methods.predict(rfm, ff, DataContainer(new_inputs), tullio_threading = "tullio") + # μ, σ2 = RF.Methods.predict(rfm, ff, DataContainer(new_inputs), tullio_threading = tullio_threading) # sizes (output_dim x n_test), (output_dim x output_dim x n_test) # add the noise contribution from the regularization # note this is because we are predicting the data here, not the latent function. @@ -608,6 +660,5 @@ function predict(vrfi::VectorRandomFeatureInterface, new_inputs::M) where {M <: σ2[:, :, i] = posdef_correct(σ2[:, :, i]) end end - return μ, σ2 end diff --git a/test/RandomFeature/runtests.jl b/test/RandomFeature/runtests.jl index b5c04d0f9..5a7a45b89 100644 --- a/test/RandomFeature/runtests.jl +++ b/test/RandomFeature/runtests.jl @@ -14,40 +14,137 @@ rng = Random.MersenneTwister(seed) @testset "RandomFeatures" begin + @testset "hyperparameter prior interface" begin + + # [1. 
] CovarianceStructures + # Costruction + eps = 1e-4 + r = 3 + odf = OneDimFactor() + @test typeof(odf) <: CovarianceStructureType + df = DiagonalFactor(eps) + @test typeof(df) <: CovarianceStructureType + @test get_eps(df) == eps + @test get_eps(DiagonalFactor()) == Float64(1.0) + cf = CholeskyFactor(eps) + @test typeof(cf) <: CovarianceStructureType + @test get_eps(cf) == eps + @test get_eps(CholeskyFactor()) == Float64(1.0) + lrf = LowRankFactor(r, eps) + @test typeof(lrf) <: CovarianceStructureType + @test rank(lrf) == r + @test get_eps(lrf) == eps + @test get_eps(LowRankFactor(r)) == Float64(1.0) + hlrf = HierarchicalLowRankFactor(r, eps) + @test typeof(hlrf) <: CovarianceStructureType + @test rank(hlrf) == r + @test get_eps(hlrf) == eps + @test get_eps(HierarchicalLowRankFactor(r)) == Float64(1.0) + + # calculate_n_hyperparameters + d = 6 + @test calculate_n_hyperparameters(d, odf) == 0 + @test calculate_n_hyperparameters(d, df) == d + @test calculate_n_hyperparameters(d, cf) == Int(d * (d + 1) / 2) + @test calculate_n_hyperparameters(d, lrf) == Int(r + d * r) + @test calculate_n_hyperparameters(d, hlrf) == Int(r * (r + 1) / 2 + d * r) + + # hyperparameters_from_flat - only check shape + @test isnothing(hyperparameters_from_flat([1], odf)) + for factor in (df, cf, lrf, hlrf) + x = ones(calculate_n_hyperparameters(d, factor)) + @test size(hyperparameters_from_flat(x, factor)) == (d, d) + end + + # build_default_prior - only check size of distribution + name = "test_name" + @test isnothing(build_default_prior(name, 0, odf)) + for factor in (df, cf, lrf, hlrf) + n_hp = calculate_n_hyperparameters(d, factor) + prior = build_default_prior(name, n_hp, factor) + @test ndims(prior) == n_hp + end + + # [2. ] Kernel Structures + d = 6 + p = 3 + c_in = lrf + c_out = cf + k_sep = SeparableKernel(c_in, c_out) + @test get_input_cov_structure(k_sep) == c_in + @test get_output_cov_structure(k_sep) == c_out + + k_nonsep = NonseparableKernel(c_in) + @test get_cov_structure(k_nonsep) == c_in + + + # calculate_n_hyperparameters + @test calculate_n_hyperparameters(d, p, k_sep) == + calculate_n_hyperparameters(d, c_in) + calculate_n_hyperparameters(p, c_out) + 1 + @test calculate_n_hyperparameters(d, p, k_nonsep) == calculate_n_hyperparameters(d * p, c_in) + 1 + k_sep1d = SeparableKernel(odf, odf) + k_nonsep1d = NonseparableKernel(odf) + @test calculate_n_hyperparameters(1, 1, k_sep1d) == 2 + @test calculate_n_hyperparameters(1, 1, k_nonsep1d) == 2 + + # hyper_parameters_from_flat: not applied with scaling hyperparameter + x = ones(calculate_n_hyperparameters(d, c_in) + calculate_n_hyperparameters(p, c_out)) + @test size(hyperparameters_from_flat(x, d, p, k_sep)[1]) == (d, d) + @test size(hyperparameters_from_flat(x, d, p, k_sep)[2]) == (p, p) + + x = ones(calculate_n_hyperparameters(d * p, c_in)) + @test size(hyperparameters_from_flat(x, d, p, k_nonsep)) == (d * p, d * p) + + x = [1] # in the 1D case, return(U,V) = (1x1 matrix, nothing) + @test size(hyperparameters_from_flat(x, 1, 1, k_sep1d)[1]) == (1, 1) + @test isnothing(hyperparameters_from_flat(x, 1, 1, k_sep1d)[2]) + @test size(hyperparameters_from_flat(x, 1, 1, k_nonsep1d)) == (1, 1) + + # build_default_prior + @test ndims(build_default_prior(d, p, k_sep)) == + ndims(build_default_prior("input", calculate_n_hyperparameters(d, c_in), c_in)) + + ndims(build_default_prior("output", calculate_n_hyperparameters(p, c_out), c_out)) + + 1 + @test ndims(build_default_prior(d, p, k_nonsep)) == + ndims(build_default_prior("full", calculate_n_hyperparameters(d 
+
+    @test ndims(build_default_prior(1, 1, k_sep1d)) == 2
+    @test ndims(build_default_prior(1, 1, k_nonsep1d)) == 2
+
+end
+
 @testset "ScalarRandomFeatureInterface" begin
     input_dim = 2
     n_features = 200
     batch_sizes = Dict("train" => 100, "test" => 100, "feature" => 100)
-    diagonalize_input = false
     #build interface
-    n_input_hp = Int(input_dim * (input_dim + 1) / 2) + 2 # see calculate_n_hyperparameters for details
-    n_output_hp = 1
+
     # prior built from:
-    # input: γ₁*(Cholesky_factor + γ₂ * I )
-    # output: γ₃
-    # where γᵢ positive
-    prior = build_default_prior(input_dim, diagonalize_input)
+    eps = 1e-8
+    kernel_structure = SeparableKernel(CholeskyFactor(eps), OneDimFactor()) # Cholesky factorized input, 1D output
+    prior = build_default_prior(input_dim, kernel_structure)
     optimizer_options = Dict(
         "prior" => prior,
         "n_ensemble" => max(ndims(prior) + 1, 10),
         "n_iteration" => 5,
+        "scheduler" => DataMisfitController(),
         "cov_sample_multiplier" => 10.0,
         "inflation" => 1e-4,
         "train_fraction" => 0.8,
         "multithread" => "ensemble",
         "verbose" => false,
-        "scheduler" => DataMisfitController(),
     )
     srfi = ScalarRandomFeatureInterface(
         n_features,
         input_dim,
+        kernel_structure = kernel_structure,
         batch_sizes = batch_sizes,
         rng = rng,
-        diagonalize_input = diagonalize_input,
         optimizer_options = optimizer_options,
     )
@@ -57,17 +154,19 @@ rng = Random.MersenneTwister(seed)
     @test get_n_features(srfi) == n_features
     @test get_input_dim(srfi) == input_dim
     @test get_rng(srfi) == rng
-    @test get_diagonalize_input(srfi) == diagonalize_input
+    @test get_kernel_structure(srfi) == kernel_structure
     @test get_optimizer_options(srfi) == optimizer_options

     # check defaults
     srfi2 = ScalarRandomFeatureInterface(n_features, input_dim)
     @test get_batch_sizes(srfi2) === nothing
     @test get_rng(srfi2) == Random.GLOBAL_RNG
-    @test get_diagonalize_input(srfi2) == false
-    # currently the "scheduler" doesn't always satisfy X() = X(), bug so we need to remove this for now
+    @test get_kernel_structure(srfi2) ==
+          SeparableKernel(cov_structure_from_string("lowrank", input_dim), OneDimFactor())
+
+    # currently the "scheduler" doesn't always satisfy X() == X() (a bug), so we need to remove this key for now
     for key in keys(optimizer_options)
-        if !(key == "scheduler")
+        if !(key ∈ ["scheduler", "prior", "n_ensemble"])
             @test get_optimizer_options(srfi2)[key] == optimizer_options[key] # we just set the defaults above
         end
     end
@@ -81,29 +180,25 @@ rng = Random.MersenneTwister(seed)
     output_dim = 3
     n_features = 200
     batch_sizes = Dict("train" => 100, "test" => 100, "feature" => 100)
-    diagonalize_input = true
-    diagonalize_output = false
-
-    n_input_hp = Int(input_dim) + 2
-    n_output_hp = Int(output_dim * (output_dim + 1) / 2) + 2
     # prior built from:
-    # input: γ₁*(Cholesky_factor_in + γ₂ * I)
-    # output: γ₃*(Cholesky_factor_out + γ₄ * I)
-    # where γᵢ positive
-    prior = build_default_prior(input_dim, output_dim, diagonalize_input, diagonalize_output)
+    eps = 1e-8
+    kernel_structure = SeparableKernel(DiagonalFactor(eps), CholeskyFactor(eps)) # Diagonal input, Cholesky factorized output
+    prior = build_default_prior(input_dim, output_dim, kernel_structure)
+
     optimizer_options = Dict(
         "prior" => prior,
         "n_ensemble" => max(ndims(prior) + 1, 10),
         "n_iteration" => 5,
-        "tikhonov" => 0,
+        "scheduler" => DataMisfitController(),
         "cov_sample_multiplier" => 10.0,
+        "tikhonov" => 0,
         "inflation" => 1e-4,
         "train_fraction" => 0.8,
         "multithread" => "ensemble",
         "verbose" => false,
-        "scheduler" => DataMisfitController(),
+        "localization" => EnsembleKalmanProcesses.Localizers.NoLocalization(),
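+        # (passed through to the EnsembleKalmanProcesses optimizer; NoLocalization turns localization off)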
     )
     #build interfaces
@@ -111,10 +206,9 @@ rng = Random.MersenneTwister(seed)
         n_features,
         input_dim,
         output_dim,
+        kernel_structure = kernel_structure,
         rng = rng,
         batch_sizes = batch_sizes,
-        diagonalize_input = diagonalize_input,
-        diagonalize_output = diagonalize_output,
         optimizer_options = optimizer_options,
     )
@@ -124,21 +218,22 @@ rng = Random.MersenneTwister(seed)
     @test get_n_features(vrfi) == n_features
     @test get_input_dim(vrfi) == input_dim
     @test get_output_dim(vrfi) == output_dim
+    @test get_kernel_structure(vrfi) == kernel_structure
     @test get_rng(vrfi) == rng
-    @test get_diagonalize_input(vrfi) == diagonalize_input
-    @test get_diagonalize_output(vrfi) == diagonalize_output
     @test get_optimizer_options(vrfi) == optimizer_options

     #check defaults
-    vrfi2 = VectorRandomFeatureInterface(n_features, input_dim, output_dim, diagonalize_input = diagonalize_input)
+    vrfi2 = VectorRandomFeatureInterface(n_features, input_dim, output_dim)

     @test get_batch_sizes(vrfi2) === nothing
     @test get_rng(vrfi2) == Random.GLOBAL_RNG
-    @test get_diagonalize_input(vrfi2) == true
-    @test get_diagonalize_output(vrfi2) == false
+    @test get_kernel_structure(vrfi2) == SeparableKernel(
+        cov_structure_from_string("lowrank", input_dim),
+        cov_structure_from_string("lowrank", output_dim),
+    )
+
     for key in keys(optimizer_options)
-        if !(key == "scheduler")
+        if !(key ∈ ["scheduler", "prior", "n_ensemble"])
             @test get_optimizer_options(vrfi2)[key] == optimizer_options[key] # we just set the defaults above
         end
     end
@@ -162,18 +257,14 @@ rng = Random.MersenneTwister(seed)

     # RF parameters
     n_features = 100
-    diagonalize_input = true
+    eps = 1e-8
+    scalar_ks = SeparableKernel(DiagonalFactor(eps), OneDimFactor()) # Diagonalize input (ARD-type kernel)
+    vector_ks = SeparableKernel(DiagonalFactor(eps), CholeskyFactor()) # Diagonalize input (ARD-type kernel), Cholesky output

     # Scalar RF options to mimic squared-exp ARD kernel
-    srfi = ScalarRandomFeatureInterface(n_features, input_dim, diagonalize_input = diagonalize_input, rng = rng)
+    srfi = ScalarRandomFeatureInterface(n_features, input_dim, kernel_structure = scalar_ks, rng = rng)

     # Vector RF options to mimic squared-exp ARD kernel (in 1D)
-    vrfi = VectorRandomFeatureInterface(
-        n_features,
-        input_dim,
-        output_dim,
-        diagonalize_input = diagonalize_input,
-        rng = rng,
-    )
+    vrfi = VectorRandomFeatureInterface(n_features, input_dim, output_dim, kernel_structure = vector_ks, rng = rng)

     # build emulators
     em_srfi = Emulator(srfi, iopairs, obs_noise_cov = obs_noise_cov)
@@ -231,28 +322,35 @@ rng = Random.MersenneTwister(seed)
     # 1) scalar + diag in
     # 2) scalar
     # 3) vector + diag out
-    # 4) vector
-
-    srfi_diagin = ScalarRandomFeatureInterface(n_features, input_dim, diagonalize_input = true, rng = rng)
-    srfi = ScalarRandomFeatureInterface(n_features, input_dim, diagonalize_input = false, rng = rng)
+    # 4) vector
+    # 5) vector nonseparable
+    eps = 1e-8
+    r = 1
+    scalar_diagin_ks = SeparableKernel(DiagonalFactor(eps), OneDimFactor())
+    scalar_ks = SeparableKernel(CholeskyFactor(eps), OneDimFactor())
+    vector_diagout_ks = SeparableKernel(CholeskyFactor(eps), DiagonalFactor(eps))
+    vector_ks = SeparableKernel(LowRankFactor(r, eps), HierarchicalLowRankFactor(r, eps))
+    vector_nonsep_ks = NonseparableKernel(LowRankFactor(r + 1, eps))
+
+    srfi_diagin =
+        ScalarRandomFeatureInterface(n_features, input_dim, kernel_structure = scalar_diagin_ks, rng = rng)
+    srfi = ScalarRandomFeatureInterface(n_features, input_dim, kernel_structure = scalar_ks, rng = rng)

     vrfi_diagout = VectorRandomFeatureInterface(
         n_features,
         input_dim,
         output_dim,
-        diagonalize_input = false,
-        diagonalize_output = true,
-        optimizer_options = Dict("train_fraction" => 0.7),
+        kernel_structure = vector_diagout_ks,
         rng = rng,
     )

-    vrfi = VectorRandomFeatureInterface(
+    vrfi = VectorRandomFeatureInterface(n_features, input_dim, output_dim, kernel_structure = vector_ks, rng = rng)
+
+    vrfi_nonsep = VectorRandomFeatureInterface(
         n_features,
         input_dim,
         output_dim,
-        diagonalize_input = false,
-        diagonalize_output = false,
-        optimizer_options = Dict("train_fraction" => 0.7),
+        kernel_structure = vector_nonsep_ks,
         rng = rng,
     )

@@ -266,6 +364,7 @@ rng = Random.MersenneTwister(seed)

     em_vrfi_svd = Emulator(vrfi, iopairs, obs_noise_cov = Σ)
     em_vrfi = Emulator(vrfi, iopairs, obs_noise_cov = Σ, decorrelate = false)
+    em_vrfi_nonsep = Emulator(vrfi_nonsep, iopairs, obs_noise_cov = Σ, decorrelate = false)

     #TODO truncated SVD option for vector (involves resizing prior)

@@ -281,21 +380,22 @@ rng = Random.MersenneTwister(seed)
     μ_vsd, σ²_vsd = Emulators.predict(em_vrfi_svd_diagout, new_inputs, transform_to_real = true)
     μ_vs, σ²_vs = Emulators.predict(em_vrfi_svd, new_inputs, transform_to_real = true)
     μ_v, σ²_v = Emulators.predict(em_vrfi, new_inputs)
+    μ_vns, σ²_vns = Emulators.predict(em_vrfi_nonsep, new_inputs)

     tol_μ = 0.1 * ntest * output_dim
     @test isapprox.(norm(μ_ssd - outx), 0, atol = tol_μ)
     @test isapprox.(norm(μ_ss - outx), 0, atol = tol_μ)
     @test isapprox.(norm(μ_vsd - outx), 0, atol = tol_μ)
     @test isapprox.(norm(μ_vs - outx), 0, atol = tol_μ)
-    @test isapprox.(norm(μ_v - outx), 0, atol = 5 * tol_μ) # a more approximate option so likely less good approx
+    @test isapprox.(norm(μ_v - outx), 0, atol = 5 * tol_μ) # more approximate option, so a looser tolerance is used
+    @test isapprox.(norm(μ_vns - outx), 0, atol = 5 * tol_μ) # more approximate option, so a looser tolerance is used

     # An example with the other threading option
     vrfi_tul = VectorRandomFeatureInterface(
         n_features,
         input_dim,
         output_dim,
-        diagonalize_input = false,
-        diagonalize_output = false,
+        kernel_structure = vector_ks,
         optimizer_options = Dict("train_fraction" => 0.7, "multithread" => "tullio"),
         rng = rng,
     )
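+    # (the "tullio" multithread option threads the Tullio-based feature computations,
+    #  instead of threading over ensemble members as the "ensemble" option used above does)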