diff --git a/docs/src/API/RandomFeatures.md b/docs/src/API/RandomFeatures.md index b5f5956bc..2baffd992 100644 --- a/docs/src/API/RandomFeatures.md +++ b/docs/src/API/RandomFeatures.md @@ -31,7 +31,7 @@ get_n_features get_input_dim get_output_dim get_rng -get_diagonalize_input +get_kernel_structure get_feature_decomposition get_optimizer_options optimize_hyperparameters!(::ScalarRandomFeatureInterface) diff --git a/examples/EDMF_data/Project.toml b/examples/EDMF_data/Project.toml index a865fa8be..2936bdfbe 100644 --- a/examples/EDMF_data/Project.toml +++ b/examples/EDMF_data/Project.toml @@ -1,9 +1,11 @@ [deps] +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" CalibrateEmulateSample = "95e48a1f-0bec-4818-9538-3db4340308e3" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab" +PairPlots = "43a3c2be-4208-490b-832a-a21dcd55d7da" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/examples/EDMF_data/plot_posterior.jl b/examples/EDMF_data/plot_posterior.jl index 5173889fb..9ba36b414 100644 --- a/examples/EDMF_data/plot_posterior.jl +++ b/examples/EDMF_data/plot_posterior.jl @@ -1,5 +1,8 @@ # Import modules -using StatsPlots +ENV["GKSwstype"] = "100" +using Plots + +using CairoMakie, PairPlots using JLD2 using Dates @@ -11,11 +14,11 @@ using CalibrateEmulateSample.ParameterDistributions # 2-parameter calibration exp exp_name = "ent-det-calibration" -date_of_run = Date(2022, 7, 15) +date_of_run = Date(2023, 10, 5) # 5-parameter calibration exp #exp_name = "ent-det-tked-tkee-stab-calibration" -#date_of_run = Date(2022,7,14) +#date_of_run = Date(2023,10,4) # Output figure read/write directory figure_save_directory = joinpath(@__DIR__, "output", exp_name, string(date_of_run)) @@ -24,7 +27,7 @@ data_save_directory = joinpath(@__DIR__, "output", exp_name, string(date_of_run) # load posterior_filepath = joinpath(data_save_directory, "posterior.jld2") if !isfile(posterior_filepath) - LoadError(posterior_filepath * " not found. Please check experiment name and date") + throw(ArgumentError(posterior_filepath * " not found. Please check experiment name and date")) else println("Loading posterior distribution from: " * posterior_filepath) posterior = load(posterior_filepath)["posterior"] @@ -40,10 +43,10 @@ density_filepath = joinpath(figure_save_directory, "posterior_dist_comp.png") transformed_density_filepath = joinpath(figure_save_directory, "posterior_dist_phys.png") labels = get_name(posterior) -gr(dpi = 300, size = (nparam_plots * 300, nparam_plots * 300)) -# lower triangular marginal cornerplot -p = cornerplot(permutedims(posterior_samples, (2, 1)), label = labels, compact = true) -trans_p = cornerplot(permutedims(transformed_posterior_samples, (2, 1)), label = labels, compact = true) +data = (; [(Symbol(labels[i]), posterior_samples[i, :]) for i in 1:length(labels)]...) +transformed_data = (; [(Symbol(labels[i]), transformed_posterior_samples[i, :]) for i in 1:length(labels)]...) 
-savefig(p, density_filepath) -savefig(trans_p, transformed_density_filepath) +p = pairplot(data => (PairPlots.Scatter(),)) +trans_p = pairplot(transformed_data => (PairPlots.Scatter(),)) +save(density_filepath, p) +save(transformed_density_filepath, trans_p) diff --git a/examples/EDMF_data/uq_for_edmf.jl b/examples/EDMF_data/uq_for_edmf.jl index c9da771d5..7c5a1ac0d 100644 --- a/examples/EDMF_data/uq_for_edmf.jl +++ b/examples/EDMF_data/uq_for_edmf.jl @@ -1,9 +1,10 @@ #include(joinpath(@__DIR__, "..", "ci", "linkfig.jl")) -PLOT_FLAG = true +PLOT_FLAG = false # Import modules using Distributions # probability distributions and associated functions using LinearAlgebra +ENV["GKSwstype"] = "100" using Plots using Random using JLD2 @@ -17,6 +18,7 @@ using CalibrateEmulateSample.MarkovChainMonteCarlo using CalibrateEmulateSample.ParameterDistributions using CalibrateEmulateSample.DataContainers using CalibrateEmulateSample.EnsembleKalmanProcesses +using CalibrateEmulateSample.EnsembleKalmanProcesses.Localizers using CalibrateEmulateSample.Utilities rng_seed = 42424242 @@ -126,6 +128,7 @@ function main() end # load and create prior distributions + #= prior_filepath = joinpath(exp_dir, "prior.jld2") if !isfile(prior_filepath) LoadError("prior file \"prior.jld2\" not found in directory \"" * exp_dir * "/\"") @@ -133,6 +136,47 @@ function main() prior_dict_raw = load(prior_filepath) #using JLD2 prior = prior_dict_raw["prior"] end + =# + # build prior if jld2 does not work + function get_prior_config(s::SS) where {SS <: AbstractString} + config = Dict() + if s == "ent-det-calibration" + config["constraints"] = + Dict("entrainment_factor" => [bounded(0.0, 1.0)], "detrainment_factor" => [bounded(0.0, 1.0)]) + config["prior_mean"] = Dict("entrainment_factor" => 0.13, "detrainment_factor" => 0.51) + config["unconstrained_σ"] = 1.0 + elseif s == "ent-det-tked-tkee-stab-calibration" + config["constraints"] = Dict( + "entrainment_factor" => [bounded(0.0, 1.0)], + "detrainment_factor" => [bounded(0.0, 1.0)], + "tke_ed_coeff" => [bounded(0.01, 1.0)], + "tke_diss_coeff" => [bounded(0.01, 1.0)], + "static_stab_coeff" => [bounded(0.01, 1.0)], + ) + config["prior_mean"] = Dict( + "entrainment_factor" => 0.13, + "detrainment_factor" => 0.51, + "tke_ed_coeff" => 0.14, + "tke_diss_coeff" => 0.22, + "static_stab_coeff" => 0.4, + ) + config["unconstrained_σ"] = 1.0 + else + throw(ArgumentError("prior for experiment $s not found, please implement in uq_for_edmf.jl")) + end + return config + end + prior_config = get_prior_config(exp_name) + means = prior_config["prior_mean"] + std = prior_config["unconstrained_σ"] + constraints = prior_config["constraints"] + + + prior = combine_distributions([ + ParameterDistribution( + Dict("name" => name, "distribution" => Parameterized(Normal(mean, std)), "constraint" => constraints[name]), + ) for (name, mean) in means + ]) # Option (ii) load EKP object # max_ekp_it = 10 # use highest available iteration file @@ -145,7 +189,7 @@ function main() # end # input_output_pairs = Utilities.get_training_points(ekpobj, max_ekp_it) - println("Completed calibration stage") + println("Completed calibration loading stage") println(" ") ############################################## # [3. 
] Build Emulator from calibration data # @@ -153,19 +197,82 @@ println("Begin Emulation stage") # Create GP object - gppackage = Emulators.SKLJL() - pred_type = Emulators.YType() - gauss_proc = GaussianProcess( - gppackage; - kernel = nothing, # use default squared exponential kernel - prediction_type = pred_type, - noise_learn = false, + cases = [ + "GP", # diagonalize, train scalar GP, assume diag inputs + "RF-scalar", # diagonalize, train scalar RF, don't assume diag inputs + "RF-vector-svd-diag", + "RF-vector-svd-nondiag", + "RF-vector-svd-nonsep", + ] + case = cases[5] + + overrides = Dict( + "verbose" => true, + "train_fraction" => 0.95, + "scheduler" => DataMisfitController(terminate_at = 100), + "cov_sample_multiplier" => 0.5, + "n_iteration" => 5, + # "n_ensemble" => 20, + # "localization" => SEC(0.1), # localization / sample error correction for small ensembles ) + nugget = 0.01 + rng_seed = 99330 + rng = Random.MersenneTwister(rng_seed) + input_dim = size(get_inputs(input_output_pairs), 1) + output_dim = size(get_outputs(input_output_pairs), 1) + if case == "GP" + + gppackage = Emulators.SKLJL() + pred_type = Emulators.YType() + mlt = GaussianProcess( + gppackage; + kernel = nothing, # use default squared exponential kernel + prediction_type = pred_type, + noise_learn = false, + ) + elseif case ∈ ["RF-scalar"] + n_features = 100 + kernel_structure = SeparableKernel(CholeskyFactor(nugget), OneDimFactor()) + mlt = ScalarRandomFeatureInterface( + n_features, + input_dim, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + elseif case ∈ ["RF-vector-svd-diag", "RF-vector-svd-nondiag"] + # do we want to assume that the outputs are decorrelated in the machine-learning problem? + kernel_structure = + case ∈ ["RF-vector-svd-diag"] ?
SeparableKernel(LowRankFactor(1, nugget), DiagonalFactor(nugget)) : + SeparableKernel(LowRankFactor(2, nugget), LowRankFactor(2, nugget)) + n_features = 500 + + mlt = VectorRandomFeatureInterface( + n_features, + input_dim, + output_dim, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + elseif case ∈ ["RF-vector-svd-nonsep"] + kernel_structure = NonseparableKernel(LowRankFactor(3, nugget)) + n_features = 500 + + mlt = VectorRandomFeatureInterface( + n_features, + input_dim, + output_dim, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + end # Fit an emulator to the data normalized = true - emulator = Emulator(gauss_proc, input_output_pairs; obs_noise_cov = truth_cov, normalize_inputs = normalized) + emulator = Emulator(mlt, input_output_pairs; obs_noise_cov = truth_cov, normalize_inputs = normalized) # Optimize the GP hyperparameters for better fit optimize_hyperparameters!(emulator) diff --git a/examples/Emulator/GaussianProcess/plot_GP.jl b/examples/Emulator/GaussianProcess/plot_GP.jl index a0ee9751f..721382ce7 100644 --- a/examples/Emulator/GaussianProcess/plot_GP.jl +++ b/examples/Emulator/GaussianProcess/plot_GP.jl @@ -11,6 +11,7 @@ using CalibrateEmulateSample.DataContainers plot_flag = true if plot_flag + ENV["GKSwstype"] = "100" using Plots gr(size = (1500, 700)) Plots.scalefontsizes(1.3) @@ -78,7 +79,7 @@ gaussian_process = GaussianProcess(gppackage, noise_learn = false) # The observables y are related to the parameters x by: # y = G(x1, x2) + η, # where G(x1, x2) := [sin(x1) + cos(x2), sin(x1) - cos(x2)], and η ~ N(0, Σ) -n = 100 # number of training points +n = 150 # number of training points p = 2 # input dim d = 2 # output dim diff --git a/examples/Emulator/L63/Project.toml b/examples/Emulator/L63/Project.toml new file mode 100644 index 000000000..e2c9baa08 --- /dev/null +++ b/examples/Emulator/L63/Project.toml @@ -0,0 +1,9 @@ +[deps] +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" +CalibrateEmulateSample = "95e48a1f-0bec-4818-9538-3db4340308e3" +ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/examples/Emulator/L63/emulate.jl b/examples/Emulator/L63/emulate.jl new file mode 100644 index 000000000..289314295 --- /dev/null +++ b/examples/Emulator/L63/emulate.jl @@ -0,0 +1,207 @@ +using OrdinaryDiffEq +using Random, Distributions, LinearAlgebra +ENV["GKSwstype"] = "100" +using CairoMakie, ColorSchemes #for plots +using JLD2 + +# CES +using CalibrateEmulateSample.Emulators +using CalibrateEmulateSample.ParameterDistributions +using CalibrateEmulateSample.EnsembleKalmanProcesses +using CalibrateEmulateSample.DataContainers + + +function lorenz(du, u, p, t) + du[1] = 10.0 * (u[2] - u[1]) + du[2] = u[1] * (28.0 - u[3]) - u[2] + du[3] = u[1] * u[2] - (8 / 3) * u[3] +end + +function main() + + # rng + rng = MersenneTwister(1232434) + + # Run L63 from 0 -> tmax + u0 = [1.0; 0.0; 0.0] + tmax = 20 + dt = 0.01 + tspan = (0.0, tmax) + prob = ODEProblem(lorenz, u0, tspan) + sol = solve(prob, Euler(), dt = dt) + + # Run L63 from end for test trajetory data + tmax_test = 100 + tspan_test = (0.0, tmax_test) + u0_test = sol.u[end] + prob_test = ODEProblem(lorenz, u0_test, tspan_test) + sol_test = solve(prob_test, Euler(), dt = dt) 
+ + # Run L63 from end for histogram matching data + tmax_hist = 1000 + tspan_hist = (0.0, tmax_hist) + u0_hist = sol_test.u[end] + prob_hist = ODEProblem(lorenz, u0_hist, tspan_hist) + sol_hist = solve(prob_hist, Euler(), dt = dt) + + # Create training pairs (with noise) from subsampling [burnin,tmax] + tburn = 10 + burnin = Int(floor(10 / dt)) + n_train_pts = 400 #600 + ind = shuffle!(rng, Vector(burnin:(tmax / dt - 1)))[1:n_train_pts] + n_tp = length(ind) + input = zeros(3, n_tp) + output = zeros(3, n_tp) + Γy = 1e-6 * I(3) + noise = rand(rng, MvNormal(zeros(3), Γy), n_tp) + for i in 1:n_tp + input[:, i] = sol.u[i] + output[:, i] = sol.u[i + 1] + noise[:, i] + end + iopairs = PairedDataContainer(input, output) + + # Emulate + cases = ["GP", "RF-scalar", "RF-scalar-diagin", "RF-vector-svd-nonsep"] + + case = cases[4] + decorrelate = true + nugget = Float64(1e-12) + overrides = Dict( + "verbose" => true, + "scheduler" => DataMisfitController(terminate_at = 1e4), + "cov_sample_multiplier" => 2.0, + "n_iteration" => 10, + ) + + # Build ML tools + if case == "GP" + gppackage = Emulators.GPJL() + pred_type = Emulators.YType() + mlt = GaussianProcess( + gppackage; + kernel = nothing, # use default squared exponential kernel + prediction_type = pred_type, + noise_learn = false, + ) + + elseif case ∈ ["RF-scalar", "RF-scalar-diagin"] + n_features = 10 * Int(floor(sqrt(3 * n_tp))) + kernel_structure = + case == "RF-scalar-diagin" ? SeparableKernel(DiagonalFactor(nugget), OneDimFactor()) : + SeparableKernel(LowRankFactor(3, nugget), OneDimFactor()) + mlt = ScalarRandomFeatureInterface( + n_features, + 3, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + elseif case ∈ ["RF-vector-svd-nonsep"] + kernel_structure = NonseparableKernel(LowRankFactor(9, nugget)) + n_features = 500 + + mlt = VectorRandomFeatureInterface( + n_features, + 3, + 3, + rng = rng, + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + end + + # Emulate + emulator = Emulator(mlt, iopairs; obs_noise_cov = Γy, decorrelate = decorrelate) + optimize_hyperparameters!(emulator) + + + # Predict with emulator + xspan_test = 0.0:dt:tmax_test + u_test = zeros(3, length(xspan_test)) + u_test[:, 1] = sol_test.u[1] + + for i in 1:(length(xspan_test) - 1) + rf_mean, _ = predict(emulator, u_test[:, i:i], transform_to_real = true) # 3x1 matrix + u_test[:, i + 1] = rf_mean + end + train_err = [0.0] + for i in 1:size(input, 2) + train_mean, _ = predict(emulator, input[:, i:i], transform_to_real = true) # 3x1 + train_err[1] += norm(train_mean - output[:, i]) + end + println("normalized L^2 error on training data:", 1 / size(input, 2) * train_err[1]) + + + + # plotting trace + f = Figure(resolution = (900, 450)) + axx = Axis(f[1, 1], xlabel = "time", ylabel = "x") + axy = Axis(f[2, 1], xlabel = "time", ylabel = "y") + axz = Axis(f[3, 1], xlabel = "time", ylabel = "z") + + xx = 0:dt:tmax_test + lines!(axx, xx, u_test[1, :], color = :blue) + lines!(axy, xx, u_test[2, :], color = :blue) + lines!(axz, xx, u_test[3, :], color = :blue) + + # run test data + solplot = zeros(3, length(xspan_test)) + for i in 1:length(xspan_test) + solplot[:, i] = sol_test.u[i] #noiseless + end + JLD2.save("l63_testdata.jld2", "solplot", solplot, "uplot", u_test) + + lines!(axx, xspan_test, solplot[1, :], color = :orange) + lines!(axy, xspan_test, solplot[2, :], color = :orange) + lines!(axz, xspan_test, solplot[3, :], color = :orange) + + current_figure() + # save + save("l63_test.png", f, px_per_unit = 3) + 
save("l63_test.pdf", f, pt_per_unit = 3) + + # plot attractor + f3 = Figure() + lines(f3[1, 1], u_test[1, :], u_test[3, :], color = :blue) + lines(f3[2, 1], solplot[1, :], solplot[3, :], color = :orange) + + # save + save("l63_attr.png", f3, px_per_unit = 3) + save("l63_attr.pdf", f3, pt_per_unit = 3) + + + # Predict with emulator for histogram + xspan_hist = 0.0:dt:tmax_hist + u_hist = zeros(3, length(xspan_hist)) + u_hist[:, 1] = sol_hist.u[1] # start at end of previous sim + + for i in 1:(length(xspan_hist) - 1) + rf_mean, _ = predict(emulator, u_hist[:, i:i], transform_to_real = true) # 3x1 matrix + u_hist[:, i + 1] = rf_mean + end + + solhist = zeros(3, length(xspan_hist)) + for i in 1:length(xspan_hist) + solhist[:, i] = sol_hist.u[i] #noiseless + end + JLD2.save("l63_histdata.jld2", "solhist", solhist, "uhist", u_hist) + + # plotting histograms + f2 = Figure() + hist(f2[1, 1], u_hist[1, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) + hist(f2[1, 2], u_hist[2, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) + hist(f2[1, 3], u_hist[3, :], bins = 50, normalization = :pdf, color = (:blue, 0.5)) + + hist!(f2[1, 1], solhist[1, :], bins = 50, normalization = :pdf, color = (:orange, 0.5)) + hist!(f2[1, 2], solhist[2, :], bins = 50, normalization = :pdf, color = (:orange, 0.5)) + hist!(f2[1, 3], solhist[3, :], bins = 50, normalization = :pdf, color = (:orange, 0.5)) + + # save + save("l63_pdf.png", f2, px_per_unit = 3) + save("l63_pdf.pdf", f2, pt_per_unit = 3) + + + +end + +main() diff --git a/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl b/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl index b379ff689..3ac0a64a9 100644 --- a/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl +++ b/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl @@ -10,17 +10,18 @@ using LinearAlgebra using CalibrateEmulateSample.Emulators using CalibrateEmulateSample.DataContainers using CalibrateEmulateSample.ParameterDistributions +using CalibrateEmulateSample.EnsembleKalmanProcesses + case = "scalar" println("running case $case") - -diagonalize_input = true - +kernel_structure = SeparableKernel(LowRankFactor(2, 1e-8), OneDimFactor()) # input and output(1D) cov structure in sep kernel plot_flag = true if plot_flag + ENV["GKSwstype"] = "100" using Plots gr(size = (1500, 700)) - Plots.scalefontsizes(1.3) + # Plots.scalefontsizes(1.3) # scales on recursive calls to include.. 
font = Plots.font("Helvetica", 18) fontdict = Dict(:guidefont => font, :xtickfont => font, :ytickfont => font, :legendfont => font) @@ -33,233 +34,247 @@ function meshgrid(vx::AbstractVector{T}, vy::AbstractVector{T}) where {T} return gx, gy end -rng_seed = 41 -Random.seed!(rng_seed) -output_directory = joinpath(@__DIR__, "output") -if !isdir(output_directory) - mkdir(output_directory) -end -#problem -n = 100 # number of training points -p = 2 # input dim -d = 2 # output dim -X = 2.0 * π * rand(p, n) -# G(x1, x2) -g1x = sin.(X[1, :]) .+ cos.(X[2, :]) -g2x = sin.(X[1, :]) .- cos.(X[2, :]) -gx = zeros(2, n) -gx[1, :] = g1x -gx[2, :] = g2x - -# Add noise η -μ = zeros(d) -Σ = 0.1 * [[0.8, 0.1] [0.1, 0.5]] # d x d -noise_samples = rand(MvNormal(μ, Σ), n) -# y = G(x) + η -Y = gx .+ noise_samples - -iopairs = PairedDataContainer(X, Y, data_are_columns = true) -@assert get_inputs(iopairs) == X -@assert get_outputs(iopairs) == Y - -#plot training data with and without noise -if plot_flag - p1 = plot( - X[1, :], - X[2, :], - g1x, - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zguidefontrotation = 90, - ) - - figpath = joinpath(output_directory, "RF_observed_y1nonoise.png") - savefig(figpath) - - p2 = plot( - X[1, :], - X[2, :], - g2x, - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zguidefontrotation = 90, - ) - figpath = joinpath(output_directory, "RF_observed_y2nonoise.png") - savefig(figpath) - - p3 = plot( - X[1, :], - X[2, :], - Y[1, :], - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zguidefontrotation = 90, - ) - figpath = joinpath(output_directory, "RF_observed_y1.png") - savefig(figpath) - - p4 = plot( - X[1, :], - X[2, :], - Y[2, :], - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zguidefontrotation = 90, - ) - figpath = joinpath(output_directory, "RF_observed_y2.png") - savefig(figpath) +function main() -end + rng_seed = 41 + Random.seed!(rng_seed) + output_directory = joinpath(@__DIR__, "output") + if !isdir(output_directory) + mkdir(output_directory) + end -# setup random features -n_features = 180 -optimizer_options = Dict("n_iteration" => 20, "prior_in_scale" => 0.1, "verbose" => true) #"Max" iterations (may do less) -srfi = ScalarRandomFeatureInterface( - n_features, - p, - diagonalize_input = diagonalize_input, - optimizer_options = optimizer_options, -) -emulator = Emulator(srfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) -println("build RF with $n training points and $(n_features) random features.") -optimize_hyperparameters!(emulator) # although RF already optimized + #problem + n = 150 # number of training points + p = 2 # input dim + d = 2 # output dim + X = 2.0 * π * rand(p, n) + # G(x1, x2) + g1x = sin.(X[1, :]) .+ cos.(X[2, :]) + g2x = sin.(X[1, :]) .- cos.(X[2, :]) + gx = zeros(2, n) + gx[1, :] = g1x + gx[2, :] = g2x -# Plot mean and variance of the predicted observables y1 and y2 -# For this, we generate test points on a x1-x2 grid. 
-n_pts = 200 -x1 = range(0.0, stop = 4.0 / 5.0 * 2 * π, length = n_pts) -x2 = range(0.0, stop = 4.0 / 5.0 * 2 * π, length = n_pts) -X1, X2 = meshgrid(x1, x2) -# Input for predict has to be of size N_samples x input_dim -inputs = permutedims(hcat(X1[:], X2[:]), (2, 1)) + # Add noise η + μ = zeros(d) + Σ = 0.1 * [[0.8, 0.1] [0.1, 0.5]] # d x d + noise_samples = rand(MvNormal(μ, Σ), n) + # y = G(x) + η + Y = gx .+ noise_samples -rf_mean, rf_cov = predict(emulator, inputs, transform_to_real = true) -println("end predictions at ", n_pts * n_pts, " points") + iopairs = PairedDataContainer(X, Y, data_are_columns = true) + @assert get_inputs(iopairs) == X + @assert get_outputs(iopairs) == Y + #plot training data with and without noise + if plot_flag + p1 = plot( + X[1, :], + X[2, :], + g1x, + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zguidefontrotation = 90, + ) -#plot predictions -for y_i in 1:d - rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) - rf_var = permutedims(vcat([x' for x in rf_var_temp]...), (2, 1)) # 2 x 40000 + figpath = joinpath(output_directory, "RF_observed_y1nonoise.png") + savefig(figpath) - mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) # 2 x 40000 - if plot_flag - p5 = plot( - x1, - x2, - mean_grid, + p2 = plot( + X[1, :], + X[2, :], + g2x, st = :surface, camera = (30, 60), c = :cividis, xlabel = "x1", ylabel = "x2", - zlabel = "mean of y" * string(y_i), zguidefontrotation = 90, ) - end - var_grid = reshape(rf_var[y_i, :], n_pts, n_pts) - if plot_flag - p6 = plot( - x1, - x2, - var_grid, + figpath = joinpath(output_directory, "RF_observed_y2nonoise.png") + savefig(figpath) + + p3 = plot( + X[1, :], + X[2, :], + Y[1, :], st = :surface, camera = (30, 60), c = :cividis, xlabel = "x1", ylabel = "x2", - zlabel = "var of y" * string(y_i), zguidefontrotation = 90, ) + figpath = joinpath(output_directory, "RF_observed_y1.png") + savefig(figpath) - plot(p5, p6, layout = (1, 2), legend = false) + p4 = plot( + X[1, :], + X[2, :], + Y[2, :], + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zguidefontrotation = 90, + ) + figpath = joinpath(output_directory, "RF_observed_y2.png") + savefig(figpath) - savefig(joinpath(output_directory, "RF_" * case * "_y" * string(y_i) * "_predictions.png")) end -end - -# Plot the true components of G(x1, x2) -g1_true = sin.(inputs[1, :]) .+ cos.(inputs[2, :]) -g1_true_grid = reshape(g1_true, n_pts, n_pts) -if plot_flag - p7 = plot( - x1, - x2, - g1_true_grid, - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zlabel = "sin(x1) + cos(x2)", - zguidefontrotation = 90, - ) - savefig(joinpath(output_directory, "RF_" * case * "_true_g1.png")) -end -g2_true = sin.(inputs[1, :]) .- cos.(inputs[2, :]) -g2_true_grid = reshape(g2_true, n_pts, n_pts) -if plot_flag - p8 = plot( - x1, - x2, - g2_true_grid, - st = :surface, - camera = (30, 60), - c = :cividis, - xlabel = "x1", - ylabel = "x2", - zlabel = "sin(x1) - cos(x2)", - zguidefontrotation = 90, + # setup random features + n_features = 400 + optimizer_options = Dict( + "n_iteration" => 20, + "n_ensemble" => 20, + "verbose" => true, + "scheduler" => DataMisfitController(terminate_at = 100.0), + ) #"Max" iterations (may do less) + + + srfi = ScalarRandomFeatureInterface( + n_features, + p, + kernel_structure = kernel_structure, + optimizer_options = optimizer_options, ) - g_true_grids = [g1_true_grid, g2_true_grid] - - savefig(joinpath(output_directory, "RF_" * case * 
"_true_g2.png")) - -end - -# Plot the difference between the truth and the mean of the predictions -for y_i in 1:d - - # Reshape rf_cov to size N_samples x output_dim - rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) - rf_var = permutedims(vcat([x' for x in rf_var_temp]...), (2, 1)) # 40000 x 2 - - mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) - var_grid = reshape(rf_var[y_i, :], n_pts, n_pts) - # Compute and plot 1/variance * (truth - prediction)^2 + emulator = Emulator(srfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) + println("build RF with $n training points and $(n_features) random features.") + + optimize_hyperparameters!(emulator) # although RF already optimized + + # Plot mean and variance of the predicted observables y1 and y2 + # For this, we generate test points on a x1-x2 grid. + n_pts = 200 + x1 = range(0.0, stop = 4.0 / 5.0 * 2 * π, length = n_pts) + x2 = range(0.0, stop = 4.0 / 5.0 * 2 * π, length = n_pts) + X1, X2 = meshgrid(x1, x2) + # Input for predict has to be of size N_samples x input_dim + inputs = permutedims(hcat(X1[:], X2[:]), (2, 1)) + + rf_mean, rf_cov = predict(emulator, inputs, transform_to_real = true) + println("end predictions at ", n_pts * n_pts, " points") + + + #plot predictions + for y_i in 1:d + rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) + rf_var = permutedims(reduce(vcat, [x' for x in rf_var_temp]), (2, 1)) # 2 x 40000 + + mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) # 2 x 40000 + if plot_flag + p5 = plot( + x1, + x2, + mean_grid, + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zlabel = "mean of y" * string(y_i), + zguidefontrotation = 90, + ) + end + var_grid = reshape(rf_var[y_i, :], n_pts, n_pts) + if plot_flag + p6 = plot( + x1, + x2, + var_grid, + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zlabel = "var of y" * string(y_i), + zguidefontrotation = 90, + ) + + plot(p5, p6, layout = (1, 2), legend = false) + + savefig(joinpath(output_directory, "RF_" * case * "_y" * string(y_i) * "_predictions.png")) + end + end + # Plot the true components of G(x1, x2) + g1_true = sin.(inputs[1, :]) .+ cos.(inputs[2, :]) + g1_true_grid = reshape(g1_true, n_pts, n_pts) if plot_flag - zlabel = "1/var * (true_y" * string(y_i) * " - predicted_y" * string(y_i) * ")^2" + p7 = plot( + x1, + x2, + g1_true_grid, + st = :surface, + camera = (30, 60), + c = :cividis, + xlabel = "x1", + ylabel = "x2", + zlabel = "sin(x1) + cos(x2)", + zguidefontrotation = 90, + ) + savefig(joinpath(output_directory, "RF_" * case * "_true_g1.png")) + end - p9 = plot( + g2_true = sin.(inputs[1, :]) .- cos.(inputs[2, :]) + g2_true_grid = reshape(g2_true, n_pts, n_pts) + if plot_flag + p8 = plot( x1, x2, - sqrt.(1.0 ./ var_grid .* (g_true_grids[y_i] .- mean_grid) .^ 2), + g2_true_grid, st = :surface, camera = (30, 60), - c = :magma, - zlabel = zlabel, + c = :cividis, xlabel = "x1", ylabel = "x2", + zlabel = "sin(x1) - cos(x2)", zguidefontrotation = 90, ) + g_true_grids = [g1_true_grid, g2_true_grid] - savefig(joinpath(output_directory, "RF_" * case * "_y" * string(y_i) * "_difference_truth_prediction.png")) + savefig(joinpath(output_directory, "RF_" * case * "_true_g2.png")) + + end + + # Plot the difference between the truth and the mean of the predictions + for y_i in 1:d + + # Reshape rf_cov to size N_samples x output_dim + rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) + rf_var = permutedims(vcat([x' for x in 
rf_var_temp]...), (2, 1)) # 40000 x 2 + + mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) + var_grid = reshape(rf_var[y_i, :], n_pts, n_pts) + # Compute and plot 1/variance * (truth - prediction)^2 + + if plot_flag + zlabel = "1/var * (true_y" * string(y_i) * " - predicted_y" * string(y_i) * ")^2" + + p9 = plot( + x1, + x2, + sqrt.(1.0 ./ var_grid .* (g_true_grids[y_i] .- mean_grid) .^ 2), + st = :surface, + camera = (30, 60), + c = :magma, + zlabel = zlabel, + xlabel = "x1", + ylabel = "x2", + zguidefontrotation = 90, + ) + + savefig(joinpath(output_directory, "RF_" * case * "_y" * string(y_i) * "_difference_truth_prediction.png")) + end end end + +main() diff --git a/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl b/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl index 08d045c44..12cbc17a4 100644 --- a/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl +++ b/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl @@ -10,11 +10,13 @@ using LinearAlgebra using CalibrateEmulateSample.Emulators using CalibrateEmulateSample.DataContainers using CalibrateEmulateSample.ParameterDistributions +using CalibrateEmulateSample.EnsembleKalmanProcesses plot_flag = true if plot_flag + ENV["GKSwstype"] = "100" using Plots gr(size = (1500, 700)) - Plots.scalefontsizes(1.3) + # Plots.scalefontsizes(1.3) font = Plots.font("Helvetica", 18) fontdict = Dict(:guidefont => font, :xtickfont => font, :ytickfont => font, :legendfont => font) @@ -38,8 +40,9 @@ function main() mkdir(output_directory) end - cases = ["svd-diag", "svd-nondiag", "nosvd-diag", "nosvd-nondiag"] - case_mask = 1:4 # which cases to do + cases = ["svd-diag", "svd-nondiag", "nosvd-diag", "nosvd-nondiag", "svd-nonsep", "nosvd-nonsep"] + case_mask = 1:6 # which cases to do + nugget = 1e-12 #problem n = 150 # number of training points @@ -128,43 +131,67 @@ function main() for case in cases[case_mask] println("running case $case") - - - # setup random features - n_features = 200 - if case ∈ ["svd-diag", "svd-nondiag"] - optimizer_options = - Dict("n_iteration" => 20, "prior_in_scale" => 0.01, "prior_out_scale" => 1.0, "verbose" => true) #"Max" iterations (may do less) - else # without svd - optimizer_options = - Dict("n_iteration" => 20, "prior_in_scale" => 0.01, "prior_out_scale" => 0.01, "verbose" => true) #"Max" iterations (may do less) - end + n_features = 400 + optimizer_options = + Dict("n_iteration" => 5, "verbose" => true, "scheduler" => DataMisfitController(on_terminate = "continue")) #"Max" iterations (may do less) if case == "svd-diag" vrfi = VectorRandomFeatureInterface( n_features, p, d, - diagonalize_output = true, + kernel_structure = SeparableKernel(LowRankFactor(2, nugget), DiagonalFactor()), optimizer_options = optimizer_options, ) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) elseif case == "svd-nondiag" - vrfi = VectorRandomFeatureInterface(n_features, p, d, optimizer_options = optimizer_options) + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + kernel_structure = SeparableKernel(LowRankFactor(2, nugget), LowRankFactor(2, nugget)), + optimizer_options = optimizer_options, + ) + emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) + elseif case == "svd-nonsep" + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + kernel_structure = NonseparableKernel(LowRankFactor(4, nugget)), + optimizer_options = optimizer_options, + ) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = 
true) + elseif case == "nosvd-diag" vrfi = VectorRandomFeatureInterface( n_features, p, d, - diagonalize_output = true, + kernel_structure = SeparableKernel(LowRankFactor(2, nugget), DiagonalFactor()), # roughly normalize output by noise optimizer_options = optimizer_options, ) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true, decorrelate = false) elseif case == "nosvd-nondiag" - vrfi = VectorRandomFeatureInterface(n_features, p, d, optimizer_options = optimizer_options) + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + kernel_structure = SeparableKernel(LowRankFactor(2, nugget), LowRankFactor(2, nugget)), # roughly normalize output by noise + optimizer_options = optimizer_options, + ) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true, decorrelate = false) + elseif case == "nosvd-nonsep" + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + kernel_structure = NonseparableKernel(LowRankFactor(4, nugget)), + optimizer_options = optimizer_options, + ) + emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true, decorrelate = false) + end println("build RF with $n training points and $(n_features) random features.") @@ -187,7 +214,7 @@ function main() for y_i in 1:d rf_var_temp = [diag(rf_cov[j]) for j in 1:length(rf_cov)] # (40000,) - rf_var = permutedims(vcat([x' for x in rf_var_temp]...), (2, 1)) # 2 x 40000 + rf_var = permutedims(reduce(vcat, [x' for x in rf_var_temp]), (2, 1)) # 2 x 40000 mean_grid = reshape(rf_mean[y_i, :], n_pts, n_pts) # 2 x 40000 if plot_flag diff --git a/examples/GCM/Project.toml b/examples/GCM/Project.toml index aba51ad3c..73fa40b73 100644 --- a/examples/GCM/Project.toml +++ b/examples/GCM/Project.toml @@ -10,6 +10,7 @@ PyPlot = "d330b81b-6aea-500a-939a-2ce795aea3ee" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd" [compat] FiniteDiff = "~2.10" diff --git a/examples/GCM/emulate_sample_script.jl b/examples/GCM/emulate_sample_script.jl index 0c8607554..4c0f9ca9c 100644 --- a/examples/GCM/emulate_sample_script.jl +++ b/examples/GCM/emulate_sample_script.jl @@ -1,38 +1,37 @@ # Script to run Emulation and Sampling on data from GCM # Import modules -using Distributions # probability distributions and associated functions +using Distributions using LinearAlgebra +ENV["GKSwstype"] = "100" using Plots +using Plots.PlotMeasures # for mm +using StatsPlots using Random using JLD2 # CES using CalibrateEmulateSample.Emulators using CalibrateEmulateSample.MarkovChainMonteCarlo using CalibrateEmulateSample.ParameterDistributions +using CalibrateEmulateSample.EnsembleKalmanProcesses using CalibrateEmulateSample.DataContainers +using CalibrateEmulateSample.EnsembleKalmanProcesses.Localizers -@time begin +function main() rng_seed = 2413798 Random.seed!(rng_seed) - # expname = "vrf-nondiag-logdet_newprior" - #emulator_type ∈ ["GPR","ScalarRFR","VectorRFR-svd-diag","VectorRFR-svd-nondiag", "VectorRFR-nondiag"] - # emulator_type = "GPR" - # expname = "gpr" - - # emulator_type = "ScalarRFR" - # expname = "srf" + # CHOOSE YOUR CASE + case = 6 - # emulator_type = "VectorRFR-svd-diag" - # expname = "vrf-svd-diag" + emulator_types = + ["GPR", "ScalarRFR", "VectorRFR-svd-diag", "VectorRFR-svd-nondiag", "VectorRFR-nondiag", "VectorRFR-svd-nonsep"] - # emulator_type = "VectorRFR-svd-nondiag" - # expname = "vrf-svd-nondiag" + expnames = 
["gpr", "srf", "vrf-svd-diag", "vrf-svd-nondiag", "vrf-nondiag_standardized", "vrf-svd-nonsep"] - emulator_type = "VectorRFR-nondiag" - expname = "vrf-nondiag_standardized" + emulator_type = emulator_types[case] + expname = expnames[case] # Output figure save directory example_directory = @__DIR__ @@ -54,11 +53,11 @@ using CalibrateEmulateSample.DataContainers obs_noise_cov = load(datafile)["obs_noise_cov"] # 96 x 96 #take only first 400 points - iter_mask = 1:4 - #data_mask = 1:32 + iter_mask = [1, 2, 4] + data_mask = 1:96 # data_mask= 33:64 # data_mask= 65:96 - data_mask = 1:96 + #data_mask = 1:96 #data_mask = [5*i for i = 1:Int(floor(96/5))] inputs = inputs[:, :, iter_mask] @@ -66,8 +65,6 @@ using CalibrateEmulateSample.DataContainers obs_noise_cov = obs_noise_cov[data_mask, data_mask] truth = truth[data_mask] - # priorfile = "priors.jld2" - # prior = load(priorfile)["prior"] # derived quantities N_ens, input_dim, N_iter = size(inputs) @@ -79,7 +76,19 @@ using CalibrateEmulateSample.DataContainers normalized = true # setup random features - eki_options_override = Dict("tikhonov" => 0, "multithread" => "ensemble") #faster than tullio multithread for training + eki_options_override = Dict( + "verbose" => true, + "tikhonov" => 0.0, + "scheduler" => DataMisfitController(on_terminate = "continue"), + "n_iteration" => 10, + "multithread" => "ensemble", + "train_fraction" => 0.95, + "inflation" => 0.0, + "cov_sample_multiplier" => 0.5, + "n_ensemble" => 50, + "localization" => SEC(1.0), + ) + if emulator_type == "VectorRFR-svd-nondiag" || emulator_type == "VectorRFR-nondiag" @@ -89,23 +98,46 @@ using CalibrateEmulateSample.DataContainers println("Running Vector RF model - without SVD and assuming non-diagonal variance ") end - n_features = 80 * Int(floor(5 * sqrt(N_ens * N_iter))) + n_features = 20 * Int(floor(5 * sqrt(N_ens * N_iter))) #80 * println("build RF with $(N_ens*N_iter) training points and $(n_features) random features.") + kernel_structure = SeparableKernel(LowRankFactor(2), LowRankFactor(3)) + mlt = VectorRandomFeatureInterface( + n_features, + input_dim, + output_dim, + kernel_structure = kernel_structure, + optimizer_options = eki_options_override, + ) + elseif emulator_type == "VectorRFR-svd-nonsep" || emulator_type == "VectorRFR-nonsep" + if emulator_type == "VectorRFR-svd-nondiag" + println("Running Vector RF model with nonseparable kernel - using SVD") + elseif emulator_type == "VectorRFR-nonsep" + println("Running Vector RF model with nonseparable kernel - without SVD") + end - mlt = VectorRandomFeatureInterface(n_features, input_dim, output_dim, optimizer_options = eki_options_override) + n_features = 20 * Int(floor(5 * sqrt(N_ens * N_iter))) #80 * + println("build RF with $(N_ens*N_iter) training points and $(n_features) random features.") + kernel_structure = NonseparableKernel(LowRankFactor(5)) + mlt = VectorRandomFeatureInterface( + n_features, + input_dim, + output_dim, + kernel_structure = kernel_structure, + optimizer_options = eki_options_override, + ) elseif emulator_type == "VectorRFR-svd-diag" println("Running Vector RF model - using SVD and assuming diagonal variance") - n_features = 20 * Int(floor(5 * sqrt(N_ens * N_iter))) + n_features = 5 * Int(floor(5 * sqrt(N_ens * N_iter))) #20 * println("build RF with $(N_ens*N_iter) training points and $(n_features) random features.") - + kernel_structure = SeparableKernel(LowRankFactor(2), DiagonalFactor(1e-8)) mlt = VectorRandomFeatureInterface( n_features, input_dim, output_dim, - diagonalize_output = true, + 
kernel_structure = kernel_structure, optimizer_options = eki_options_override, ) @@ -231,8 +263,75 @@ using CalibrateEmulateSample.DataContainers end end - end + end # plots + + ### MCMC + + priorfile = "priors.jld2" + prior = load(priorfile)["prior"] + + ## + ### Sample: Markov Chain Monte Carlo + ### + # initial values + u0 = vec(mean(get_inputs(input_output_pairs), dims = 2)) + println("initial parameters: ", u0) + + # First let's run a short chain to determine a good step size + mcmc = MCMCWrapper(RWMHSampling(), truth, prior, emulator; init_params = u0) + new_step = optimize_stepsize(mcmc; init_stepsize = 0.1, N = 2000, discard_initial = 0) + + # Now begin the actual MCMC + println("Begin MCMC - with step size ", new_step) + chain = MarkovChainMonteCarlo.sample(mcmc, 100_000; stepsize = new_step, discard_initial = 2_000) + posterior = MarkovChainMonteCarlo.get_posterior(mcmc, chain) + + post_mean = mean(posterior) + post_cov = cov(posterior) + println("post_mean") + println(post_mean) + println("post_cov") + println(post_cov) + println("D util") + println(det(inv(post_cov))) + println(" ") + # plot posterior + truth_params = [log(0.7 / 0.3) log(7200)]' # parameter value (at truth) - unconstrained + + # Save data + save( + joinpath(data_save_directory, expname * "posterior.jld2"), + "posterior", + posterior, + "input_output_pairs", + input_output_pairs, + "truth_params", + truth_params, + ) + + + constrained_truth_params = transform_unconstrained_to_constrained(posterior, truth_params) + param_names = get_name(posterior) + + posterior_samples = vcat([get_distribution(posterior)[name] for name in param_names]...) #samples are columns + constrained_posterior_samples = + mapslices(x -> transform_unconstrained_to_constrained(posterior, x), posterior_samples, dims = 1) + + gr(dpi = 300, size = (300, 300)) + p = cornerplot(permutedims(constrained_posterior_samples, (2, 1)), label = param_names, compact = true) + plot!(p.subplots[1], [constrained_truth_params[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # vline on top histogram + plot!(p.subplots[3], [constrained_truth_params[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) # hline on right histogram + plot!(p.subplots[2], [constrained_truth_params[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # v & h line on scatter. + plot!(p.subplots[2], [constrained_truth_params[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) + figpath = joinpath(figure_save_directory, "plot_posterior_" * expname * ".png") + savefig(figpath) + + +end # main + +@time begin + main() end # for @time diff --git a/examples/GCM/sbatch_emulate_sample_jl b/examples/GCM/sbatch_emulate_sample_jl index efbdb391c..8807f0e9a 100644 --- a/examples/GCM/sbatch_emulate_sample_jl +++ b/examples/GCM/sbatch_emulate_sample_jl @@ -1,6 +1,6 @@ #!/bin/bash #Submit this script with: sbatch thefilename -#SBATCH --time=6:00:00 # walltime +#SBATCH --time=12:00:00 # walltime #SBATCH --ntasks-per-node=1 # number of processor cores (i.e. 
tasks) #SBATCH --cpus-per-task=28 #SBATCH --mem-per-cpu=6000 diff --git a/examples/Lorenz/GModel_common.jl b/examples/Lorenz/GModel_common.jl index d3a99de5c..1954cae29 100644 --- a/examples/Lorenz/GModel_common.jl +++ b/examples/Lorenz/GModel_common.jl @@ -128,7 +128,8 @@ end # tend: End of simulation Float64(1), nstep: function lorenz_solve(settings::LSettings, params::LParams) # Initialize - nstep = Int32(ceil((settings.tend - settings.t_start) / settings.dt)) + # nstep = Int32(ceil((settings.tend - settings.t_start) / settings.dt)) + nstep = Int32(ceil(settings.tend / settings.dt)) xn = zeros(Float64, settings.N, nstep) t = zeros(Float64, nstep) # Initial perturbation @@ -137,7 +138,7 @@ function lorenz_solve(settings::LSettings, params::LParams) Xbuffer = zeros(4, settings.N) # March forward in time for j in 1:nstep - t[j] = settings.t_start + settings.dt * j + t[j] = settings.dt * j #use view to update a slice # RK4! modifies first and last arguments if settings.dynamics == 1 diff --git a/examples/Lorenz/calibrate.jl b/examples/Lorenz/calibrate.jl index a799899a0..17e1b1242 100644 --- a/examples/Lorenz/calibrate.jl +++ b/examples/Lorenz/calibrate.jl @@ -15,6 +15,34 @@ using CalibrateEmulateSample const EKP = CalibrateEmulateSample.EnsembleKalmanProcesses const PD = EKP.ParameterDistributions + +function shrinkage_cov(sample_mat::AA) where {AA <: AbstractMatrix} + n_out, n_sample = size(sample_mat) + + # de-mean (as we will use the samples directly for calculation of β) + sample_mat_zeromean = sample_mat .- mean(sample_mat, dims = 2) + # Ledoit Wolf shrinkage to I + + # get sample covariance + Γ = cov(sample_mat_zeromean, dims = 2) + # estimate opt shrinkage + μ_shrink = 1 / n_out * tr(Γ) + δ_shrink = norm(Γ - μ_shrink * I)^2 / n_out # (scaled) frob norm of Γ_m + # once de-meaned, we need to correct the sample covariance with an n_sample -> n_sample-1 factor + β_shrink = sum([norm(c * c' - Γ)^2 / n_out for c in eachcol(sample_mat_zeromean)]) / (n_sample - 1)^2 + + γ_shrink = min(β_shrink / δ_shrink, 1) # clipping is typically rare + # γμI + (1-γ)Γ + Γ .*= (1 - γ_shrink) + for i in 1:n_out + Γ[i, i] += γ_shrink * μ_shrink + end + + @info "Shrinkage scale: $(γ_shrink), (0 = none, 1 = revert to scaled Identity)\n shrinkage covariance condition number: $(cond(Γ))" + return Γ +end + + function main() rng_seed = 4137 @@ -63,10 +91,6 @@ function main() n_param = length(param_names) params_true = reshape(params_true, (n_param, 1)) - println(n_param) - println(params_true) - - ### ### Define the parameter priors ### @@ -97,7 +121,7 @@ function main() N = 36 dt = 1 / 64.0 # Start of integration - t_start = 800.0 + t_start = 100.0#800.0 # We now rescale all the timings etc, to be in nondim-time # Data collection length if dynamics == 1 @@ -110,7 +134,7 @@ function main() # Integration length Tfit = Ts_days / τc # Initial perturbation - Fp = rand(Normal(0.0, 0.01), N) + Fp = rand(rng, Normal(0.0, 0.01), N) kmax = 1 # Prescribe variance or use a number of forward passes to define true interval variability var_prescribe = false @@ -148,40 +172,61 @@ function main() μ = zeros(length(gt)) # Add noise for i in 1:n_samples - yt[:, i] = gt .+ rand(MvNormal(μ, Γy)) + yt[:, i] = gt .+ rand(rng, MvNormal(μ, Γy)) end - println(Γy) else println("Using truth values to compute covariance") n_samples = 100 - nthreads = Threads.nthreads() - yt = zeros(nthreads, length(gt), n_samples) - Threads.@threads for i in 1:n_samples - - #= - lorenz_settings_local = GModel.LSettings( - dynamics, - stats_type, - t_start + T * (i
- 1), - T, - Ts, - Tfit, - Fp, - N, - dt, - t_start + T * (i - 1) + T, - kmax, - ω_fixed, - ω_true, - )=# - lorenz_settings_local = lorenz_settings - tid = Threads.threadid() - yt[tid, :, i] = GModel.run_G_ensemble(params_true, lorenz_settings_local) + yt = zeros(length(gt), n_samples) + for i in 1:n_samples + + # Fp changes IC, then run from 0 to t_start + T, + # then gathering stats over [t_start, t_start+T] + Fp = rand(rng, Normal(0.0, 0.01), N) + lorenz_settings_local = GModel.LSettings( + dynamics, + stats_type, + t_start, # data collection start start time + T, #data collection length + Ts, # integration_length + Tfit, # number of Ts to fit over + Fp, + N, + dt, + t_start + T, #sim end time + kmax, + ω_fixed, + ω_true, + ) + + #= lorenz_settings_local = GModel.LSettings( + dynamics, + stats_type, + t_start + T * (i - 1), #sim start time + T, #data collection length + Ts, # integration_length + Tfit, # number of Ts to fit over + Fp, + N, + dt, + t_start + T * (i - 1) + T , #sim end time + kmax, + ω_fixed, + ω_true, + )=# + yt[:, i] = GModel.run_G_ensemble(params_true, lorenz_settings_local) + end - yt = dropdims(sum(yt, dims = 1), dims = 1) + gr(dpi = 300, size = (400, 400)) + KK = plot(yt[1:end, 1:100], legend = false) + savefig(KK, joinpath(figure_save_directory, "data_samples.png")) + + # [1.] use shrinkage for regularization + Γy = shrinkage_cov(yt) - # add a little extra variance for regularization here + #= + # [2.] add fixed variance for regularization noise_sd = 0.1 Γy_diag = noise_sd .^ 2 * convert(Array, I(size(yt, 1))) μ = zeros(size(yt, 1)) @@ -192,8 +237,8 @@ function main() #Γy = convert(Array, Diagonal(dropdims(mean((yt.-mean(yt,dims=1)).^2,dims=1),dims=1))) # Covariance of truth data Γy = cov(yt, dims = 2) + =# - println(Γy) end diff --git a/examples/Lorenz/emulate_sample.jl b/examples/Lorenz/emulate_sample.jl index 7ea16c67c..a42cd5c5a 100644 --- a/examples/Lorenz/emulate_sample.jl +++ b/examples/Lorenz/emulate_sample.jl @@ -4,6 +4,7 @@ include(joinpath(@__DIR__, "..", "ci", "linkfig.jl")) # Import modules using Distributions # probability distributions and associated functions using LinearAlgebra +ENV["GKSwstype"] = "100" using StatsPlots using Plots using Random @@ -43,25 +44,22 @@ function main() "RF-vector-svd-nondiag", "RF-vector-nosvd-diag", "RF-vector-nosvd-nondiag", + "RF-vector-svd-nonsep", ] #### CHOOSE YOUR CASE: - mask = 2:7 - # One day we can use good heuristics here - # Currently set so that learnt hyperparameters stays relatively far inside the prior. (use "verbose" => true in optimizer overrides to see this info) - prior_scalings = [[0.0, 0.0], [1e-2, 0.0], [1e-1, 0.0], [1e-2, 1e-2], [1e-2, 1e-1], [1e-2, 1e-2], [1e-2, 1e-2]] - for (case, scaling) in zip(cases[mask], prior_scalings[mask]) + mask = 2:7 # e.g. 
1:8 or [7] + for (case) in cases[mask] + - #case = cases[7] println("case: ", case) - max_iter = 6 # number of EKP iterations to use data from is at most this + min_iter = 1 + max_iter = 5 # number of EKP iterations to use data from is at most this overrides = Dict( "verbose" => true, - "train_fraction" => 0.8, - "n_iteration" => 40, - "scheduler" => DataMisfitController(), - "prior_in_scale" => scaling[1], - "prior_out_scale" => scaling[2], + "scheduler" => DataMisfitController(terminate_at = 100.0), + "cov_sample_multiplier" => 1.0, + "n_iteration" => 20, ) # we do not want termination, as our priors have relatively little interpretation @@ -93,12 +91,12 @@ function main() ekiobj = load(data_save_file)["eki"] priors = load(data_save_file)["priors"] - truth_sample = load(data_save_file)["truth_sample"] truth_sample_mean = load(data_save_file)["truth_sample_mean"] + truth_sample = load(data_save_file)["truth_sample"] truth_params_constrained = load(data_save_file)["truth_input_constrained"] #true parameters in constrained space truth_params = transform_constrained_to_unconstrained(priors, truth_params_constrained) Γy = ekiobj.obs_noise_cov - println(Γy) + n_params = length(truth_params) # "input dim" output_dim = size(Γy, 1) @@ -108,6 +106,7 @@ function main() # Emulate-sample settings # choice of machine-learning tool + nugget = 0.001 if case == "GP" gppackage = Emulators.GPJL() pred_type = Emulators.YType() @@ -118,26 +117,43 @@ function main() noise_learn = false, ) elseif case ∈ ["RF-scalar", "RF-scalar-diagin"] - n_features = 300 - diagonalize_input = case == "RF-scalar-diagin" ? true : false + n_features = 100 + kernel_structure = + case == "RF-scalar-diagin" ? SeparableKernel(DiagonalFactor(nugget), OneDimFactor()) : + SeparableKernel(LowRankFactor(2, nugget), OneDimFactor()) mlt = ScalarRandomFeatureInterface( n_features, n_params, rng = rng, - diagonalize_input = diagonalize_input, + kernel_structure = kernel_structure, optimizer_options = overrides, ) elseif case ∈ ["RF-vector-svd-diag", "RF-vector-nosvd-diag", "RF-vector-svd-nondiag", "RF-vector-nosvd-nondiag"] # do we want to assume that the outputs are decorrelated in the machine-learning problem? - diagonalize_output = case ∈ ["RF-vector-svd-diag", "RF-vector-nosvd-diag"] ? true : false - n_features = 300 + kernel_structure = + case ∈ ["RF-vector-svd-diag", "RF-vector-nosvd-diag"] ? + SeparableKernel(LowRankFactor(2, nugget), DiagonalFactor(nugget)) : + SeparableKernel(LowRankFactor(2, nugget), LowRankFactor(3, nugget)) + n_features = 500 mlt = VectorRandomFeatureInterface( n_features, n_params, output_dim, rng = rng, - diagonalize_output = diagonalize_output, # assume outputs are decorrelated + kernel_structure = kernel_structure, + optimizer_options = overrides, + ) + elseif case ∈ ["RF-vector-svd-nonsep"] + kernel_structure = NonseparableKernel(LowRankFactor(3, nugget)) + n_features = 500 + + mlt = VectorRandomFeatureInterface( + n_features, + n_params, + output_dim, + rng = rng, + kernel_structure = kernel_structure, optimizer_options = overrides, ) end @@ -146,21 +162,41 @@ function main() # Use median over all data since all data are the same type truth_sample_norm = vcat(truth_sample...) norm_factor = get_standardizing_factors(truth_sample_norm) - println(size(norm_factor)) #norm_factor = vcat(norm_factor...) 
norm_factor = fill(norm_factor, size(truth_sample)) - println("Standardization factors") - println(norm_factor) # Get training points from the EKP iteration number in the second input term N_iter = min(max_iter, length(get_u(ekiobj)) - 1) # number of paired iterations taken from EKP - - input_output_pairs = Utilities.get_training_points(ekiobj, N_iter - 1) # 1:N-1 - - input_output_pairs_test = Utilities.get_training_points(ekiobj, N_iter:N_iter) # just "next" iteration used for testing + min_iter = min(max_iter, max(1, min_iter)) + input_output_pairs = Utilities.get_training_points(ekiobj, min_iter:(N_iter - 1)) + input_output_pairs_test = Utilities.get_training_points(ekiobj, N_iter:(length(get_u(ekiobj)) - 1)) # "next" iterations # Save data @save joinpath(data_save_directory, "input_output_pairs.jld2") input_output_pairs + # plot training points in constrained space + if case == cases[mask[1]] + gr(dpi = 300, size = (400, 400)) + inputs_unconstrained = get_inputs(input_output_pairs) + inputs_constrained = transform_unconstrained_to_constrained(priors, inputs_unconstrained) + p = plot( + title = "training points", + xlims = extrema(inputs_constrained[1, :]), + xaxis = "F", + yaxis = "A", + ylims = extrema(inputs_constrained[2, :]), + ) + scatter!(p, inputs_constrained[1, :], inputs_constrained[2, :], color = :magenta, label = false) + inputs_test_unconstrained = get_inputs(input_output_pairs_test) + inputs_test_constrained = transform_unconstrained_to_constrained(priors, inputs_test_unconstrained) + + scatter!(p, inputs_test_constrained[1, :], inputs_test_constrained[2, :], color = :black, label = false) + + vline!(p, [truth_params_constrained[1]], linestyle = :dash, linecolor = :red, label = false) + hline!(p, [truth_params_constrained[2]], linestyle = :dash, linecolor = :red, label = false) + savefig(p, joinpath(figure_save_directory, "training_points.pdf")) + savefig(p, joinpath(figure_save_directory, "training_points.png")) + end + standardize = false retained_svd_frac = 1.0 normalized = true @@ -190,11 +226,11 @@ function main() println("ML prediction on true parameters: ") println(vec(y_mean)) println("true data: ") - println(truth_sample_mean) # same, regardless of norm_factor - println(" ML predicted variance") - println(diag(y_var[1], 0)) + println(truth_sample) # what was used as truth + println(" ML predicted standard deviation") + println(sqrt.(diag(y_var[1], 0))) println("ML MSE (truth): ") - println(mean((truth_sample_mean - vec(y_mean)) .^ 2)) + println(mean((truth_sample - vec(y_mean)) .^ 2)) println("ML MSE (next ensemble): ") println(mean((get_outputs(input_output_pairs_test) - y_mean_test) .^ 2)) @@ -207,13 +243,13 @@ function main() println("initial parameters: ", u0) # First let's run a short chain to determine a good step size - truth_sample = truth_sample mcmc = MCMCWrapper(RWMHSampling(), truth_sample, priors, emulator; init_params = u0) new_step = optimize_stepsize(mcmc; init_stepsize = 0.1, N = 2000, discard_initial = 0) # Now begin the actual MCMC println("Begin MCMC - with step size ", new_step) chain = MarkovChainMonteCarlo.sample(mcmc, 100_000; stepsize = new_step, discard_initial = 2_000) + posterior = MarkovChainMonteCarlo.get_posterior(mcmc, chain) post_mean = mean(posterior) @@ -226,7 +262,6 @@ function main() println(det(inv(post_cov))) println(" ") - constrained_truth_params = transform_unconstrained_to_constrained(posterior, truth_params) param_names = get_name(posterior) posterior_samples = vcat([get_distribution(posterior)[name] for name in 
get_name(posterior)]...) #samples are columns @@ -235,10 +270,12 @@ function main() gr(dpi = 300, size = (300, 300)) p = cornerplot(permutedims(constrained_posterior_samples, (2, 1)), label = param_names, compact = true) - plot!(p.subplots[1], [constrained_truth_params[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # vline on top histogram - plot!(p.subplots[3], [constrained_truth_params[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) # hline on right histogram - plot!(p.subplots[2], [constrained_truth_params[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # v & h line on scatter. - plot!(p.subplots[2], [constrained_truth_params[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) + plot!(p.subplots[1], [truth_params_constrained[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # vline on top histogram + plot!(p.subplots[3], [truth_params_constrained[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) # hline on right histogram + plot!(p.subplots[2], [truth_params_constrained[1]], seriestype = "vline", w = 1.5, c = :steelblue, ls = :dash) # v & h line on scatter. + plot!(p.subplots[2], [truth_params_constrained[2]], seriestype = "hline", w = 1.5, c = :steelblue, ls = :dash) + figpath = joinpath(figure_save_directory, "posterior_2d-" * case * ".pdf") + savefig(figpath) figpath = joinpath(figure_save_directory, "posterior_2d-" * case * ".png") savefig(figpath) diff --git a/src/Emulator.jl b/src/Emulator.jl index 8c6f7a2d8..985d59dee 100644 --- a/src/Emulator.jl +++ b/src/Emulator.jl @@ -132,7 +132,6 @@ function Emulator( svd_transform(training_outputs, obs_noise_cov, retained_svd_frac = retained_svd_frac) training_pairs = PairedDataContainer(training_inputs, decorrelated_training_outputs) - # [4.] build an emulator build_models!(machine_learning_tool, training_pairs) else @@ -141,7 +140,6 @@ function Emulator( end decomposition = nothing training_pairs = PairedDataContainer(training_inputs, training_outputs) - # [4.] build an emulator - providing the noise covariance as a Tikhonov regularizer build_models!(machine_learning_tool, training_pairs, regularization_matrix = obs_noise_cov) end @@ -258,13 +256,20 @@ function predict( # [4.] unstandardize return reverse_standardize(emulator, s_outputs, s_output_cov) else #... and want to stay in decorrelated standardized coordinates - diag_out_flag = get_diagonalize_output(get_machine_learning_tool(emulator)) - if diag_out_flag - ds_output_diagvar = vec([Diagonal(ds_output_covvec[j]) for j in 1:N_samples]) # extracts diagonal - if output_dim == 1 + + # special cases where you only return variances and not covariances, could be better acheived with dispatch... 
+ kernel_structure = get_kernel_structure(get_machine_learning_tool(emulator)) + if nameof(typeof(kernel_structure)) == :SeparableKernel + output_cov_structure = get_output_cov_structure(kernel_structure) + if nameof(typeof(output_cov_structure)) == :DiagonalFactor + ds_output_diagvar = vec([Diagonal(ds_output_covvec[j]) for j in 1:N_samples]) # extracts diagonal + return ds_outputs, ds_output_diagvar + elseif nameof(typeof(output_cov_structure)) == :OneDimFactor + ds_output_diagvar = reshape([ds_output_covvec[i][1, 1] for i in 1:N_samples], 1, :) # extracts value + return ds_outputs, ds_output_diagvar + else + return ds_outputs, ds_output_covvec + end - return ds_outputs, ds_output_diagvar else return ds_outputs, ds_output_covvec end @@ -434,7 +439,7 @@ function svd_transform( decomposition = SVD(decomp.U[:, 1:k], decomp.S[1:k], decomp.Vt[1:k, :]) else transformed_data = sqrt_singular_values_inv * decomp.Vt * data - decomposition = svd(obs_noise_cov) + decomposition = decomp end return transformed_data, decomposition end diff --git a/src/RandomFeature.jl b/src/RandomFeature.jl index e04fde7eb..f6f793ad9 100644 --- a/src/RandomFeature.jl +++ b/src/RandomFeature.jl @@ -8,15 +8,117 @@ using ..Utilities using StableRNGs using Random -export calculate_n_hyperparameters, build_default_prior +export calculate_n_hyperparameters, hyperparameters_from_flat, build_default_prior, cov_structure_from_string +export CovarianceStructureType, OneDimFactor, DiagonalFactor, CholeskyFactor, LowRankFactor, HierarchicalLowRankFactor +export SeparableKernel, NonseparableKernel +export get_input_cov_structure, get_output_cov_structure, get_cov_structure +export get_eps +export rank +export shrinkage_cov + abstract type RandomFeatureInterface <: MachineLearningTool end +# Different types of threading abstract type MultithreadType end struct TullioThreading <: MultithreadType end struct EnsembleThreading <: MultithreadType end -#common functions + +# Different types of covariance representation in the prior +abstract type CovarianceStructureType end +struct OneDimFactor <: CovarianceStructureType end +struct DiagonalFactor{FT <: AbstractFloat} <: CovarianceStructureType + eps::FT +end +DiagonalFactor() = DiagonalFactor(Float64(1.0)) +get_eps(df::DiagonalFactor) = df.eps + +struct CholeskyFactor{FT <: AbstractFloat} <: CovarianceStructureType + eps::FT +end +CholeskyFactor() = CholeskyFactor(Float64(1.0)) +get_eps(cf::CholeskyFactor) = cf.eps + +struct LowRankFactor{FT <: AbstractFloat} <: CovarianceStructureType + rank::Int + eps::FT +end +LowRankFactor(r::Int) = LowRankFactor(r, Float64(1.0)) +get_eps(lrf::LowRankFactor) = lrf.eps + +struct HierarchicalLowRankFactor{FT <: AbstractFloat} <: CovarianceStructureType + #cst::CovarianceStructureType + rank::Int + eps::FT +end + +HierarchicalLowRankFactor(r::Int) = HierarchicalLowRankFactor(r, Float64(1.0)) +get_eps(hlrf::HierarchicalLowRankFactor) = hlrf.eps + +import LinearAlgebra: rank +rank(lrf::LowRankFactor) = lrf.rank +rank(hlrf::HierarchicalLowRankFactor) = hlrf.rank + +# To add a new cov type `T` one must define these 3 methods: +# hyperparameters_from_flat(x::V, d::Int, t::T) where {V <: AbstractVector} +# calculate_n_hyperparameters(d::Int, t::T) +# build_default_prior(name::SS, d::Int, t::T) where {SS <: AbstractString} +# and add a string id to cov_structure_from_string + +function cov_structure_from_string(s::S, d::Int) where {S <: AbstractString} + if s == "onedim" + return OneDimFactor() + elseif s == "diagonal" + return DiagonalFactor() + elseif s == "cholesky" + 
return CholeskyFactor() + elseif s == "lowrank" + return LowRankFactor(Int(ceil(sqrt(d)))) + elseif s == "hierlowrank" + return HierarchicalLowRankFactor(Int(ceil(sqrt(d)))) + else + throw( + ArgumentError( + "Received unknown `input_cov_structure` keyword $(s), \n please choose from [\"onedim\",\"diagonal\",\"cholesky\", \"lowrank\"] or provide a CovarianceStructureType", + ), + ) + end +end +cov_structure_from_string(cst::CST, args...; kwargs...) where {CST <: CovarianceStructureType} = cst + + +# Different types of kernel +abstract type KernelStructureType end + +struct SeparableKernel{CST1 <: CovarianceStructureType, CST2 <: CovarianceStructureType} <: KernelStructureType + input_cov_structure::CST1 + output_cov_structure::CST2 +end +get_input_cov_structure(kernel_structure::SeparableKernel) = kernel_structure.input_cov_structure +get_output_cov_structure(kernel_structure::SeparableKernel) = kernel_structure.output_cov_structure + +struct NonseparableKernel{CST <: CovarianceStructureType} <: KernelStructureType + cov_structure::CST +end +get_cov_structure(kernel_structure::NonseparableKernel) = kernel_structure.cov_structure + + +# calculate_n_hyperparameters +calculate_n_hyperparameters(d::Int, odf::OneDimFactor) = 0 +calculate_n_hyperparameters(d::Int, df::DiagonalFactor) = d +calculate_n_hyperparameters(d::Int, cf::CholeskyFactor) = Int(d * (d + 1) / 2) +calculate_n_hyperparameters(d::Int, lrf::LowRankFactor) = Int(rank(lrf) + d * rank(lrf)) +calculate_n_hyperparameters(d::Int, hlrf::HierarchicalLowRankFactor) = + Int(rank(hlrf) * (rank(hlrf) + 1) / 2 + d * rank(hlrf)) + +# build from flat +hyperparameters_from_flat(x::V, odf::OneDimFactor) where {V <: AbstractVector} = nothing + +function hyperparameters_from_flat(x::V, df::DiagonalFactor) where {V <: AbstractVector} + return convert(Matrix, Diagonal(x) + get_eps(df) * I) +end + """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -33,185 +135,234 @@ function flat_to_chol(x::V) where {V <: AbstractVector} return cholmat end -""" -$(DocStringExtensions.TYPEDSIGNATURES) +function hyperparameters_from_flat(x::V, cf::CholeskyFactor) where {V <: AbstractVector} + chol = flat_to_chol(x) + return chol * permutedims(chol, (2, 1)) + get_eps(cf) * I -Builds Covariance matrices from a flat array of hyperparameters -""" -function hyperparameters_from_flat( - x::VV, - input_dim::Int, - output_dim::Int, - diagonalize_input::Bool, - diagonalize_output::Bool, -) where {VV <: AbstractVector} - # if dim = 1 then parameters are a 1x1 matrix - # if we diagonalize then parameters are diagonal entries + constant - # if we don't diagonalize then parameters are a cholesky product + constant on diagonal. 
- - #input space - if input_dim == 1 - in_max = 1 - U = x[in_max] * ones(1, 1) - elseif diagonalize_input - in_max = input_dim + 2 - U = convert(Matrix, x[in_max - 1] * (Diagonal(x[1:(in_max - 2)]) + x[in_max] * I)) - elseif !diagonalize_input - in_max = Int(input_dim * (input_dim + 1) / 2) + 2 - cholU = flat_to_chol(x[1:(in_max - 2)]) - U = x[in_max - 1] * (cholU * permutedims(cholU, (2, 1)) + x[in_max] * I) - end +end - # output_space - out_min = in_max + 1 - if output_dim == 1 - V = x[end] * ones(1, 1) - elseif diagonalize_output - V = convert(Matrix, x[end - 1] * (Diagonal(x[out_min:(end - 2)]) + x[end] * I)) - elseif !diagonalize_output - cholV = flat_to_chol(x[out_min:(end - 2)]) - V = x[end - 1] * (cholV * permutedims(cholV, (2, 1)) + x[end] * I) - end +function hyperparameters_from_flat(x::V, lrf::LowRankFactor) where {V <: AbstractVector} - # sometimes this process does not give a (numerically) symmetric matrix - U = 0.5 * (U + permutedims(U, (2, 1))) - V = 0.5 * (V + permutedims(V, (2, 1))) + r = rank(lrf) + d = Int(length(x) / r - 1) # l = r + d*r - return U, V + D = Diagonal(x[1:r]) # D acts like sing.vals.^2 + U = reshape(x[(r + 1):end], d, r) + qrU = qr(U) # qrU.Q is semi-orthogonal in first r columns, i.e. Q^T.Q = I + return qrU.Q[:, 1:r] * D * qrU.Q[:, 1:r]' + get_eps(lrf) * I end -function hyperparameters_from_flat(x::VV, input_dim::Int, diagonalize_input::Bool) where {VV <: AbstractVector} - output_dim = 1 - diagonalize_output = false - U, V = hyperparameters_from_flat(x, input_dim, output_dim, diagonalize_input, diagonalize_output) - # Note that the scalar setting has a slight inconsistency from the 1-D output vector case - # We correct here by rescaling U using the single hyperparameter in V - return V[1, 1] * U +function hyperparameters_from_flat(x::V, hlrf::HierarchicalLowRankFactor) where {V <: AbstractVector} + r = rank(hlrf) + d = Int(length(x) / r - (r + 1) / 2) # l = r^2 + d*r + + # Ambikasaran, O'Neil, Singh 2016 + L = flat_to_chol(x[1:Int(r * (r + 1) / 2)]) + U = reshape(x[Int(r * (r + 1) / 2 + 1):end], d, r) + K = L * permutedims(L, (2, 1)) + get_eps(hlrf) * I + + qrU = qr(U) + svdT = svd(I + qrU.R * K * qrU.R') # square so T = Q S Q^t + M = svdT.V * Diagonal(sqrt.(svdT.S)) # M M^t = Q sqrt(S) (Q sqrt(S))^t = Q sqrt(S) sqrt(S) Q^t = T + X = M - I + update_factor = I + qrU.Q * X * qrU.Q' + + return update_factor * update_factor' +end + + +build_default_prior(name::SS, n_hp::Int, odf::OneDimFactor) where {SS <: AbstractString} = nothing + +function build_default_prior(name::SS, n_hp::Int, df::DiagonalFactor) where {SS <: AbstractString} + return ParameterDistribution( + Dict( + "name" => "$(name)_diagonal", + "distribution" => VectorOfParameterized(repeat([Normal(0, 3)], n_hp)), + "constraint" => repeat([bounded_below(0.0)], n_hp), + ), + ) +end + +function build_default_prior(name::SS, n_hp::Int, df::CholeskyFactor) where {SS <: AbstractString} + return constrained_gaussian("$(name)_cholesky", 0.0, 5.0, -Inf, Inf, repeats = n_hp) end +function build_default_prior(name::SS, n_hp::Int, lrf::LowRankFactor) where {SS <: AbstractString} + r = rank(lrf) + d = Int(n_hp / r - 1) + D = ParameterDistribution( + Dict( + "name" => "$(name)_lowrank_diagonal", + "distribution" => VectorOfParameterized(repeat([Normal(0, 3)], r)), + "constraint" => repeat([bounded_below(0.0)], r), + ), + ) + U = constrained_gaussian("$(name)_lowrank_U", 0.0, 100.0, -Inf, Inf, repeats = Int(d * r)) + return combine_distributions([D, U]) +end + +function build_default_prior(name::SS, n_hp::Int, 
hlrf::HierarchicalLowRankFactor) where {SS <: AbstractString} + r = rank(hlrf) + d = Int(n_hp / r - (r + 1) / 2) + L = constrained_gaussian("$(name)_lowrank_Kchol", 0.0, 5.0, -Inf, Inf, repeats = Int(r * (r + 1) / 2)) + U = constrained_gaussian("$(name)_lowrank_U", 0.0, 100.0, -Inf, Inf, repeats = Int(d * r)) + return combine_distributions([L, U]) +end + +# combining input and output spaces: + """ $(DocStringExtensions.TYPEDSIGNATURES) -Builds a prior over the hyperparameters (i.e. the cholesky/diagaonal or individaul entries of the input/output covariances). -For example, the case where the input covariance ``U = γ_1 * (LL^T + γ_2 I)``, -we set priors for the entries of the lower triangular matrix ``L`` as normal, and constant scalings ``γ_i`` as log-normal to retain positivity. - -priors can be scaled by a constant factor by using the in_scale and out_scale keywords +Calculate number of hyperparameters required to create the default prior in the given input/output space dimensions (determined from the `CovarianceStructureType`) """ -function build_default_prior( +function calculate_n_hyperparameters( input_dim::Int, output_dim::Int, - diagonalize_input::Bool, - diagonalize_output::Bool; - in_scale = 1.0, - out_scale = 1.0, -) - n_hp_in = n_hyperparameters_cov(input_dim, diagonalize_input) - n_hp_out = n_hyperparameters_cov(output_dim, diagonalize_output) - - # aspect ratio for mean:sd for positive parameters in constrained_gaussian() - # 10 seems to give a decent range, this isn't very tuneable so the alternative would be to use the basic constructor - pos_asp_ratio = 10 - # if dim = 1 , positive constant - # elseif diagonalized, positive on diagonal and positive scalings - # else chol factor, and positive scalings - if input_dim > 1 - if diagonalize_input - param_diag = - constrained_gaussian("input_prior_diagonal", 1.0, pos_asp_ratio, 0.0, Inf, repeats = n_hp_in - 2) - param_scaling = - constrained_gaussian("input_prior_scaling", in_scale, in_scale * pos_asp_ratio, 0.0, Inf, repeats = 2) - input_prior = combine_distributions([param_diag, param_scaling]) - else - param_chol = constrained_gaussian("input_prior_cholesky", 0.0, 1.0, -Inf, Inf, repeats = n_hp_in - 2) - param_scaling = - constrained_gaussian("input_prior_scaling", in_scale, in_scale * pos_asp_ratio, 0.0, Inf, repeats = 2) - input_prior = combine_distributions([param_chol, param_scaling]) - end - else - input_prior = constrained_gaussian("input_prior", in_scale, in_scale, 0.0, Inf) - end - - if output_dim > 1 - if diagonalize_output - param_diag = - constrained_gaussian("output_prior_diagonal", 1.0, pos_asp_ratio, 0.0, Inf, repeats = n_hp_out - 2) - param_scaling = constrained_gaussian( - "output_prior_scaling", - out_scale, - out_scale * pos_asp_ratio, - 0.0, - Inf, - repeats = 2, - ) - output_prior = combine_distributions([param_diag, param_scaling]) - else - param_chol = constrained_gaussian("output_prior_cholesky", 0.0, 1.0, -Inf, Inf, repeats = n_hp_out - 2) - param_scaling = constrained_gaussian( - "output_prior_scaling", - out_scale, - out_scale * pos_asp_ratio, - 0.0, - Inf, - repeats = 2, - ) - output_prior = combine_distributions([param_chol, param_scaling]) - end - else - output_prior = constrained_gaussian("output_prior", out_scale, 10 * out_scale, 0.0, Inf) + kernel_structure::SK, +) where {SK <: SeparableKernel} + + input_cov_structure = get_input_cov_structure(kernel_structure) + n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) + output_cov_structure = 
get_output_cov_structure(kernel_structure) + n_hp_out = calculate_n_hyperparameters(output_dim, output_cov_structure) + n_scalings = 1 + + # if both in and output dim are "OneDimFactors" we still learn 1 hp + if n_hp_in + n_hp_out == 0 + return 1 + n_scalings end - return combine_distributions([input_prior, output_prior]) - + return n_hp_in + n_hp_out + n_scalings end -function build_default_prior(input_dim::Int, diagonalize_input::Bool; in_scale = 1.0) - #scalar case consistent with vector case +function calculate_n_hyperparameters(input_dim::Int, kernel_structure::KST) where {KST <: KernelStructureType} output_dim = 1 - diagonalize_output = false - out_scale = 1.0 - return build_default_prior( - input_dim, - output_dim, - diagonalize_input, - diagonalize_output, - in_scale = in_scale, - out_scale = out_scale, - ) + return calculate_n_hyperparameters(input_dim, output_dim, kernel_structure) end -""" -$(DocStringExtensions.TYPEDSIGNATURES) +function calculate_n_hyperparameters( + input_dim::Int, + output_dim::Int, + kernel_structure::NK, +) where {NK <: NonseparableKernel} + cov_structure = get_cov_structure(kernel_structure) + n_hp = calculate_n_hyperparameters(input_dim * output_dim, cov_structure) + n_scalings = 1 + # if both in and output dim are "OneDimFactors" we still learn 1 hp + if n_hp == 0 + return 1 + n_scalings + end -The number of hyperparameters required to characterize covariance (determined from `hyperparameters_from_flat`). -""" -function n_hyperparameters_cov(dim::Int, diagonalize::Bool) - n_hp = 0 - n_hp += diagonalize ? dim : Int(dim * (dim + 1) / 2) # inputs: diagonal or cholesky - n_hp += (dim > 1) ? 2 : 0 # add two more for constant diagonal in each case. - return n_hp + return n_hp + n_scalings end + + """ $(DocStringExtensions.TYPEDSIGNATURES) -Calculate number of hyperparameters required to create the default prior in the given input/output space (determined from `hyperparameters_from_flat`) +Builds a prior over the hyperparameters (i.e. the low-rank/cholesky/diagonal or individual entries of the input/output covariances). +For example, the case where the input covariance ``U = γ_1 * (LL^T + γ_2 I)``, +we set priors for the entries of the lower triangular matrix ``L`` as normal, and constant scalings ``γ_i`` as log-normal to retain positivity. 
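For concreteness, a brief usage sketch of the covariance-structure interface defined above (the dimensions, ranks and the module path `CalibrateEmulateSample.Emulators` are illustrative assumptions):

using CalibrateEmulateSample.Emulators

# rank-2 low-rank factor for a 5-dimensional input, diagonal factor (with small jitter) for a 3-dimensional output
in_structure = LowRankFactor(2)        # default eps = 1.0
out_structure = DiagonalFactor(1e-8)
kernel = SeparableKernel(in_structure, out_structure)

# hyperparameter count: (r + d * r) for the low-rank input, p for the diagonal output, plus 1 scaling
n_hp = calculate_n_hyperparameters(5, 3, kernel)   # (2 + 5 * 2) + 3 + 1 = 16
prior = build_default_prior(5, 3, kernel)          # ndims(prior) == n_hp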
""" -function calculate_n_hyperparameters(input_dim::Int, output_dim::Int, diagonalize_input::Bool, diagonalize_output::Bool) - n_hp_in = n_hyperparameters_cov(input_dim, diagonalize_input) - n_hp_out = n_hyperparameters_cov(output_dim, diagonalize_output) - return n_hp_in + n_hp_out +function build_default_prior(input_dim::Int, output_dim::Int, kernel_structure::SK) where {SK <: SeparableKernel} + input_cov_structure = get_input_cov_structure(kernel_structure) + output_cov_structure = get_output_cov_structure(kernel_structure) + + n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) + input_prior = build_default_prior("input", n_hp_in, input_cov_structure) + n_hp_out = calculate_n_hyperparameters(output_dim, output_cov_structure) + output_prior = build_default_prior("output", n_hp_out, output_cov_structure) + + scaling_kern = constrained_gaussian("sigma", 1, 5, 0, Inf) + + # We only use OneDimFactor values, default to input if both in and output dimension are one dimensional.Otherwise they are ignored + if isnothing(input_prior) && isnothing(output_prior) + onedim_prior = constrained_gaussian("onedim", 1.0, 1.0, 0.0, Inf) + return combine_distributions([onedim_prior, scaling_kern]) + elseif isnothing(input_prior) + return combine_distributions([output_prior, scaling_kern]) + elseif isnothing(output_prior) + return combine_distributions([input_prior, scaling_kern]) + else + return combine_distributions([input_prior, output_prior, scaling_kern]) + end end -function calculate_n_hyperparameters(input_dim::Int, diagonalize_input::Bool) - #scalar case consistent with vector case +function build_default_prior(input_dim::Int, kernel_structure::KST) where {KST <: KernelStructureType} output_dim = 1 - diagonalize_output = false - return calculate_n_hyperparameters(input_dim, output_dim, diagonalize_input, diagonalize_output) + return build_default_prior(input_dim, output_dim, kernel_structure) +end + +function build_default_prior(input_dim::Int, output_dim::Int, kernel_structure::NK) where {NK <: NonseparableKernel} + cov_structure = get_cov_structure(kernel_structure) + n_hp = calculate_n_hyperparameters(input_dim * output_dim, cov_structure) + if n_hp == 0 + pd_kern = constrained_gaussian("onedim", 1.0, 1.0, 0.0, Inf) + else + pd_kern = build_default_prior("full", n_hp, cov_structure) + end + + scaling_kern = constrained_gaussian("sigma", 1, 5, 0, Inf) + return combine_distributions([pd_kern, scaling_kern]) end +function hyperparameters_from_flat( + x::VV, + input_dim::Int, + output_dim::Int, + kernel_structure::SK, +) where {VV <: AbstractVector, SK <: SeparableKernel} + input_cov_structure = get_input_cov_structure(kernel_structure) + output_cov_structure = get_output_cov_structure(kernel_structure) + + n_hp_in = calculate_n_hyperparameters(input_dim, input_cov_structure) + U = hyperparameters_from_flat(x[1:n_hp_in], input_cov_structure) + V = hyperparameters_from_flat(x[(n_hp_in + 1):end], output_cov_structure) + + if isnothing(U) && isnothing(V) #i.e. 
both are from OneDimFactor cov structures + return x[1] * ones(1, 1), V + elseif isnothing(U) + return U, 0.5 * (V + permutedims(V, (2, 1))) + elseif isnothing(V) + return 0.5 * (U + permutedims(U, (2, 1))), V + else + # symmetrize (sometimes there are numerical artifacts) + U = 0.5 * (U + permutedims(U, (2, 1))) + V = 0.5 * (V + permutedims(V, (2, 1))) + return U, V + end +end + +function hyperparameters_from_flat( + x::VV, + input_dim::Int, + kernel_structure::SK, +) where {VV <: AbstractVector, SK <: SeparableKernel} + output_dim = 1 + U, V = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) + + return U +end + +function hyperparameters_from_flat( + x::VV, + input_dim::Int, + output_dim::Int, + kernel_structure::NK, +) where {VV <: AbstractVector, NK <: NonseparableKernel} + cov_structure = get_cov_structure(kernel_structure) + C = hyperparameters_from_flat(x, cov_structure) + if isnothing(C) + return x[1] * ones(1, 1) + else + return 0.5 * (C + permutedims(C, (2, 1))) + end + +end """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -252,7 +403,7 @@ function calculate_mean_cov_and_coeffs( otest = get_outputs(io_pairs)[:, (n_train + 1):end] input_dim = size(itrain, 1) output_dim = size(otrain, 1) - + n_test = size(itest, 2) # build and fit the RF rfm = RFM_from_hyperparameters( rfi, @@ -278,11 +429,15 @@ function calculate_mean_cov_and_coeffs( buffer, tullio_threading = thread_opt, ) + # sizes (output_dim x n_test), (output_dim x output_dim x n_test) - scaled_coeffs = sqrt(1 / n_features) * RF.Methods.get_coeffs(fitted_features) - # we add the additional complexity term - # TODO only cholesky and SVD available + ## TODO - the theory states that the following should be set: + # scaled_coeffs = sqrt(1 / (n_features)) * RF.Methods.get_coeffs(fitted_features) + # However the convergence is much improved with setting this to zero: + scaled_coeffs = 0 + + if decomp_type == "cholesky" chol_fac = RF.Methods.get_decomposition(RF.Methods.get_feature_factors(fitted_features)).L complexity = 2 * sum(log(chol_fac[i, i]) for i in 1:size(chol_fac, 1)) @@ -290,10 +445,47 @@ function calculate_mean_cov_and_coeffs( svd_singval = RF.Methods.get_decomposition(RF.Methods.get_feature_factors(fitted_features)).S complexity = sum(log, svd_singval) # note this is log(abs(det)) end + complexity = sqrt(abs(complexity)) #abs can introduce nonconvexity, + return scaled_coeffs, complexity end + + +""" +$(DocStringExtensions.TYPEDSIGNATURES) + +Calculate the empirical covariance, additionally applying a shrinkage operator (here the Ledoit Wolf 2004 shrinkage operation). 
Known to have better stability properties than Monte Carlo for low sample sizes +""" +function shrinkage_cov(sample_mat::AA) where {AA <: AbstractMatrix} + n_out, n_sample = size(sample_mat) + + # de-mean (as we will use the samples directly for calculation of β) + sample_mat_zeromean = sample_mat .- mean(sample_mat, dims = 2) + # Ledoit Wolf shrinkage to I + + # get sample covariance + Γ = cov(sample_mat_zeromean, dims = 2) + # estimate opt shrinkage + μ_shrink = 1 / n_out * tr(Γ) + δ_shrink = norm(Γ - μ_shrink * I)^2 / n_out # (scaled) frob norm of Γ_m + # once de-meaned, we need to correct the sample covariance with an n_sample -> n_sample-1 + β_shrink = sum([norm(c * c' - Γ)^2 / n_out for c in eachcol(sample_mat_zeromean)]) / (n_sample - 1)^2 + + γ_shrink = min(β_shrink / δ_shrink, 1) # clipping is typically rare + # γμI + (1-γ)Γ + Γ .*= (1 - γ_shrink) + for i in 1:n_out + Γ[i, i] += γ_shrink * μ_shrink + end + + @info "Shrinkage scale: $(γ_shrink), (0 = none, 1 = revert to scaled Identity)\n shrinkage covariance condition number: $(cond(Γ))" + return Γ +end + + + """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -377,14 +569,19 @@ function estimate_mean_and_coeffnorm_covariance( blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) end + sample_mat = vcat(blockmeans, coeffl2norm, complexity) + shrinkage = true + if shrinkage + Γ = shrinkage_cov(sample_mat) + else + Γ = cov(sample_mat, dims = 2) + end + if !isposdef(approx_σ2) println("approx_σ2 not posdef") approx_σ2 = posdef_correct(approx_σ2) end - Γ = cov(vcat(blockmeans, coeffl2norm, complexity), dims = 2) - - return Γ, approx_σ2 end @@ -579,16 +776,27 @@ function estimate_mean_and_coeffnorm_covariance( blockmeans[id, :] = permutedims(means[i, :, :], (2, 1)) end + + sample_mat = vcat(blockmeans, coeffl2norm, complexity) + shrinkage = true + if shrinkage + Γ = shrinkage_cov(sample_mat) + else + Γ = cov(sample_mat, dims = 2) + end + if !isposdef(approx_σ2) println("approx_σ2 not posdef") approx_σ2 = posdef_correct(approx_σ2) end - Γ = cov(vcat(blockmeans, coeffl2norm, complexity), dims = 2) return Γ, approx_σ2 end + + + function calculate_ensemble_mean_and_coeffnorm( rfi::RFI, rng::RNG, @@ -622,11 +830,6 @@ function calculate_ensemble_mean_and_coeffnorm( println("calculating " * string(N_ens * repeats) * " ensemble members...") nthreads = Threads.nthreads() - - means = zeros(output_dim, N_ens, n_test) - complexity = zeros(1, N_ens) - coeffl2norm = zeros(1, N_ens) - c_list = [zeros(1, N_ens) for i in 1:nthreads] cp_list = [zeros(1, N_ens) for i in 1:nthreads] m_list = [zeros(output_dim, N_ens, n_test) for i in 1:nthreads] diff --git a/src/ScalarRandomFeature.jl b/src/ScalarRandomFeature.jl index 0a3c0ae9c..a23586127 100644 --- a/src/ScalarRandomFeature.jl +++ b/src/ScalarRandomFeature.jl @@ -6,11 +6,12 @@ $(DocStringExtensions.TYPEDEF) Structure holding the Scalar Random Feature models. 
# Fields $(DocStringExtensions.TYPEDFIELDS) """ -struct ScalarRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG} <: RandomFeatureInterface +struct ScalarRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} <: + RandomFeatureInterface "vector of `RandomFeatureMethod`s, contains the feature structure, batch-sizes and regularization" rfms::Vector{RF.Methods.RandomFeatureMethod} "vector of `Fit`s, containing the matrix decomposition and coefficients of RF when fitted to data" @@ -23,8 +24,8 @@ struct ScalarRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG} <: input_dim::Int "choice of random number generator" rng::RNG - "Assume inputs are decorrelated for ML" - diagonalize_input::Bool + "Kernel structure type (e.g. Separable or Nonseparable)" + kernel_structure::KST "Random Feature decomposition, choose from \"svd\" or \"cholesky\" (default)" feature_decomposition::S "dictionary of options for hyperparameter optimizer" @@ -76,9 +77,9 @@ get_rng(srfi::ScalarRandomFeatureInterface) = srfi.rng """ $(DocStringExtensions.TYPEDSIGNATURES) -gets the diagonalize_input field +Gets the kernel_structure field """ -get_diagonalize_input(srfi::ScalarRandomFeatureInterface) = srfi.diagonalize_input +get_kernel_structure(srfi::ScalarRandomFeatureInterface) = srfi.kernel_structure """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -101,7 +102,7 @@ $(DocStringExtensions.TYPEDSIGNATURES) Constructs a `ScalarRandomFeatureInterface <: MachineLearningTool` interface for the `RandomFeatures.jl` package for multi-input and single- (or decorrelated-)output emulators. - `n_features` - the number of random features - `input_dim` - the dimension of the input space - - `diagonalize_input = false` - whether to assume independence in input space (Note, whens set `true`, this tool will approximate directly the behaviour of the scalar-valued GaussianProcess with ARD kernel) + - `kernel_structure` - a prescribed form of kernel structure - `batch_sizes = nothing` - Dictionary of batch sizes passed `RandomFeatures.jl` object (see definition there) - `rng = Random.GLOBAL_RNG` - random number generator - `feature_decomposition = "cholesky"` - choice of how to store decompositions of random features, `cholesky` or `svd` available @@ -110,7 +111,7 @@ Constructs a `ScalarRandomFeatureInterface <: MachineLearningTool` interface for - "prior_in_scale": use this to tune the input prior scale - "n_ensemble": number of ensemble members - "n_iteration": number of eki iterations - - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 1.0) + - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 0.0) - "scheduler": Learning rate Scheduler (a.k.a. 
EKP timestepper) Default: DataMisfitController - "tikhonov": tikhonov regularization parameter if >0 - "inflation": additive inflation ∈ [0,1] with 0 being no inflation @@ -121,22 +122,23 @@ Constructs a `ScalarRandomFeatureInterface <: MachineLearningTool` interface for function ScalarRandomFeatureInterface( n_features::Int, input_dim::Int; - diagonalize_input::Bool = false, + kernel_structure::Union{KST, Nothing} = nothing, batch_sizes::Union{Dict{S, Int}, Nothing} = nothing, rng::RNG = Random.GLOBAL_RNG, feature_decomposition::S = "cholesky", optimizer_options::Union{Dict{S}, Nothing} = nothing, -) where {S <: AbstractString, RNG <: AbstractRNG} +) where {S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} # Initialize vector for GP models rfms = Vector{RF.Methods.RandomFeatureMethod}(undef, 0) fitted_features = Vector{RF.Methods.Fit}(undef, 0) - if !isnothing(optimizer_options) - prior_in_scale = "prior_in_scale" ∈ keys(optimizer_options) ? optimizer_options["prior_in_scale"] : 1.0 - else - prior_in_scale = 1.0 + if isnothing(kernel_structure) + kernel_structure = SeparableKernel(cov_structure_from_string("lowrank", input_dim), OneDimFactor()) end - prior = build_default_prior(input_dim, diagonalize_input, in_scale = prior_in_scale) + KSType = typeof(kernel_structure) + + prior = build_default_prior(input_dim, kernel_structure) + # default optimizer settings optimizer_opts = Dict( "prior" => prior, #the hyperparameter_prior @@ -164,23 +166,60 @@ function ScalarRandomFeatureInterface( end @info("hyperparameter optimization with EKI configured with $opt_tmp") - return ScalarRandomFeatureInterface{S, RNG}( + return ScalarRandomFeatureInterface{S, RNG, KSType}( rfms, fitted_features, batch_sizes, n_features, input_dim, rng, - diagonalize_input, + kernel_structure, feature_decomposition, optimizer_opts, ) end +function hyperparameter_distribution_from_flat( + x::VV, + input_dim::Int, + kernel_structure::SK, +) where {VV <: AbstractVector, SK <: SeparableKernel} + + M = zeros(input_dim) #scalar output + U = hyperparameters_from_flat(x, input_dim, kernel_structure) + if !isposdef(U) + println("U not posdef - correcting") + U = posdef_correct(U) + end + + dist = MvNormal(M, U) + pd = ParameterDistribution( + Dict( + "distribution" => Parameterized(dist), + "constraint" => repeat([no_constraint()], input_dim), + "name" => "xi", + ), + ) + + return pd +end + +function hyperparameter_distribution_from_flat( + x::VV, + input_dim::Int, + kernel_structure::NK, +) where {VV <: AbstractVector, NK <: NonseparableKernel} + throw( + ArgumentError( + "Scalar Kernels must be of type `Separable( *** , OneDimFactor())`, received $(kernel_structure)", + ), + ) +end + """ $(DocStringExtensions.TYPEDSIGNATURES) -Builds the random feature method from hyperparameters. We use cosine activation functions and a MatrixVariateNormal(M,U,V) distribution (from `Distributions.jl`) with mean M=0, and input covariance U built from cholesky factorization or diagonal components based on the diagonalize_input flag. +Builds the random feature method from hyperparameters. We use cosine activation functions and a MatrixVariateNormal(M,U,V) distribution (from `Distributions.jl`) with mean M=0, and input covariance U built using a `CovarianceStructureType`. 
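As a usage sketch of the constructor described above (a minimal sketch; the synthetic data, feature count and noise covariance are illustrative, and the EKI hyperparameter optimization runs when the `Emulator` is built):

using CalibrateEmulateSample.Emulators
using CalibrateEmulateSample.DataContainers

# ARD-like scalar emulator: diagonal input covariance factor, one-dimensional output factor
ks = SeparableKernel(DiagonalFactor(1e-8), OneDimFactor())
srfi = ScalarRandomFeatureInterface(200, 2, kernel_structure = ks)

x = rand(2, 50)                                        # 2-d inputs, 50 samples (synthetic)
y = reshape(sin.(x[1, :]) .+ 0.1 * randn(50), 1, 50)   # 1-d noisy outputs (synthetic)
iopairs = PairedDataContainer(x, y, data_are_columns = true)

emulator = Emulator(srfi, iopairs, obs_noise_cov = 0.01 * ones(1, 1))
optimize_hyperparameters!(emulator)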
""" function RFM_from_hyperparameters( srfi::ScalarRandomFeatureInterface, @@ -198,33 +237,22 @@ function RFM_from_hyperparameters( S <: AbstractString, MT <: MultithreadType, } - diagonalize_input = get_diagonalize_input(srfi) - M = zeros(input_dim) #scalar output - U = hyperparameters_from_flat(l, input_dim, diagonalize_input) - if !isposdef(U) - println("U not posdef - correcting") - U = posdef_correct(U) - end + xi_hp = l[1:(end - 1)] + sigma_hp = l[end] + kernel_structure = get_kernel_structure(srfi) + pd = hyperparameter_distribution_from_flat(xi_hp, input_dim, kernel_structure) - dist = MvNormal(M, U) - pd = ParameterDistribution( - Dict( - "distribution" => Parameterized(dist), - "constraint" => repeat([no_constraint()], input_dim), - "name" => "xi", - ), - ) feature_sampler = RF.Samplers.FeatureSampler(pd, rng = rng) # Learn hyperparameters for different feature types - - sf = RF.Features.ScalarFourierFeature(n_features, feature_sampler) + feature_parameters = Dict("sigma" => sigma_hp) + sff = RF.Features.ScalarFourierFeature(n_features, feature_sampler, feature_parameters = feature_parameters) thread_opt = isa(multithread_type, TullioThreading) # if we want to multithread with tullio if isnothing(batch_sizes) - return RF.Methods.RandomFeatureMethod(sf, regularization = regularization, tullio_threading = thread_opt) + return RF.Methods.RandomFeatureMethod(sff, regularization = regularization, tullio_threading = thread_opt) else return RF.Methods.RandomFeatureMethod( - sf, + sff, regularization = regularization, batch_sizes = batch_sizes, tullio_threading = thread_opt, @@ -254,7 +282,7 @@ RFM_from_hyperparameters( """ $(DocStringExtensions.TYPEDSIGNATURES) -Builds the random feature method from hyperparameters. We use cosine activation functions and a Multivariate Normal distribution (from `Distributions.jl`) with mean M=0, and input covariance U built from cholesky factorization or diagonal components based on the diagonalize_input flag. +Builds the random feature method from hyperparameters. We use cosine activation functions and a Multivariate Normal distribution (from `Distributions.jl`) with mean M=0, and input covariance U built with the `CovarianceStructureType`. """ function build_models!( srfi::ScalarRandomFeatureInterface, @@ -268,6 +296,10 @@ function build_models!( input_dim = size(input_values, 1) + kernel_structure = get_kernel_structure(srfi) + n_hp = calculate_n_hyperparameters(input_dim, kernel_structure) + + rfms = get_rfms(srfi) fitted_features = get_fitted_features(srfi) n_features = get_n_features(srfi) @@ -276,16 +308,19 @@ function build_models!( decomp_type = get_feature_decomposition(srfi) optimizer_options = get_optimizer_options(srfi) - #regularization = I = 1.0 in scalar case - regularization = I + # Optimize features with EKP for each output dim # [1.] Split data into test/train 80/20 train_fraction = optimizer_options["train_fraction"] n_train = Int(floor(train_fraction * n_data)) n_test = n_data - n_train - n_features_opt = min(n_features, Int(floor(2 * n_test))) #we take a specified number of features for optimization. + n_features_opt = min(n_features, Int(floor(10 * n_test)))#Int(floor(2 * n_test))) #we take a specified number of features for optimization. 
idx_shuffle = randperm(rng, n_data) + #regularization = I = 1.0 in scalar case + regularization = I + + @info ( "hyperparameter learning for $n_rfms models using $n_train training points, $n_test validation points and $n_features_opt features" ) @@ -311,11 +346,22 @@ function build_models!( end # [2.] Estimate covariance at mean value prior = optimizer_options["prior"] + + # where prior space has changed we need to rebuild the priors + if ndims(prior) > n_hp + + # comes from having a truncated output_dimension + # TODO not really a truncation here, resetting to default + @info "Original input space of dimension $(get_input_dim(srfi)) has been truncated to $(input_dim). \n Rebuilding prior... number of hyperparameters reduced from $(ndims(prior)) to $(n_hp)." + prior = build_default_prior(input_dim, kernel_structure) + + end + μ_hp = transform_unconstrained_to_constrained(prior, mean(prior)) cov_sample_multiplier = optimizer_options["cov_sample_multiplier"] n_cov_samples_min = n_test + 2 - n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 1.0))) + n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 0.0))) internal_Γ, approx_σ2 = estimate_mean_and_coeffnorm_covariance( srfi, @@ -331,9 +377,8 @@ function build_models!( decomp_type, multithread_type, ) - Γ = internal_Γ - Γ[1:n_test, 1:n_test] += approx_σ2 + Γ[1:n_test, 1:n_test] += regularization # + approx_σ2 Γ[(n_test + 1):end, (n_test + 1):end] += I # [3.] set up EKP optimization @@ -343,7 +388,8 @@ function build_models!( scheduler = optimizer_options["scheduler"] initial_params = construct_initial_ensemble(rng, prior, n_ensemble) - min_complexity = log(det(regularization)) + min_complexity = log(regularization.λ) + min_complexity = sqrt(abs(min_complexity)) data = vcat(get_outputs(io_pairs_opt)[(n_train + 1):end], 0.0, min_complexity) ekiobj = EKP.EnsembleKalmanProcess( initial_params, @@ -415,6 +461,7 @@ function build_models!( io_pairs_i = PairedDataContainer(input_values, reshape(output_values[i, :], 1, size(output_values, 2))) # Now, fit new RF model with the optimized hyperparameters + rfm_i = RFM_from_hyperparameters( srfi, rng, diff --git a/src/VectorRandomFeature.jl b/src/VectorRandomFeature.jl index bbc7807c5..5326d1678 100644 --- a/src/VectorRandomFeature.jl +++ b/src/VectorRandomFeature.jl @@ -8,8 +8,7 @@ export get_rfms, get_input_dim, get_output_dim, get_rng, - get_diagonalize_input, - get_diagonalize_output, + get_kernel_structure, get_optimizer_options """ @@ -21,7 +20,8 @@ Structure holding the Vector Random Feature models. $(DocStringExtensions.TYPEDFIELDS) """ -struct VectorRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG} <: RandomFeatureInterface +struct VectorRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} <: + RandomFeatureInterface "A vector of `RandomFeatureMethod`s, contains the feature structure, batch-sizes and regularization" rfms::Vector{RF.Methods.RandomFeatureMethod} "vector of `Fit`s, containing the matrix decomposition and coefficients of RF when fitted to data" @@ -38,10 +38,8 @@ struct VectorRandomFeatureInterface{S <: AbstractString, RNG <: AbstractRNG} <: rng::RNG "regularization" regularization::Vector{Union{Matrix, UniformScaling, Diagonal}} - "Assume inputs are decorrelated for ML" - diagonalize_input::Bool - "Assume outputs are decorrelated for ML. Note: emulator(..., decorrelate=true) by default" - diagonalize_output::Bool + "Kernel structure type (e.g. 
Separable or Nonseparable)" + kernel_structure::KST "Random Feature decomposition, choose from \"svd\" or \"cholesky\" (default)" feature_decomposition::S "dictionary of options for hyperparameter optimizer" @@ -107,16 +105,9 @@ get_regularization(vrfi::VectorRandomFeatureInterface) = vrfi.regularization """ $(DocStringExtensions.TYPEDSIGNATURES) -Gets the diagonalize_input field +Gets the kernel_structure field """ -get_diagonalize_input(vrfi::VectorRandomFeatureInterface) = vrfi.diagonalize_input - -""" -$(DocStringExtensions.TYPEDSIGNATURES) - -Gets the diagonalize_output field -""" -get_diagonalize_output(vrfi::VectorRandomFeatureInterface) = vrfi.diagonalize_output +get_kernel_structure(vrfi::VectorRandomFeatureInterface) = vrfi.kernel_structure """ $(DocStringExtensions.TYPEDSIGNATURES) @@ -139,8 +130,7 @@ Constructs a `VectorRandomFeatureInterface <: MachineLearningTool` interface for - `n_features` - the number of random features - `input_dim` - the dimension of the input space - `output_dim` - the dimension of the output space - - `diagonalize_input = false` - whether to assume independence in input space - - `diagonalize_output::Bool = false` - whether to assume independence in output space (even if set true, this is not the same as a list of scalar-valued models, as hyperparameters will be shared between models in this case) + - `kernel_structure` - - a prescribed form of kernel structure - `batch_sizes = nothing` - Dictionary of batch sizes passed `RandomFeatures.jl` object (see definition there) - `rng = Random.GLOBAL_RNG` - random number generator - `feature_decomposition = "cholesky"` - choice of how to store decompositions of random features, `cholesky` or `svd` available @@ -150,7 +140,7 @@ Constructs a `VectorRandomFeatureInterface <: MachineLearningTool` interface for - "n_ensemble": number of ensemble members - "n_iteration": number of eki iterations - "scheduler": Learning rate Scheduler (a.k.a. EKP timestepper) Default: DataMisfitController - - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 1.0) + - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 0.0) - "tikhonov": tikhonov regularization parameter if > 0 - "inflation": additive inflation ∈ [0,1] with 0 being no inflation - "train_fraction": e.g. 0.8 (default) means 80:20 train - test split @@ -162,35 +152,28 @@ function VectorRandomFeatureInterface( n_features::Int, input_dim::Int, output_dim::Int; - diagonalize_input::Bool = false, - diagonalize_output::Bool = false, + kernel_structure::Union{KST, Nothing} = nothing, batch_sizes::Union{Dict{S, Int}, Nothing} = nothing, rng::RNG = Random.GLOBAL_RNG, feature_decomposition::S = "cholesky", optimizer_options::Union{Dict{S}, Nothing} = nothing, -) where {S <: AbstractString, RNG <: AbstractRNG} +) where {S <: AbstractString, RNG <: AbstractRNG, KST <: KernelStructureType} # Initialize vector for RF model rfms = Vector{RF.Methods.RandomFeatureMethod}(undef, 0) fitted_features = Vector{RF.Methods.Fit}(undef, 0) regularization = Vector{Union{Matrix, UniformScaling, Nothing}}(undef, 0) - #Optimization Defaults - if !isnothing(optimizer_options) - prior_in_scale = "prior_in_scale" ∈ keys(optimizer_options) ? optimizer_options["prior_in_scale"] : 1.0 - prior_out_scale = "prior_out_scale" ∈ keys(optimizer_options) ? 
optimizer_options["prior_out_scale"] : 1.0 - else - prior_in_scale = 1.0 - prior_out_scale = 1.0 + if isnothing(kernel_structure) + kernel_structure = SeparableKernel( + cov_structure_from_string("lowrank", input_dim), + cov_structure_from_string("lowrank", output_dim), + ) end - prior = build_default_prior( - input_dim, - output_dim, - diagonalize_input, - diagonalize_output, - in_scale = prior_in_scale, - out_scale = prior_out_scale, - ) + KSType = typeof(kernel_structure) + prior = build_default_prior(input_dim, output_dim, kernel_structure) + + #Optimization Defaults optimizer_opts = Dict( "prior" => prior, #the hyperparameter_prior (note scalings have already been applied) "n_ensemble" => max(ndims(prior) + 1, 10), #number of ensemble @@ -202,6 +185,7 @@ function VectorRandomFeatureInterface( "train_fraction" => 0.8, # 80:20 train - test split "multithread" => "ensemble", # instead of "tullio" "verbose" => false, # verbose optimizer statements + "localization" => EKP.Localizers.NoLocalization(), # localization / sample error correction for small ensembles ) if !isnothing(optimizer_options) @@ -218,7 +202,7 @@ function VectorRandomFeatureInterface( end @info("hyperparameter optimization with EKI configured with $opt_tmp") - return VectorRandomFeatureInterface{S, RNG}( + return VectorRandomFeatureInterface{S, RNG, KSType}( rfms, fitted_features, batch_sizes, @@ -227,17 +211,72 @@ function VectorRandomFeatureInterface( output_dim, rng, regularization, - diagonalize_input, - diagonalize_output, + kernel_structure, feature_decomposition, optimizer_opts, ) end + + +function hyperparameter_distribution_from_flat( + x::VV, + input_dim::Int, + output_dim::Int, + kernel_structure::SK, +) where {VV <: AbstractVector, SK <: SeparableKernel} + M = zeros(input_dim, output_dim) # n x p mean + + U, V = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) + + if !isposdef(U) + println("U not posdef - correcting") + U = posdef_correct(U) + end + if !isposdef(V) + println("V not posdef - correcting") + V = posdef_correct(V) + end + + dist = MatrixNormal(M, U, V) + pd = ParameterDistribution( + Dict( + "distribution" => Parameterized(dist), + "constraint" => repeat([no_constraint()], input_dim * output_dim), + "name" => "xi", + ), + ) + return pd +end + +function hyperparameter_distribution_from_flat( + x::VV, + input_dim::Int, + output_dim::Int, + kernel_structure::NK, +) where {VV <: AbstractVector, NK <: NonseparableKernel} + + C = hyperparameters_from_flat(x, input_dim, output_dim, kernel_structure) + if !isposdef(C) + println("C not posdef - correcting") + C = posdef_correct(C) + end + dist = MvNormal(zeros(input_dim * output_dim), C) + pd = ParameterDistribution( + Dict( + "distribution" => Parameterized(dist), + "constraint" => repeat([no_constraint()], input_dim * output_dim), + "name" => "xi", + ), + ) + return pd +end + + """ $(DocStringExtensions.TYPEDSIGNATURES) -Builds the random feature method from hyperparameters. We use cosine activation functions and a Matrixvariate Normal distribution with mean M=0, and input(output) covariance U(V) built from cholesky factorization or diagonal components based on the diagonalize_input(diagonalize_output) flag. +Builds the random feature method from hyperparameters. We use cosine activation functions and a Matrixvariate Normal distribution with mean M=0, and input(output) covariance U(V) built with a `CovarianceStructureType`. 
""" function RFM_from_hyperparameters( vrfi::VectorRandomFeatureInterface, @@ -256,31 +295,22 @@ function RFM_from_hyperparameters( MorUSorD <: Union{Matrix, UniformScaling, Diagonal}, MT <: MultithreadType, } - diagonalize_input = get_diagonalize_input(vrfi) - diagonalize_output = get_diagonalize_output(vrfi) - M = zeros(input_dim, output_dim) # n x p mean - U, V = hyperparameters_from_flat(l, input_dim, output_dim, diagonalize_input, diagonalize_output) + xi_hp = l[1:(end - 1)] + sigma_hp = l[end] - if !isposdef(U) - println("U not posdef - correcting") - U = posdef_correct(U) - end - if !isposdef(V) - println("V not posdef - correcting") - V = posdef_correct(V) - end + kernel_structure = get_kernel_structure(vrfi) + pd = hyperparameter_distribution_from_flat(xi_hp, input_dim, output_dim, kernel_structure) - dist = MatrixNormal(M, U, V) - pd = ParameterDistribution( - Dict( - "distribution" => Parameterized(dist), - "constraint" => repeat([no_constraint()], input_dim * output_dim), - "name" => "xi", - ), - ) feature_sampler = RF.Samplers.FeatureSampler(pd, output_dim, rng = rng) - vff = RF.Features.VectorFourierFeature(n_features, output_dim, feature_sampler) + + feature_parameters = Dict("sigma" => sigma_hp) + vff = RF.Features.VectorFourierFeature( + n_features, + output_dim, + feature_sampler, + feature_parameters = feature_parameters, + ) thread_opt = isa(multithread_type, TullioThreading) # if we want to multithread with tullio if isnothing(batch_sizes) @@ -314,10 +344,10 @@ function build_models!( input_dim = size(input_values, 1) output_dim = size(output_values, 1) - # there are less hyperparameters when we diagonalize - diagonalize_input = get_diagonalize_input(vrfi) - diagonalize_output = get_diagonalize_output(vrfi) - n_hp = calculate_n_hyperparameters(input_dim, output_dim, diagonalize_input, diagonalize_output) + + kernel_structure = get_kernel_structure(vrfi) + + n_hp = calculate_n_hyperparameters(input_dim, output_dim, kernel_structure) rfms = get_rfms(vrfi) fitted_features = get_fitted_features(vrfi) @@ -340,12 +370,13 @@ function build_models!( prior = optimizer_options["prior"] rng = get_rng(vrfi) + # where prior space has changed we need to rebuild the priors [TODO just build them here in the first place?] if ndims(prior) > n_hp + # comes from having a truncated output_dimension # TODO not really a truncation here, resetting to default - @info "truncating hyperparameter space to default in first $n_hp dimensions, due to truncation of output space" - prior = constrained_gaussian("cholesky_factors", 1.0, 1.0, 0.0, Inf, repeats = n_hp) - + @info "Original input-output space of dimension ($(get_input_dim(vrfi)), $(get_output_dim(vrfi))) has been truncated to ($(input_dim), $(output_dim)). \n Rebuilding prior... number of hyperparameters reduced from $(ndims(prior)) to $(n_hp)." + prior = build_default_prior(input_dim, output_dim, kernel_structure) end # Optimize feature cholesky factors with EKP @@ -353,12 +384,20 @@ function build_models!( train_fraction = optimizer_options["train_fraction"] n_train = Int(floor(train_fraction * n_data)) # 20% split n_test = n_data - n_train - if diagonalize_output - n_features_opt = min(n_features, Int(floor(4 * n_train))) #we take a specified number of features for optimization. 
MAGIC NUMBER + + # there are less hyperparameters when we diagonalize + if nameof(typeof(kernel_structure)) == :SeparableKernel + if nameof(typeof(get_output_cov_structure(kernel_structure))) == :DiagonalFactor + n_features_opt = min(n_features, Int(floor(4 * n_train))) #we take a specified number of features for optimization. MAGIC NUMBER + else + # note the n_features_opt should NOT exceed output_dim * n_train or the regularization is replaced with a diagonal approx. + n_features_opt = max(min(n_features, Int(floor(4 * sqrt(n_train * output_dim)))), Int(floor(1.9 * n_train)))#we take a specified number of features for optimization. MAGIC NUMBER + end else # note the n_features_opt should NOT exceed output_dim * n_train or the regularization is replaced with a diagonal approx. n_features_opt = max(min(n_features, Int(floor(4 * sqrt(n_train * output_dim)))), Int(floor(1.9 * n_train)))#we take a specified number of features for optimization. MAGIC NUMBER end + @info ( "hyperparameter learning using $n_train training points, $n_test validation points and $n_features_opt features" ) @@ -368,10 +407,11 @@ function build_models!( # in this setting, noise = I if regularization_matrix === nothing regularization = I + else reg_mat = regularization_matrix if !isposdef(regularization_matrix) - reg_mat = posdef_correct(regularization_matrix) + regularization = posdef_correct(regularization_matrix) println("RF regularization matrix is not positive definite, correcting") else @@ -382,7 +422,6 @@ function build_models!( end end - idx_shuffle = randperm(rng, n_data) io_pairs_opt = PairedDataContainer( @@ -394,12 +433,16 @@ function build_models!( μ_hp = transform_unconstrained_to_constrained(prior, mean(prior)) cov_sample_multiplier = optimizer_options["cov_sample_multiplier"] - if diagonalize_output - n_cov_samples_min = n_test + 2 # diagonal case + if nameof(typeof(kernel_structure)) == :SeparableKernel + if nameof(typeof(get_output_cov_structure(kernel_structure))) == :DiagonalFactor + n_cov_samples_min = n_test + 2 # diagonal case + else + n_cov_samples_min = (n_test * output_dim + 2) + end else - n_cov_samples_min = (n_test * output_dim + 2) # min required for + n_cov_samples_min = (n_test * output_dim + 2) end - n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 1.0))) + n_cov_samples = Int(floor(n_cov_samples_min * max(cov_sample_multiplier, 0.0))) internal_Γ, approx_σ2 = estimate_mean_and_coeffnorm_covariance( vrfi, @@ -420,13 +463,15 @@ function build_models!( if tikhonov_opt_val == 0 # Build the covariance Γ = internal_Γ - Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += approx_σ2 + Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += regularization # + approx_σ2 Γ[(n_test * output_dim + 1):end, (n_test * output_dim + 1):end] += I #in diag case we have data logdet = λ^m, in non diag case we have logdet(Λ^) to match the different reg matrices. min_complexity = isa(regularization, UniformScaling) ? 
n_features_opt * log(regularization.λ) : n_features_opt / output_dim * 2 * sum(log.(diag(cholesky(regularization).L))) + min_complexity = sqrt(abs(min_complexity)) + data = vcat(reshape(get_outputs(io_pairs_opt)[:, (n_train + 1):end], :, 1), 0.0, min_complexity) #flatten data @@ -435,7 +480,7 @@ function build_models!( outsize = size(internal_Γ, 1) Γ = zeros(outsize + n_hp, outsize + n_hp) Γ[1:outsize, 1:outsize] = internal_Γ - Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += approx_σ2 + Γ[1:(n_test * output_dim), 1:(n_test * output_dim)] += approx_σ2 + regularization Γ[(n_test * output_dim + 1):outsize, (n_test * output_dim + 1):outsize] += I Γ[(outsize + 1):end, (outsize + 1):end] = tikhonov_opt_val .* cov(prior) @@ -444,6 +489,7 @@ function build_models!( min_complexity = isa(regularization, UniformScaling) ? n_features_opt * log(regularization.λ) : n_features_opt / output_dim * 2 * sum(log.(diag(cholesky(regularization).L))) + min_complexity = sqrt(abs(min_complexity)) data = vcat( reshape(get_outputs(io_pairs_opt)[:, (n_train + 1):end], :, 1), @@ -464,6 +510,7 @@ function build_models!( n_iteration = optimizer_options["n_iteration"] opt_verbose_flag = optimizer_options["verbose"] scheduler = optimizer_options["scheduler"] + localization = optimizer_options["localization"] if !isposdef(Γ) Γ = posdef_correct(Γ) end @@ -478,6 +525,7 @@ function build_models!( scheduler = scheduler, rng = rng, verbose = opt_verbose_flag, + localization_method = localization, ) err = zeros(n_iteration) @@ -527,6 +575,7 @@ function build_models!( # [5.] extract optimal hyperparameters hp_optimal = get_ϕ_mean_final(prior, ekiobj)[:] + # Now, fit new RF model with the optimized hyperparameters if opt_verbose_flag names = get_name(prior) @@ -538,6 +587,7 @@ function build_models!( pcic = [(pci_constrained[1][i], pci_constrained[2][i]) for i in 1:length(pci_constrained[1])] pcic_batched = [pcic[b][1] for b in batch(prior)] @info("EKI Optimization result:") + println( display( [ @@ -547,6 +597,7 @@ function build_models!( ), ) end + rfm_tmp = RFM_from_hyperparameters( vrfi, rng, @@ -596,7 +647,8 @@ function predict(vrfi::VectorRandomFeatureInterface, new_inputs::M) where {M <: N_samples = size(new_inputs, 2) # Predicts columns of inputs: input_dim × N_samples - μ, σ2 = RF.Methods.predict(rfm, ff, DataContainer(new_inputs), tullio_threading = tullio_threading) + μ, σ2 = RF.Methods.predict(rfm, ff, DataContainer(new_inputs), tullio_threading = "tullio") + # μ, σ2 = RF.Methods.predict(rfm, ff, DataContainer(new_inputs), tullio_threading = tullio_threading) # sizes (output_dim x n_test), (output_dim x output_dim x n_test) # add the noise contribution from the regularization # note this is because we are predicting the data here, not the latent function. @@ -608,6 +660,5 @@ function predict(vrfi::VectorRandomFeatureInterface, new_inputs::M) where {M <: σ2[:, :, i] = posdef_correct(σ2[:, :, i]) end end - return μ, σ2 end diff --git a/test/RandomFeature/runtests.jl b/test/RandomFeature/runtests.jl index b5c04d0f9..3e5b1351c 100644 --- a/test/RandomFeature/runtests.jl +++ b/test/RandomFeature/runtests.jl @@ -14,40 +14,145 @@ rng = Random.MersenneTwister(seed) @testset "RandomFeatures" begin + @testset "hyperparameter prior interface" begin + + # [1. 
] CovarianceStructures + # Costruction + eps = 1e-4 + r = 3 + odf = OneDimFactor() + @test typeof(odf) <: CovarianceStructureType + df = DiagonalFactor(eps) + @test typeof(df) <: CovarianceStructureType + @test get_eps(df) == eps + @test get_eps(DiagonalFactor()) == Float64(1.0) + cf = CholeskyFactor(eps) + @test typeof(cf) <: CovarianceStructureType + @test get_eps(cf) == eps + @test get_eps(CholeskyFactor()) == Float64(1.0) + lrf = LowRankFactor(r, eps) + @test typeof(lrf) <: CovarianceStructureType + @test rank(lrf) == r + @test get_eps(lrf) == eps + @test get_eps(LowRankFactor(r)) == Float64(1.0) + hlrf = HierarchicalLowRankFactor(r, eps) + @test typeof(hlrf) <: CovarianceStructureType + @test rank(hlrf) == r + @test get_eps(hlrf) == eps + @test get_eps(HierarchicalLowRankFactor(r)) == Float64(1.0) + + # calculate_n_hyperparameters + d = 6 + @test calculate_n_hyperparameters(d, odf) == 0 + @test calculate_n_hyperparameters(d, df) == d + @test calculate_n_hyperparameters(d, cf) == Int(d * (d + 1) / 2) + @test calculate_n_hyperparameters(d, lrf) == Int(r + d * r) + @test calculate_n_hyperparameters(d, hlrf) == Int(r * (r + 1) / 2 + d * r) + + # hyperparameters_from_flat - only check shape + @test isnothing(hyperparameters_from_flat([1], odf)) + for factor in (df, cf, lrf, hlrf) + x = ones(calculate_n_hyperparameters(d, factor)) + @test size(hyperparameters_from_flat(x, factor)) == (d, d) + end + + # build_default_prior - only check size of distribution + name = "test_name" + @test isnothing(build_default_prior(name, 0, odf)) + for factor in (df, cf, lrf, hlrf) + n_hp = calculate_n_hyperparameters(d, factor) + prior = build_default_prior(name, n_hp, factor) + @test ndims(prior) == n_hp + end + + # [2. ] Kernel Structures + d = 6 + p = 3 + c_in = lrf + c_out = cf + k_sep = SeparableKernel(c_in, c_out) + @test get_input_cov_structure(k_sep) == c_in + @test get_output_cov_structure(k_sep) == c_out + + k_nonsep = NonseparableKernel(c_in) + @test get_cov_structure(k_nonsep) == c_in + + + # calculate_n_hyperparameters + @test calculate_n_hyperparameters(d, p, k_sep) == + calculate_n_hyperparameters(d, c_in) + calculate_n_hyperparameters(p, c_out) + 1 + @test calculate_n_hyperparameters(d, p, k_nonsep) == calculate_n_hyperparameters(d * p, c_in) + 1 + k_sep1d = SeparableKernel(odf, odf) + k_nonsep1d = NonseparableKernel(odf) + @test calculate_n_hyperparameters(1, 1, k_sep1d) == 2 + @test calculate_n_hyperparameters(1, 1, k_nonsep1d) == 2 + + # hyper_parameters_from_flat: not applied with scaling hyperparameter + x = ones(calculate_n_hyperparameters(d, c_in) + calculate_n_hyperparameters(p, c_out)) + @test size(hyperparameters_from_flat(x, d, p, k_sep)[1]) == (d, d) + @test size(hyperparameters_from_flat(x, d, p, k_sep)[2]) == (p, p) + + x = ones(calculate_n_hyperparameters(d * p, c_in)) + @test size(hyperparameters_from_flat(x, d, p, k_nonsep)) == (d * p, d * p) + + x = [1] # in the 1D case, return(U,V) = (1x1 matrix, nothing) + @test size(hyperparameters_from_flat(x, 1, 1, k_sep1d)[1]) == (1, 1) + @test isnothing(hyperparameters_from_flat(x, 1, 1, k_sep1d)[2]) + @test size(hyperparameters_from_flat(x, 1, 1, k_nonsep1d)) == (1, 1) + + # build_default_prior + @test ndims(build_default_prior(d, p, k_sep)) == + ndims(build_default_prior("input", calculate_n_hyperparameters(d, c_in), c_in)) + + ndims(build_default_prior("output", calculate_n_hyperparameters(p, c_out), c_out)) + + 1 + @test ndims(build_default_prior(d, p, k_nonsep)) == + ndims(build_default_prior("full", calculate_n_hyperparameters(d 
+
+        # hyperparameters_from_flat: the scaling hyperparameter is not included here
+        x = ones(calculate_n_hyperparameters(d, c_in) + calculate_n_hyperparameters(p, c_out))
+        @test size(hyperparameters_from_flat(x, d, p, k_sep)[1]) == (d, d)
+        @test size(hyperparameters_from_flat(x, d, p, k_sep)[2]) == (p, p)
+
+        x = ones(calculate_n_hyperparameters(d * p, c_in))
+        @test size(hyperparameters_from_flat(x, d, p, k_nonsep)) == (d * p, d * p)
+
+        x = [1] # in the 1D case, return (U, V) = (1x1 matrix, nothing)
+        @test size(hyperparameters_from_flat(x, 1, 1, k_sep1d)[1]) == (1, 1)
+        @test isnothing(hyperparameters_from_flat(x, 1, 1, k_sep1d)[2])
+        @test size(hyperparameters_from_flat(x, 1, 1, k_nonsep1d)) == (1, 1)
+
+        # build_default_prior
+        @test ndims(build_default_prior(d, p, k_sep)) ==
+              ndims(build_default_prior("input", calculate_n_hyperparameters(d, c_in), c_in)) +
+              ndims(build_default_prior("output", calculate_n_hyperparameters(p, c_out), c_out)) +
+              1
+        @test ndims(build_default_prior(d, p, k_nonsep)) ==
+              ndims(build_default_prior("full", calculate_n_hyperparameters(d * p, c_in), c_in)) + 1
+
+        @test ndims(build_default_prior(1, 1, k_sep1d)) == 2
+        @test ndims(build_default_prior(1, 1, k_nonsep1d)) == 2
+
+
+        # test shrinkage utility
+        samples = rand(MvNormal(zeros(100), I), 20)
+        # the raw sample covariance condition number should be huge, around 10^18
+        # the shrinkage covariance will have condition number around 1 and be close to I
+        good_cov = shrinkage_cov(samples)
+        @test (cond(good_cov) < 1.1) && ((good_cov[1] < 1.2) && (good_cov[1] > 0.8))
+
+
+    end
+
     @testset "ScalarRandomFeatureInterface" begin
         input_dim = 2
         n_features = 200
         batch_sizes = Dict("train" => 100, "test" => 100, "feature" => 100)
-        diagonalize_input = false
         #build interface
-        n_input_hp = Int(input_dim * (input_dim + 1) / 2) + 2 # see calculate_n_hyperparameters for details
-        n_output_hp = 1
+
         # prior built from:
-        # input: γ₁*(Cholesky_factor + γ₂ * I )
-        # output: γ₃
-        # where γᵢ positive
-        prior = build_default_prior(input_dim, diagonalize_input)
+        eps = 1e-8
+        kernel_structure = SeparableKernel(CholeskyFactor(eps), OneDimFactor()) # Cholesky factorized input, 1D output
+        prior = build_default_prior(input_dim, kernel_structure)
         optimizer_options = Dict(
             "prior" => prior,
             "n_ensemble" => max(ndims(prior) + 1, 10),
             "n_iteration" => 5,
+            "scheduler" => DataMisfitController(),
             "cov_sample_multiplier" => 10.0,
             "inflation" => 1e-4,
             "train_fraction" => 0.8,
             "multithread" => "ensemble",
             "verbose" => false,
-            "scheduler" => DataMisfitController(),
         )
         srfi = ScalarRandomFeatureInterface(
             n_features,
             input_dim,
+            kernel_structure = kernel_structure,
             batch_sizes = batch_sizes,
             rng = rng,
-            diagonalize_input = diagonalize_input,
             optimizer_options = optimizer_options,
         )
@@ -57,17 +162,19 @@ rng = Random.MersenneTwister(seed)
         @test get_n_features(srfi) == n_features
         @test get_input_dim(srfi) == input_dim
         @test get_rng(srfi) == rng
-        @test get_diagonalize_input(srfi) == diagonalize_input
+        @test get_kernel_structure(srfi) == kernel_structure
         @test get_optimizer_options(srfi) == optimizer_options
         # check defaults
         srfi2 = ScalarRandomFeatureInterface(n_features, input_dim)
         @test get_batch_sizes(srfi2) === nothing
         @test get_rng(srfi2) == Random.GLOBAL_RNG
-        @test get_diagonalize_input(srfi2) == false
-        # currently the "scheduler" doesn't always satisfy X() = X(), bug so we need to remove this for now
+        @test get_kernel_structure(srfi2) ==
+              SeparableKernel(cov_structure_from_string("lowrank", input_dim), OneDimFactor())
+
+        # currently the "scheduler" doesn't always satisfy X() = X(), bug so we need to remove this for now
         for key in keys(optimizer_options)
-            if !(key == "scheduler")
+            if !(key ∈ ["scheduler", "prior", "n_ensemble"])
                 @test get_optimizer_options(srfi2)[key] == optimizer_options[key] # we just set the defaults above
             end
         end
@@ -81,29 +188,25 @@ rng = Random.MersenneTwister(seed)
         output_dim = 3
         n_features = 200
         batch_sizes = Dict("train" => 100, "test" => 100, "feature" => 100)
-        diagonalize_input = true
-        diagonalize_output = false
-
-        n_input_hp = Int(input_dim) + 2
-        n_output_hp = Int(output_dim * (output_dim + 1) / 2) + 2
         # prior built from:
-        # input: γ₁*(Cholesky_factor_in + γ₂ * I)
-        # output: γ₃*(Cholesky_factor_out + γ₄ * I)
-        # where γᵢ positive
-        prior = build_default_prior(input_dim, output_dim, diagonalize_input, diagonalize_output)
+        eps = 1e-8
+        kernel_structure = SeparableKernel(DiagonalFactor(eps), CholeskyFactor(eps)) # Diagonal input, Cholesky factorized output
+        prior = build_default_prior(input_dim, output_dim, kernel_structure)
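+        # the default prior has one component per kernel hyperparameter, here:
+        # input_dim (DiagonalFactor input) + output_dim * (output_dim + 1) / 2 (CholeskyFactor output) + 1 (scaling)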
+
         optimizer_options = Dict(
             "prior" => prior,
             "n_ensemble" => max(ndims(prior) + 1, 10),
             "n_iteration" => 5,
-            "tikhonov" => 0,
+            "scheduler" => DataMisfitController(),
             "cov_sample_multiplier" => 10.0,
+            "tikhonov" => 0,
             "inflation" => 1e-4,
             "train_fraction" => 0.8,
             "multithread" => "ensemble",
             "verbose" => false,
-            "scheduler" => DataMisfitController(),
+            "localization" => EnsembleKalmanProcesses.Localizers.NoLocalization(),
         )
         #build interfaces
@@ -111,10 +214,9 @@ rng = Random.MersenneTwister(seed)
             n_features,
             input_dim,
             output_dim,
+            kernel_structure = kernel_structure,
             rng = rng,
             batch_sizes = batch_sizes,
-            diagonalize_input = diagonalize_input,
-            diagonalize_output = diagonalize_output,
             optimizer_options = optimizer_options,
         )
@@ -124,21 +226,22 @@ rng = Random.MersenneTwister(seed)
         @test get_n_features(vrfi) == n_features
         @test get_input_dim(vrfi) == input_dim
         @test get_output_dim(vrfi) == output_dim
+        @test get_kernel_structure(vrfi) == kernel_structure
         @test get_rng(vrfi) == rng
-        @test get_diagonalize_input(vrfi) == diagonalize_input
-        @test get_diagonalize_output(vrfi) == diagonalize_output
         @test get_optimizer_options(vrfi) == optimizer_options
-        #check defaults
-        vrfi2 = VectorRandomFeatureInterface(n_features, input_dim, output_dim, diagonalize_input = diagonalize_input)
+        vrfi2 = VectorRandomFeatureInterface(n_features, input_dim, output_dim)
         @test get_batch_sizes(vrfi2) === nothing
         @test get_rng(vrfi2) == Random.GLOBAL_RNG
-        @test get_diagonalize_input(vrfi2) == true
-        @test get_diagonalize_output(vrfi2) == false
+        @test get_kernel_structure(vrfi2) == SeparableKernel(
+            cov_structure_from_string("lowrank", input_dim),
+            cov_structure_from_string("lowrank", output_dim),
+        )
+
         for key in keys(optimizer_options)
-            if !(key == "scheduler")
+            if !(key ∈ ["scheduler", "prior", "n_ensemble"])
                 @test get_optimizer_options(vrfi2)[key] == optimizer_options[key] # we just set the defaults above
             end
         end
@@ -162,18 +265,14 @@ rng = Random.MersenneTwister(seed)
         # RF parameters
         n_features = 100
-        diagonalize_input = true
+        eps = 1e-8
+        scalar_ks = SeparableKernel(DiagonalFactor(eps), OneDimFactor()) # Diagonalize input (ARD-type kernel)
+        vector_ks = SeparableKernel(DiagonalFactor(eps), CholeskyFactor()) # Diagonalize input (ARD-type kernel)
         # Scalar RF options to mimic squared-exp ARD kernel
-        srfi = ScalarRandomFeatureInterface(n_features, input_dim, diagonalize_input = diagonalize_input, rng = rng)
+        srfi = ScalarRandomFeatureInterface(n_features, input_dim, kernel_structure = scalar_ks, rng = rng)
         # Vector RF options to mimic squared-exp ARD kernel (in 1D)
-        vrfi = VectorRandomFeatureInterface(
-            n_features,
-            input_dim,
-            output_dim,
-            diagonalize_input = diagonalize_input,
-            rng = rng,
-        )
+        vrfi = VectorRandomFeatureInterface(n_features, input_dim, output_dim, kernel_structure = vector_ks, rng = rng)
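+        # a DiagonalFactor on the input gives one positive hyperparameter per input dimension,
+        # playing the role of the lengthscales in the squared-exp ARD kernel being mimicked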
         # build emulators
         em_srfi = Emulator(srfi, iopairs, obs_noise_cov = obs_noise_cov)
@@ -231,28 +330,35 @@ rng = Random.MersenneTwister(seed)
         # 1) scalar + diag in
         # 2) scalar
         # 3) vector + diag out
-        # 4) vector
-
-        srfi_diagin = ScalarRandomFeatureInterface(n_features, input_dim, diagonalize_input = true, rng = rng)
-        srfi = ScalarRandomFeatureInterface(n_features, input_dim, diagonalize_input = false, rng = rng)
+        # 4) vector
+        # 5) vector nonseparable
+        eps = 1e-8
+        r = 1
+        scalar_diagin_ks = SeparableKernel(DiagonalFactor(eps), OneDimFactor())
+        scalar_ks = SeparableKernel(CholeskyFactor(eps), OneDimFactor())
+        vector_diagout_ks = SeparableKernel(CholeskyFactor(eps), DiagonalFactor(eps))
+        vector_ks = SeparableKernel(LowRankFactor(r, eps), HierarchicalLowRankFactor(r, eps))
+        vector_nonsep_ks = NonseparableKernel(LowRankFactor(r + 1, eps))
+
+        srfi_diagin =
+            ScalarRandomFeatureInterface(n_features, input_dim, kernel_structure = scalar_diagin_ks, rng = rng)
+        srfi = ScalarRandomFeatureInterface(n_features, input_dim, kernel_structure = scalar_ks, rng = rng)
         vrfi_diagout = VectorRandomFeatureInterface(
             n_features,
             input_dim,
             output_dim,
-            diagonalize_input = false,
-            diagonalize_output = true,
-            optimizer_options = Dict("train_fraction" => 0.7),
+            kernel_structure = vector_diagout_ks,
             rng = rng,
         )
-        vrfi = VectorRandomFeatureInterface(
+        vrfi = VectorRandomFeatureInterface(n_features, input_dim, output_dim, kernel_structure = vector_ks, rng = rng)
+
+        vrfi_nonsep = VectorRandomFeatureInterface(
             n_features,
             input_dim,
             output_dim,
-            diagonalize_input = false,
-            diagonalize_output = false,
-            optimizer_options = Dict("train_fraction" => 0.7),
+            kernel_structure = vector_nonsep_ks,
             rng = rng,
         )
@@ -266,6 +372,7 @@ rng = Random.MersenneTwister(seed)
         em_vrfi_svd = Emulator(vrfi, iopairs, obs_noise_cov = Σ)
         em_vrfi = Emulator(vrfi, iopairs, obs_noise_cov = Σ, decorrelate = false)
+        em_vrfi_nonsep = Emulator(vrfi_nonsep, iopairs, obs_noise_cov = Σ, decorrelate = false)
         #TODO truncated SVD option for vector (involves resizing prior)
@@ -281,21 +388,22 @@ rng = Random.MersenneTwister(seed)
         μ_vsd, σ²_vsd = Emulators.predict(em_vrfi_svd_diagout, new_inputs, transform_to_real = true)
         μ_vs, σ²_vs = Emulators.predict(em_vrfi_svd, new_inputs, transform_to_real = true)
         μ_v, σ²_v = Emulators.predict(em_vrfi, new_inputs)
+        μ_vns, σ²_vns = Emulators.predict(em_vrfi_nonsep, new_inputs)
         tol_μ = 0.1 * ntest * output_dim
         @test isapprox.(norm(μ_ssd - outx), 0, atol = tol_μ)
         @test isapprox.(norm(μ_ss - outx), 0, atol = tol_μ)
         @test isapprox.(norm(μ_vsd - outx), 0, atol = tol_μ)
         @test isapprox.(norm(μ_vs - outx), 0, atol = tol_μ)
-        @test isapprox.(norm(μ_v - outx), 0, atol = 5 * tol_μ) # a more approximate option so likely less good approx
+        @test isapprox.(norm(μ_v - outx), 0, atol = 5 * tol_μ) # a more approximate option, so expect a looser fit
+        @test isapprox.(norm(μ_vns - outx), 0, atol = 5 * tol_μ) # a more approximate option, so expect a looser fit
         # An example with the other threading option
         vrfi_tul = VectorRandomFeatureInterface(
             n_features,
             input_dim,
             output_dim,
-            diagonalize_input = false,
-            diagonalize_output = false,
+            kernel_structure = vector_ks,
             optimizer_options = Dict("train_fraction" => 0.7, "multithread" => "tullio"),
             rng = rng,
         )
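+        # note: vrfi_tul reuses the low-rank separable kernel (vector_ks) from above and only
+        # changes the optimizer options ("train_fraction" => 0.7, "multithread" => "tullio")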