From 49e3b806b54cdd757c9b0b6dc11ba748d1b7049f Mon Sep 17 00:00:00 2001
From: "David A. van Leeuwen"
Date: Sun, 3 Jan 2021 17:18:09 +0100
Subject: [PATCH] Control logging in kmeans via Logging level, addressing #84
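
The k-means initialization in GMMk() used to call Clustering.kmeans()
with display=:iter unconditionally. It now derives the display level
from the minimum enabled level of the global logger: Debug gives :iter,
Info gives :final, and anything quieter gives :none. The [compat]
entries in Project.toml are also sorted alphabetically, and implicit
return values in train.jl are made explicit.

A sketch of the intended use, assuming the default stdlib ConsoleLogger
(the data x and the choice of 8 Gaussians are made up for the example):

    using GaussianMixtures, Logging

    x = randn(10_000, 3)
    g = GMM(8, x)     # Info (default): kmeans reports only the final result

    global_logger(ConsoleLogger(stderr, Logging.Debug))
    g = GMM(8, x)     # Debug: kmeans reports every iteration

    global_logger(ConsoleLogger(stderr, Logging.Warn))
    g = GMM(8, x)     # Warn or quieter: kmeans reports nothing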
---
 Project.toml |  16 +++---
 src/train.jl | 126 +++++++++++++++++++++++++--------------
 2 files changed, 71 insertions(+), 71 deletions(-)

diff --git a/Project.toml b/Project.toml
index 285c842..4d9a656 100644
--- a/Project.toml
+++ b/Project.toml
@@ -24,15 +24,15 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [compat]
-julia = "1"
-PDMats = "0.10"
-StatsBase = "0.33"
-SpecialFunctions = "0.8, 0.9, 0.10, 1"
-FileIO = "1.2"
+Arpack = "0.3, 0.4"
 Clustering = "0.14"
-RDatasets = "0.6, 0.7"
-ScikitLearnBase = "0.5"
 Compat = "3.6"
 Distributions = "0.23, 0.24"
-Arpack = "0.3, 0.4"
+FileIO = "1.2"
 JLD2 = "0.2, 0.3"
+PDMats = "0.10"
+RDatasets = "0.6, 0.7"
+ScikitLearnBase = "0.5"
+SpecialFunctions = "0.8, 0.9, 0.10, 1"
+StatsBase = "0.33"
+julia = "1"

diff --git a/src/train.jl b/src/train.jl
index b865600..b16b759 100644
--- a/src/train.jl
+++ b/src/train.jl
@@ -21,7 +21,7 @@ function GMM(x::DataOrMatrix{T}; kind=:diag) where T <: AbstractFloat
     end
     hist = History(@sprintf("Initialized single Gaussian d=%d kind=%s with %d data points",
                             d, kind, n))
-    GMM(ones(T,1), μ, Σ, [hist], n)
+    return GMM(ones(T,1), μ, Σ, [hist], n)
 end
 ## Also allow a Vector, :full makes no sense
 GMM(x::Vector{T}) where T <: AbstractFloat = GMM(reshape(x, length(x), 1)) # strange idiom...
@@ -30,11 +30,11 @@ GMM(x::Vector{T}) where T <: AbstractFloat = GMM(reshape(x, length(x), 1)) # st
 function GMM(n::Int, x::DataOrMatrix{T}; method::Symbol=:kmeans, kind=:diag,
              nInit::Int=50, nIter::Int=10, nFinal::Int=nIter, sparse=0) where T <: AbstractFloat
     if n < 2
-        GMM(x, kind=kind)
-    elseif method==:split
-        GMM2(n, x, kind=kind, nIter=nIter, nFinal=nFinal, sparse=sparse)
-    elseif method==:kmeans
-        GMMk(n, x, kind=kind, nInit=nInit, nIter=nIter, sparse=sparse)
+        return GMM(x, kind=kind)
+    elseif method == :split
+        return GMM2(n, x, kind=kind, nIter=nIter, nFinal=nFinal, sparse=sparse)
+    elseif method == :kmeans
+        return GMMk(n, x, kind=kind, nInit=nInit, nIter=nIter, sparse=sparse)
     else
         error("Unknown method ", method)
     end
@@ -44,7 +44,7 @@ GMM(n::Int, x::Vector{T}; method::Symbol=:kmeans, nInit::Int=50, nIter::Int=10,
 
 ## we sometimes end up with pathological gmms
 function sanitycheck!(gmm::GMM)
-    pathological=NTuple{2}[]
+    pathological = NTuple{2}[]
     for i in findall(isnan.(gmm.μ) .| isinf.(gmm.μ))
         gmm.μ[i] = 0
         push!(pathological, CartesianIndices(gmm.μ)[i])
@@ -62,12 +62,13 @@ function sanitycheck!(gmm::GMM)
             end
         end
     end
-    if (np=length(pathological)) > 0
+    np = length(pathological)
+    if np > 0
         mesg = string(np, " pathological elements normalized")
         addhist!(gmm, mesg)
         @warn(mesg)
     end
-    pathological
+    return pathological
 end
 
@@ -77,7 +78,7 @@ function GMMk(n::Int, x::DataOrMatrix{T}; kind=:diag, nInit::Int=50, nIter::Int=
     hist = [History(@sprintf("Initializing GMM, %d Gaussians %s covariance %d dimensions using %d data points", n, diag, d, nₓ))]
     @info(last(hist).s)
     ## subsample x to max 1000 points per mean
-    nneeded = 1000*n
+    nneeded = 1000 * n
     if nₓ < nneeded
         if isa(x, Matrix)
             xx = x
@@ -92,20 +93,20 @@ function GMMk(n::Int, x::DataOrMatrix{T}; kind=:diag, nInit::Int=50, nIter::Int=
             ## all data, and may require a lot of memory for very long lists.
             yy = Matrix[]
             for y in x
-                ny = size(y,1)
+                ny = size(y, 1)
                 nsample = min(ny, @compat ceil(Integer, nneeded / length(x)))
                 push!(yy, y[sample(1:ny, nsample, replace=false),:])
             end
             xx = vcat(yy...)
         end
     end
-    #if Logging._root.level ≤ Logging.DEBUG
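+    ## set the kmeans display level from the minimum enabled level of the global logger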
+    if Logging.min_enabled_level(Logging.global_logger()) ≤ Logging.Debug
         loglevel = :iter
-    #elseif Logging._root.level ≤ Logging.INFO
-    #    loglevel = :final
-    #else
-    #    loglevel = :none
-    #end
+    elseif Logging.min_enabled_level(Logging.global_logger()) ≤ Logging.Info
+        loglevel = :final
+    else
+        loglevel = :none
+    end
     km = Clustering.kmeans(xx'[:,:], n, maxiter=nInit, display = loglevel)
     μ::Matrix{T} = km.centers'
     if kind == :diag
@@ -115,7 +116,7 @@ function GMMk(n::Int, x::DataOrMatrix{T}; kind=:diag, nInit::Int=50, nIter::Int=
             if length(sel) < 2
                 return ones(1,d)
             else
-                return var(xx[sel,:],dims=1)
+                return var(xx[sel,:], dims=1)
             end
         end
         Σ = convert(Matrix{T},vcat(map(variance, 1:n)...))
@@ -125,22 +126,22 @@ function GMMk(n::Int, x::DataOrMatrix{T}; kind=:diag, nInit::Int=50, nIter::Int=
             if sum(sel) < d
                 return cholinv(eye(d))
             else
-                return cholinv(cov(xx[sel,:]))
+                return cholinv(cov(xx[sel, :]))
             end
         end
-        Σ = convert(FullCov{T},[cholinvcov(i) for i=1:n])
+        Σ = convert(FullCov{T}, [cholinvcov(i) for i in 1:n])
     else
         error("Unknown kind")
     end
     w::Vector{T} = km.counts ./ sum(km.counts)
     nxx = size(xx,1)
     ng = length(w)
-    push!(hist, History(string("K-means with ", nxx, " data points using ", km.iterations, " iterations\n", @sprintf("%3.1f data points per parameter",nxx/((d+1)ng)))))
+    push!(hist, History(string("K-means with ", nxx, " data points using ", km.iterations, " iterations\n", @sprintf("%3.1f data points per parameter", nxx / ((d+1)ng)))))
     @info(last(hist).s)
     gmm = GMM(w, μ, Σ, hist, nxx)
     sanitycheck!(gmm)
     em!(gmm, x; nIter=nIter, sparse=sparse)
-    gmm
+    return gmm
 end
 
 ## Train a GMM by consecutively splitting all means. n must be a power of 2
@@ -150,22 +151,22 @@ function GMM2(n::Int, x::DataOrMatrix; kind=:diag, nIter::Int=10, nFinal::Int=nI
     log2n = round(Int,log2(n))
     2^log2n == n || error("n must be power of 2")
     gmm = GMM(x, kind=kind)
-    tll = [avll(gmm,x)]
+    tll = [avll(gmm, x)]
     @info("0: avll = ", tll[1])
-    for i=1:log2n
+    for i in 1:log2n
         gmm = gmmsplit(gmm)
-        avll = em!(gmm, x; nIter=i==log2n ? nFinal : nIter, sparse=sparse)
-        @info(i, ": avll = ", avll)
+        avll = em!(gmm, x; nIter=(i==log2n ? nFinal : nIter), sparse=sparse)
+        @info(i, avll)
         append!(tll, avll)
     end
     @info("Total log likelihood: ", tll)
-    gmm
+    return gmm
 end
 
 ## weighted logsumexp
 function logsumexpw(x::Matrix, w::Vector)
     y = x .+ log.(w)'
-    logsumexp(y, 2)
+    return logsumexp(y, 2)
 end
 
 ## split a mean according to the covariance matrix
@@ -173,7 +174,7 @@ function gmmsplit(μ::Vector{T}, Σ::Matrix{T}, sep=0.2) where T
     tsep::T = sep
     d, v = eigs(Σ, nev=1)
     p1 = tsep * d[1] * v[:,1]          # first principal component
-    μ - p1, μ + p1
+    return μ - p1, μ + p1
 end
 
 function gmmsplit(μ::Vector{T}, Σ::Vector{T}, sep=0.2) where T
@@ -181,7 +182,7 @@ function gmmsplit(μ::Vector{T}, Σ::Vector{T}, sep=0.2) where T
     maxi = argmax(Σ)
     p1 = zeros(length(μ))
     p1[maxi] = tsep * Σ[maxi]
-    μ - p1, μ + p1
+    return μ - p1, μ + p1
 end
 
 ## Split a gmm in order to double the amount of gaussians
@@ -193,10 +194,10 @@ function gmmsplit(gmm::GMM{T}; minweight=1e-5, sep=0.2) where T
     if (length(offInd)>0)
         @info("Removing Gaussians with no data");
     end
-    for i=1:length(offInd)
+    for i in 1:length(offInd)
         gmm.w[maxi[i]] = gmm.w[offInd[i]] = gmm.w[maxi[i]]/2;
-        gmm.μ[offInd[i],:] = gmm.μ[maxi[i],:] + tsep * √gmm.Σ[maxi[i],:]
-        gmm.μ[maxi[i],:] = gmm.μ[maxi[i],:] - tsep * √gmm.Σ[maxi[i],:]
+        gmm.μ[offInd[i],:] = gmm.μ[maxi[i],:] + tsep * √gmm.Σ[maxi[i], :]
+        gmm.μ[maxi[i],:] = gmm.μ[maxi[i],:] - tsep * √gmm.Σ[maxi[i], :]
     end
     gmmkind = kind(gmm)
     n = gmm.n
@@ -208,17 +209,17 @@ function gmmsplit(gmm::GMM{T}; minweight=1e-5, sep=0.2) where T
     else
         Σ = similar(gmm.Σ, 2n)
     end
-    for oi=1:n
+    for oi in 1:n
         ni = 2oi-1 : 2oi
         w[ni] .= gmm.w[oi]/2
         if gmmkind == :diag
-            μ[ni,:] = hcat(gmmsplit(vec(gmm.μ[oi,:]), vec(gmm.Σ[oi,:]), tsep)...)'
-            for k=ni
-                Σ[k,:] = gmm.Σ[oi,:] # implicity copy
+            μ[ni, :] = hcat(gmmsplit(vec(gmm.μ[oi, :]), vec(gmm.Σ[oi, :]), tsep)...)'
+            for k in ni
+                Σ[k, :] = gmm.Σ[oi,:] # implicitly copy
             end
         elseif gmmkind == :full
-            μ[ni,:] = hcat(gmmsplit(vec(gmm.μ[oi,:]), covar(gmm.Σ[oi]), tsep)...)'
-            for k=ni
+            μ[ni, :] = hcat(gmmsplit(vec(gmm.μ[oi, :]), covar(gmm.Σ[oi]), tsep)...)'
+            for k in ni
                 Σ[k] = copy(gmm.Σ[oi])
             end
         else
@@ -226,7 +227,7 @@ function gmmsplit(gmm::GMM{T}; minweight=1e-5, sep=0.2) where T
         end
     end
     hist = vcat(gmm.hist, History(@sprintf("split to %d Gaussians", 2n)))
-    GMM(w, μ, Σ, hist, gmm.nx)
+    return GMM(w, μ, Σ, hist, gmm.nx)
 end
 
 # This function runs the Expectation Maximization algorithm on the GMM, and returns
@@ -243,7 +244,7 @@ function em!(gmm::GMM, x::DataOrMatrix; nIter::Int = 10, varfloor::Float64=1e-3,
 
     @logmsg moreInfo "Running $nIter iterations EM on $gmmkind cov GMM with $ng Gaussians in $d dimensions"
 
-    for i=1:nIter
+    for i in 1:nIter
         ## E-step
         nₓ, ll[i], N, F, S = stats(gmm, x, parallel=true)
         ## M-step
@@ -256,10 +257,10 @@ function em!(gmm::GMM, x::DataOrMatrix; nIter::Int = 10, varfloor::Float64=1e-3,
             if (any(tooSmall))
                 ind = findall(tooSmall)
                 @warn("Variances had to be floored ", ind)
-                gmm.Σ[ind,:] = initc[ind,:]
+                gmm.Σ[ind,:] = initc[ind, :]
             end
         elseif gmmkind == :full
-            for k=1:ng
+            for k in 1:ng
                 if N[k] < d
                     @warn(@sprintf("Too low occupancy count %3.1f for Gaussian %d", N[k], k))
                 else
@@ -271,8 +272,7 @@ function em!(gmm::GMM, x::DataOrMatrix; nIter::Int = 10, varfloor::Float64=1e-3,
             error("Unknown kind")
         end
         sanitycheck!(gmm)
-        loginfo = @sprintf("iteration %d, average log likelihood %f",
-                           i, ll[i] / (nₓ*d))
+        loginfo = @sprintf("iteration %d, average log likelihood %f", i, ll[i] / (nₓ * d))
         addhist!(gmm, loginfo)
     end
     if nIter>0
@@ -280,11 +280,11 @@ function em!(gmm::GMM, x::DataOrMatrix; nIter::Int = 10, varfloor::Float64=1e-3,
         finalll = ll[nIter]
     else
         finalll = avll(gmm, x)
-        nₓ = size(x,1)
+        nₓ = size(x, 1)
     end
     gmm.nx = nₓ
-    addhist!(gmm,@sprintf("EM with %d data points %d iterations avll %f\n%3.1f data points per parameter",nₓ,nIter,finalll,nₓ/nparams(gmm)))
-    ll
+    addhist!(gmm, @sprintf("EM with %d data points %d iterations avll %f\n%3.1f data points per parameter",nₓ,nIter,finalll,nₓ/nparams(gmm)))
+    return ll
 end
 
 ## this function returns the contributions of the individual Gaussians to the LL
@@ -302,11 +302,11 @@ function llpg(gmm::GMM{GT,DCT}, x::Matrix{T}) where DCT <: DiagCov{GT} where {GT,T}
     normalization = 0.5 * (d * log(2π) .+ sum(log.(gmm.Σ), dims=2)) # ng × 1
     sm2p = sum(mp .* gmm.μ, dims=2)    # sum over d mean^2*precision, ng × 1
     ## from here on data-dependent calculations
-    xx = x.^2                          # nₓ × d
+    xx = x .^ 2                        # nₓ × d
     pxx = sm2p' .+ xx * prec'          # nₓ × ng
     mpx = x * mp'                      # nₓ × ng
-    # L = broadcast(*, a', exp(mpx-0.5pxx)) # nₓ × ng, Likelihood per frame per Gaussian
-    mpx-0.5pxx .- normalization'
+    # L = broadcast(*, a', exp(mpx - 0.5pxx)) # nₓ × ng, Likelihood per frame per Gaussian
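+    ## per frame and Gaussian: log N(x; μ, Σ) = x'Λμ - (x'Λx + μ'Λμ)/2 - normalization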
+    return mpx - 0.5pxx .- normalization'
 end
 
 ## A function we see more often... Λ is in chol(inv(Σ)) form
@@ -315,15 +315,15 @@ end
 function xμTΛxμ!(Δ::Matrix, x::Matrix, μ::Vector, ciΣ::UpperTriangular)
 #    broadcast!(-, Δ, x, μ)            # size: nₓ × d, add ops: nₓ * d
     (nₓ, d) = size(x)
-    @inbounds for j = 1:d
+    @inbounds for j in 1:d
         μj = μ[j]
-        for i = 1:nₓ
-            Δ[i,j] = x[i,j] - μj
+        for i in 1:nₓ
+            Δ[i, j] = x[i, j] - μj
         end
     end
-    tmp = Δ*ciΣ'                       # size: nₓ × d, mult ops nₓ*d^2
+    tmp = Δ * ciΣ'                     # size: nₓ × d, mult ops nₓ*d^2
 
-    Δ[:,:] .= tmp[:,:]
+    Δ[:, :] .= tmp[:, :]
 end
 
 ## full covariance version of llpg()
@@ -331,15 +331,15 @@ function llpg(gmm::GMM{GT,FCT}, x::Matrix{T}) where FCT <: FullCov{GT} where {GT
     RT = promote_type(GT,T)
     (nₓ, d) = size(x)
     ng = gmm.n
-    d==gmm.d || error("Inconsistent size gmm and x")
+    d == gmm.d || error("Inconsistent size gmm and x")
     ll = Array{RT}(undef, nₓ, ng)
     Δ = Array{RT}(undef, nₓ, d)
     ## Σ's now are inverse Cholesky's, so logdet becomes -2sum(log(diag))
-    normalization = [0.5d*log(2π) - sum(log.(diag((gmm.Σ[k])))) for k=1:ng]
-    for k=1:ng
+    normalization = [0.5d*log(2π) - sum(log.(diag((gmm.Σ[k])))) for k in 1:ng]
+    for k in 1:ng
         ## Δ = (x_i - μ_k)' Λ_k (x_i - μ_k)
         xμTΛxμ!(Δ, x, vec(gmm.μ[k,:]), gmm.Σ[k])
-        ll[:,k] = -0.5 * sum(abs2,Δ,dims=2) .- normalization[k]
+        ll[:, k] = -0.5 * sum(abs2, Δ, dims=2) .- normalization[k]
     end
     return ll::Matrix{RT}
 end
 
@@ -347,13 +347,13 @@ end
 ## Average log-likelihood per data point and per dimension for a given GMM
 function avll(gmm::GMM, x::Matrix{T}) where T<:AbstractFloat
     gmm.d == size(x,2) || error("Inconsistent size gmm and x")
-    mean(logsumexpw(llpg(gmm, x), gmm.w)) / gmm.d
+    return mean(logsumexpw(llpg(gmm, x), gmm.w)) / gmm.d
 end
 
 ## Data version
 function avll(gmm::GMM, d::Data)
     llpf = dmap(x->logsumexpw(llpg(gmm,x), gmm.w), d)
-    sum(map(sum, llpf)) / sum(map(length, llpf)) / gmm.d
+    return sum(map(sum, llpf)) / sum(map(length, llpf)) / gmm.d
 end
 
 ## import Distributions.posterior
@@ -369,5 +369,5 @@ function gmmposterior(gmm::GMM{GT}, x::Matrix{T}) where {GT, T}  # nₓ × ng
     logp = ll .+ log.(gmm.w')
     logsump = logsumexp(logp, 2)
     broadcast!(-, logp, logp, logsump)
-    exp.(logp)::Matrix{RT}, ll::Matrix{RT}
+    return exp.(logp)::Matrix{RT}, ll::Matrix{RT}
 end