diff --git a/Project.toml b/Project.toml index cfb0d85f..7ef04f15 100644 --- a/Project.toml +++ b/Project.toml @@ -13,8 +13,8 @@ Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" InvertedIndices = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" +LearnAPI = "92ad9a40-7767-427a-9ee6-6e577f1266cb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" @@ -22,13 +22,22 @@ Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Reexport = "189a3867-3050-52da-a836-e630ba90ab69" ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541" +StatisticalMeasuresBase = "c062fc1d-0d66-479b-b6ac-8b44719de4cc" StatisticalTraits = "64bff920-2084-43da-a3e6-9bb72801c0c9" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +[weakdeps] +StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541" + +[extensions] +DefaultMeasuresExt = "StatisticalMeasures" + [compat] CategoricalArrays = "0.9, 0.10" CategoricalDistributions = "0.1" @@ -36,14 +45,17 @@ ComputationalResources = "0.3" DelimitedFiles = "1" Distributions = "0.25.3" InvertedIndices = "1" -LossFunctions = "0.11" +LearnAPI = "0.1" MLJModelInterface = "1.7" Missings = "0.4, 1" OrderedCollections = "1.1" Parameters = "0.12" PrettyTables = "1, 2" ProgressMeter = "1.7.1" +Reexport = "1.2" ScientificTypes = "3" +StatisticalMeasures = "0.1.1" +StatisticalMeasuresBase = "0.1.1" StatisticalTraits = "3.2" StatsBase = "0.32, 0.33, 0.34" Tables = "0.2, 1.0" @@ -57,8 +69,9 @@ Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411" NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" [targets] -test = ["DataFrames", "DecisionTree", "Distances", "Logging", "MultivariateStats", "NearestNeighbors", "StableRNGs", "Test", "TypedTables"] +test = ["DataFrames", "DecisionTree", "Distances", "Logging", "MultivariateStats", "NearestNeighbors", "StableRNGs", "StatisticalMeasures", "Test", "TypedTables"] diff --git a/README.md b/README.md index 9323a9c2..8e9fc1e5 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,11 @@ repository provides core functionality for MLJ, including: - basic utilities for **manipulating datasets** and for **synthesizing datasets** (src/data) -- a [small interface](https://alan-turing-institute.github.io/MLJ.jl/dev/evaluating_model_performance/#Custom-resampling-strategies-1) for **resampling strategies** and implementations, including `CV()`, `StratifiedCV` and `Holdout` (src/resampling.jl) +- a [small + interface](https://alan-turing-institute.github.io/MLJ.jl/dev/evaluating_model_performance/#Custom-resampling-strategies-1) + for **resampling strategies** and implementations, including `CV()`, 
`StratifiedCV` and + `Holdout` (src/resampling.jl). Actual performance evaluation measures (aka metrics), which previously + were provided by MLJBase.jl, now live in [StatisticalMeasures.jl](https://juliaai.github.io/StatisticalMeasures.jl/dev/). - methods for **performance evaluation**, based on those resampling strategies (src/resampling.jl) @@ -44,9 +48,4 @@ repository provides core functionality for MLJ, including: associated methods, for use with [MLJTuning](https://github.com/JuliaAI/MLJTuning.jl) (src/hyperparam) -- a [small - interface](https://alan-turing-institute.github.io/MLJ.jl/dev/performance_measures/#Traits-and-custom-measures-1) - for **performance measures** (losses and scores), implementation of about 60 such measures, including integration of the - [LossFunctions.jl](https://github.com/JuliaML/LossFunctions.jl) - library (src/measures). To be migrated into separate package in the near future. diff --git a/ext/DefaultMeasuresExt.jl b/ext/DefaultMeasuresExt.jl new file mode 100644 index 00000000..a06cd00f --- /dev/null +++ b/ext/DefaultMeasuresExt.jl @@ -0,0 +1,15 @@ +module DefaultMeasuresExt + +using MLJBase +import MLJBase:default_measure, ProbabilisticDetector, DeterministicDetector +using StatisticalMeasures +using StatisticalMeasures.ScientificTypesBase + +default_measure(::Deterministic, ::Type{<:Union{Continuous,Count}}) = l2 +default_measure(::Deterministic, ::Type{<:Finite}) = misclassification_rate +default_measure(::Probabilistic, ::Type{<:Union{Finite,Count}}) = log_loss +default_measure(::Probabilistic, ::Type{<:Continuous}) = log_loss +default_measure(::ProbabilisticDetector, ::Type{<:OrderedFactor{2}}) = area_under_curve +default_measure(::DeterministicDetector, ::Type{<:OrderedFactor{2}}) = balanced_accuracy + +end # module diff --git a/src/MLJBase.jl b/src/MLJBase.jl index 40e00dc8..c186d1de 100644 --- a/src/MLJBase.jl +++ b/src/MLJBase.jl @@ -1,8 +1,9 @@ -module MLJBase +module MLJBase # =================================================================== # IMPORTS +using Reexport import Base: ==, precision, getindex, setindex! 
import Base.+, Base.*, Base./ @@ -16,7 +17,7 @@ for trait in StatisticalTraits.TRAITS eval(:(import StatisticalTraits.$trait)) end -import Base.instances # considered a trait for measures +import LearnAPI import StatisticalTraits.snakecase import StatisticalTraits.info @@ -47,7 +48,7 @@ end ################### # Hack Block ends # ################### - +import MLJModelInterface: ProbabilisticDetector, DeterministicDetector import MLJModelInterface: fit, update, update_data, transform, inverse_transform, fitted_params, predict, predict_mode, predict_mean, predict_median, predict_joint, @@ -78,8 +79,6 @@ using ProgressMeter import .Threads # Operations & extensions -import LossFunctions -import LossFunctions.Traits import StatsBase import StatsBase: fit!, mode, countmap import Missings: levels @@ -89,6 +88,9 @@ using CategoricalDistributions import Distributions: pdf, logpdf, sampler const Dist = Distributions +# Measures +import StatisticalMeasuresBase + # from Standard Library: using Statistics, LinearAlgebra, Random, InteractiveUtils @@ -128,57 +130,6 @@ const CatArrMissing{T,N} = ArrMissing{CategoricalValue{T},N} const MMI = MLJModelInterface const FI = MLJModelInterface.FullInterface -const MARGIN_LOSSES = [ - :DWDMarginLoss, - :ExpLoss, - :L1HingeLoss, - :L2HingeLoss, - :L2MarginLoss, - :LogitMarginLoss, - :ModifiedHuberLoss, - :PerceptronLoss, - :SigmoidLoss, - :SmoothedL1HingeLoss, - :ZeroOneLoss -] - -const DISTANCE_LOSSES = [ - :HuberLoss, - :L1EpsilonInsLoss, - :L2EpsilonInsLoss, - :LPDistLoss, - :LogitDistLoss, - :PeriodicLoss, - :QuantileLoss -] - -const WITH_PARAMETERS = [ - :DWDMarginLoss, - :SmoothedL1HingeLoss, - :HuberLoss, - :L1EpsilonInsLoss, - :L2EpsilonInsLoss, - :LPDistLoss, - :QuantileLoss, -] - -const MEASURE_TYPE_ALIASES = [ - :FPR, :FNR, :TPR, :TNR, - :FDR, :PPV, :NPV, :Recall, :Specificity, - :MFPR, :MFNR, :MTPR, :MTNR, - :MFDR, :MPPV, :MNPV, :MulticlassRecall, :MulticlassSpecificity, - :MCR, - :MCC, - :BAC, :BACC, - :RMS, :RMSPV, :RMSL, :RMSLP, :RMSP, - :MAV, :MAE, :MAPE, - :RSQ, :LogCosh, - :CrossEntropy, - :AUC -] - -const LOSS_FUNCTIONS = vcat(MARGIN_LOSSES, DISTANCE_LOSSES) - # =================================================================== # Computational Resource # default_resource allows to switch the mode of parallelization @@ -225,15 +176,10 @@ include("data/data.jl") include("data/datasets.jl") include("data/datasets_synthetic.jl") -include("measures/measures.jl") -include("measures/measure_search.jl") -include("measures/doc_strings.jl") +include("default_measures.jl") include("composition/models/stacking.jl") -# function on the right-hand side is defined in src/measures/meta_utilities.jl: -const MEASURE_TYPES_ALIASES_AND_INSTANCES = measures_for_export() - const EXTENDED_ABSTRACT_MODEL_TYPES = vcat( MLJBase.MLJModelInterface.ABSTRACT_MODEL_SUBTYPES, MLJBase.NETWORK_COMPOSITE_TYPES, # src/composition/models/network_composite_types.jl @@ -357,23 +303,8 @@ export ResamplingStrategy, Holdout, CV, StratifiedCV, TimeSeriesCV, # ------------------------------------------------------------------- # exports from MLJBase specific to measures -# measure names: -for m in MEASURE_TYPES_ALIASES_AND_INSTANCES - :(export $m) |> eval -end - -# measures/registry.jl: -export measures, metadata_measure - # measure/measures.jl (excluding traits): -export aggregate, default_measure, value, skipinvalid - -# measures/probabilistic: -export roc_curve, roc - -# measures/finite.jl (averaging modes for multiclass scores) -export no_avg, macro_avg, micro_avg - +export 
default_measure # ------------------------------------------------------------------- # re-export from Random, StatsBase, Statistics, Distributions, @@ -381,4 +312,10 @@ export no_avg, macro_avg, micro_avg export pdf, sampler, mode, median, mean, shuffle!, categorical, shuffle, levels, levels!, std, Not, support, logpdf, LittleDict +# for julia < 1.9 +if !isdefined(Base, :get_extension) + include(joinpath("..","ext", "DefaultMeasuresExt.jl")) + @reexport using .DefaultMeasuresExt.StatisticalMeasures +end + end # module diff --git a/src/composition/models/stacking.jl b/src/composition/models/stacking.jl index 4a760e24..ec872c16 100644 --- a/src/composition/models/stacking.jl +++ b/src/composition/models/stacking.jl @@ -378,14 +378,23 @@ model_2, ...), ...) function internal_stack_report( stack::Stack{modelnames,}, verbosity::Int, - tt_pairs, + tt_pairs, # train_test_pairs folds_evaluations... ) where modelnames n_measures = length(stack.measures) nfolds = length(tt_pairs) - # For each model we record the results mimicking the fields PerformanceEvaluation + test_fold_sizes = map(tt_pairs) do train_test_pair + test = last(train_test_pair) + length(test) + end + + # weights to be used to aggregate per-fold measurements (averaging to 1): + fold_weights(mode) = nfolds .* test_fold_sizes ./ sum(test_fold_sizes) + fold_weights(::StatisticalMeasuresBase.Sum) = nothing + + # For each model we record the results mimicking the fields of PerformanceEvaluation results = NamedTuple{modelnames}( [( model = model, @@ -393,7 +402,7 @@ function internal_stack_report( measurement = Vector{Any}(undef, n_measures), operation = _actual_operations(nothing, stack.measures, model, verbosity), per_fold = [Vector{Any}(undef, nfolds) for _ in 1:n_measures], - per_observation = Vector{Union{Missing, Vector{Any}}}(missing, n_measures), + per_observation = [Vector{Vector{Any}}(undef, nfolds) for _ in 1:n_measures], fitted_params_per_fold = [], report_per_fold = [], train_test_pairs = tt_pairs, @@ -419,30 +428,29 @@ function internal_stack_report( model_results.operation, )) ypred = operation(mach, Xtest) - loss = measure(ypred, ytest) - # Update per_observation - if reports_each_observation(measure) - if model_results.per_observation[i] === missing - model_results.per_observation[i] = Vector{Any}(undef, nfolds) - end - model_results.per_observation[i][foldid] = loss - end + measurements = StatisticalMeasuresBase.measurements(measure, ypred, ytest) + + # Update per observation: + model_results.per_observation[i][foldid] = measurements # Update per_fold - model_results.per_fold[i][foldid] = - reports_each_observation(measure) ? 
-                    MLJBase.aggregate(loss, measure) : loss
+            model_results.per_fold[i][foldid] = measure(ypred, ytest)
         end
         index += 1
     end
 end
 
-    # Update measurement field by aggregation
+    # Update measurement field by aggregating per-fold measurements
     for modelname in modelnames
         for (i, measure) in enumerate(stack.measures)
             model_results = results[modelname]
+            mode = StatisticalMeasuresBase.external_aggregation_mode(measure)
             model_results.measurement[i] =
-                MLJBase.aggregate(model_results.per_fold[i], measure)
+                StatisticalMeasuresBase.aggregate(
+                    model_results.per_fold[i];
+                    mode,
+                    weights=fold_weights(mode),
+                )
         end
     end
diff --git a/src/default_measures.jl b/src/default_measures.jl
new file mode 100644
index 00000000..2488bbf5
--- /dev/null
+++ b/src/default_measures.jl
@@ -0,0 +1,23 @@
+# # DEFAULT MEASURES
+
+"""
+    default_measure(model)
+
+Return a measure that should work with `model`, or return `nothing` if none can be
+reliably inferred.
+
+For Julia 1.9 and higher, `nothing` is returned, unless StatisticalMeasures.jl is
+loaded.
+
+# New implementations
+
+This method dispatches `default_measure(model, observation_scitype)`, which has
+`nothing` as the fallback return value. Extend `default_measure` by overloading this
+version of the method. See for example the MLJBase.jl package extension,
+DefaultMeasuresExt.jl.
+
+"""
+default_measure(m) = nothing
+default_measure(m::Union{Supervised,Annotator}) =
+    default_measure(m, nonmissingtype(guess_model_target_observation_scitype(m)))
+default_measure(m, S) = nothing
diff --git a/src/measures/README.md b/src/measures/README.md
deleted file mode 100644
index 0097d2f7..00000000
--- a/src/measures/README.md
+++ /dev/null
@@ -1,117 +0,0 @@
-## Adding new measures
-
-This document assumes familiarity with the traits provided for
-measures. For a summary, query the docstring for
-`MLJBase.metadata_measures`.
-
-A measure is ordinarily called on data directly, as in
-
-```julia
-ŷ = rand(3) # predictions
-y = rand(3) # ground truth observations
-
-m = LPLoss(p=3)
-
-julia> m(ŷ, y)
-3-element Vector{Float64}:
- 0.07060087052171798
- 0.003020044780949528
- 0.019067038457889922
-```
-
-To call a measure without performing dimension or pool checks, one
-uses `MLJBase.call` instead:
-
-```julia
-MLJBase.call(m, ŷ, y)
-```
-
-A new measure reporting an aggregate measurement, such as
-`AreaUnderCurve`, will subtype `Aggregated`, and only needs to
-implement `call`. A measure that reports a measurement for each
-observation, such as `LPLoss`, subtypes `Unaggregated` and only needs
-to implement an evaluation method for single observations called
-`single`.
-
-Recall also that if a measure reports each observation, it does so
-even in the case that weights are additionally specified:
-
-```julia
-w = rand(3) # per-observation weights
-
-julia> m(ŷ, y, w)
-3-element Vector{Float64}:
- 0.049333392516241206
- 0.0017612002314472718
- 0.003157450446692638
-```
-
-This behaviour differs from other places where weights can only be
-specified as part of an aggregation of multi-observation measurements.
-
-
-### Unaggregated measures implement `single`
-
-To implement an `Unaggregated` measure, it suffices to implement `single(measure, η̂, η)`,
-which should return a measurement (e.g., a float) for a single example `(η̂, η)` (e.g., a
-pair of floats). There is no need for `single` to handle `missing` values. (Internally, a
-wrapper function `robust_single` handles these.)
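For orientation, here is a minimal sketch of the pattern just described. This example is an editorial illustration, not part of the deleted file; `MyAbsLoss` is hypothetical, and `Unaggregated`/`single` are the (now removed) MLJBase internals:

```julia
# Hypothetical Unaggregated measure under the legacy API described above:
struct MyAbsLoss <: MLJBase.Unaggregated end

# One measurement per (η̂, η) pair; `missing` handling is left to the
# internal wrapper `robust_single`:
MLJBase.single(::MyAbsLoss, η̂, η) = abs(η̂ - η)
```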
-
-If only `single` is implemented, then the measure will automatically
-support per-observation weights and, where that makes sense, per-class
-weights. However, `supports_class_weights` may need to be overloaded,
-as this defaults to `false`.
-
-#### Special cases
-
-If `single` is *not* implemented, then `call(measure, ŷ, y)`, and optionally
-`call(measure, ŷ, y, w)`, must be implemented (the fallbacks call `robust_single`, a
-wrapped version of `single` that handles `missing` values). In this case `y` and `ŷ` are
-arrays of matching size and the method should return an array of that size *without
-performing size or pool checks*. The method should handle `missing` and `NaN` values if
-possible, which should be propagated to relevant elements of the returned array.
-
-The `supports_weights` trait, which defaults to `true`, will need to
-be overloaded to return `false` if neither `single(::MyMeasure,
-args...)` nor `call(::MyMeasure, ŷ, y, w::AbstractArray)` are
-overloaded.
-
-### Aggregated measures implement `call`
-
-To implement an `Aggregated` measure, implement
-`call(measure::MyMeasure, ŷ, y)`. Optionally implement
-`call(measure::MyMeasure, ŷ, y, w)`.
-
-
-### Trait declarations
-
-Measure traits can be set using the `metadata_measure`
-function (query the doc-string) or individually, as in
-
-```julia
-supports_weights(::Type{<:MyMeasure}) = false
-```
-
-Defaults are shown below:
-
-trait                    | allowed values               | default
--------------------------|------------------------------|--------------
-`target_scitype`         | some scientific type         | `Unknown`
-`human_name`             | any `String`                 | string version of type name
-`instances`              | any `Vector{String}`         | empty
-`prediction_type`        | `:deterministic`, `:probabilistic`, `:interval`, `:unknown` | `:unknown`
-`orientation`            | `:score`, `:loss`, `:unknown`| `:unknown`
-`aggregation`            | `Mean()`, `Sum()`, `RootMeanSquare()` | `Mean()`
-`supports_weights`       | `true` or `false`            | `true`
-`supports_class_weights` | `true` or `false`            | `false`
-`docstring`              | any `String`                 | includes `name`, `human_name` and `instances`
-`distribution_type`      | any `Distribution` subtype or `Unknown` | `Unknown`
-
-### Exporting the measure and its aliases
-
-If you create a type alias, as in `const MAE = MeanAbsoluteValue`,
-then you must add this alias to the constant
-`MEASURE_TYPE_ALIASES`. That is the only step needed, as the macro
-`@export_measures` programmatically exports all measure types and
-their instances, and those aliases listed in `MEASURE_TYPE_ALIASES`.
diff --git a/src/measures/confusion_matrix.jl b/src/measures/confusion_matrix.jl
deleted file mode 100644
index fd35dd26..00000000
--- a/src/measures/confusion_matrix.jl
+++ /dev/null
@@ -1,273 +0,0 @@
-## CONFUSION MATRIX OBJECT
-
-"""
-    ConfusionMatrixObject{C}
-
-Confusion matrix with `C ≥ 2` classes. Rows correspond to predicted values
-and columns to the ground truth.
-"""
-struct ConfusionMatrixObject{C}
-    mat::Matrix
-    labels::Vector{String}
-end
-
-"""
-    ConfusionMatrixObject(m, labels)
-
-Instantiates a confusion matrix out of a square integer matrix `m`.
-Rows are the predicted class, columns the ground truth. See also the
-[wikipedia article](https://en.wikipedia.org/wiki/Confusion_matrix).
-
-"""
-function ConfusionMatrixObject(m::Matrix{Int}, labels::Vector{String})
-    s = size(m)
-    s[1] == s[2] || throw(ArgumentError("Expected a square matrix."))
-    s[1] > 1 || throw(ArgumentError("Expected a matrix of size ≥ 2x2."))
-    length(labels) == s[1] ||
-        throw(ArgumentError("As many labels as classes must be provided."))
-    ConfusionMatrixObject{s[1]}(m, labels)
-end
-
-# allow access to cm[i,j] but not setting (it's immutable)
-Base.getindex(cm::ConfusionMatrixObject, inds...) = getindex(cm.mat, inds...)
-
-_levels(y1, y2) = vcat(levels(y1), levels(y2)) |> unique
-
-# simultaneous coercion of two vectors into categorical vectors having
-# the same pool:
-function _categorical(y1, y2)
-    L = _levels(y1, y2)
-    return categorical(y1, levels=L), categorical(y2, levels=L)
-end
-_categorical(y1::CategoricalArray{V1,N},
-             y2::CategoricalArray{V2,N}) where
-    {V, V1<:Union{Missing,V}, V2<:Union{Missing,V}, N} =
-    y1, y2
-_categorical(y1::AbstractArray{<:CategoricalArrays.CategoricalValue},
-             y2::AbstractArray{<:CategoricalArrays.CategoricalValue}) =
-    broadcast(identity, y1), broadcast(identity, y2)
-
-
-"""
-    _confmat(ŷ, y; rev=false)
-
-A private method. General users should use `confmat` or other instances
-of the measure type [`ConfusionMatrix`](@ref).
-
-Computes the confusion matrix given a predicted `ŷ` with categorical elements
-and the actual `y`. Rows are the predicted class, columns the ground truth.
-The ordering follows that of `levels(y)`.
-
-## Keywords
-
-* `rev=false`: in the binary case, this keyword allows swapping the ordering
-  of the classes.
-* `perm=[]`: in the general case, this keyword allows specifying a permutation
-  re-ordering the classes.
-* `warn=true`: whether to show a warning in case `y` does not have scientific
-  type `OrderedFactor{2}` (see note below).
-
-## Note
-
-To decrease the risk of unexpected errors, if `y` does not have
-scientific type `OrderedFactor{2}` (and so does not have a "natural
-ordering" negative-positive), a warning is shown indicating the
-current order, unless the user explicitly specifies either `rev` or
-`perm`, in which case it's assumed the user is aware of the class
-ordering.
-
-The `confusion_matrix` is a measure (although neither a score nor a
-loss) and so may be specified as such in calls to `evaluate` and
-`evaluate!`, although not in `TunedModel`s. In this case, however,
-there is no way to specify an ordering different from `levels(y)`,
-where `y` is the target.
- -""" -function _confmat(ŷraw::Union{Arr{V1,N}, CategoricalArray{V1,N}}, - yraw::Union{Arr{V2,N}, CategoricalArray{V2,N}}; - rev::Union{Nothing,Bool}=nothing, - perm::Union{Nothing,Vector{<:Integer}}=nothing, - warn::Bool=true) where - {V,V1<:Union{Missing,V}, V2<:Union{Missing,V},N} - - # no-op if vectors already categorical arrays: - ŷ, y = _categorical(ŷraw, yraw) - - levels_ = levels(y) - nc = length(levels_) - if rev !== nothing && rev && nc > 2 - throw(ArgumentError("Keyword `rev` can only be used in binary case.")) - end - if perm !== nothing && !isempty(perm) - length(perm) == nc || - throw(ArgumentError("`perm` must be of length matching the "* - "number of classes.")) - Set(perm) == Set(collect(1:nc)) || - throw(ArgumentError("`perm` must specify a valid permutation of "* - "`[1, 2, ..., c]`, where `c` is "* - "number of classes.")) - end - - # warning - if rev === nothing && perm === nothing - S = nonmissingtype(elscitype(y)) - if warn - if nc==2 && !(S <: OrderedFactor) - @warn "The classes are un-ordered,\n" * - "using: negative='$(levels_[1])' "* - "and positive='$(levels_[2])'.\n" * - "To suppress this warning, consider coercing "* - "to OrderedFactor." - elseif !(S <: OrderedFactor) - @warn "The classes are un-ordered,\n" * - "using order: $([l for l in levels_]).\n" * - "To suppress this warning, consider "* - "coercing to OrderedFactor." - end - end - rev = false - perm = Int[] - elseif rev !== nothing && nc == 2 - # rev takes precedence in binary case - if rev - perm = [2, 1] - else - perm = Int[] - end - end - - # No permutation - if isempty(perm) - cmat = zeros(Int, nc, nc) - @inbounds for i in eachindex(y) - (isinvalid(y[i]) || isinvalid(ŷ[i])) && continue - cmat[int(ŷ[i]), int(y[i])] += 1 - end - return ConfusionMatrixObject(cmat, string.(levels_)) - end - - # With permutation - cmat = zeros(Int, nc, nc) - iperm = invperm(perm) - @inbounds for i in eachindex(y) - (isinvalid(y[i]) || isinvalid(ŷ[i])) && continue - cmat[iperm[int(ŷ[i])], iperm[int(y[i])]] += 1 - end - return ConfusionMatrixObject(cmat, string.(levels_[perm])) -end - - -# Machinery to display the confusion matrix in a non-confusing way -# (provided the REPL is wide enough) - -splitw(w::Int) = (sp1 = div(w, 2); sp2 = w - sp1; (sp1, sp2)) - -function Base.show(stream::IO, m::MIME"text/plain", cm::ConfusionMatrixObject{C} - ) where C - width = displaysize(stream)[2] - mincw = ceil(Int, 12/C) - cw = max(length(string(maximum(cm.mat))),maximum(length.(cm.labels)),mincw) - firstcw = max(length(string(maximum(cm.mat))),maximum(length.(cm.labels)),9) - textlim = 9 - totalwidth = firstcw + cw * C + C + 2 - width < totalwidth && (show(stream, m, cm.mat); return) - - iob = IOBuffer() - wline = s -> write(iob, s * "\n") - splitcw = s -> (w = cw - length(s); splitw(w)) - splitfirstcw = s -> (w = firstcw - length(s); splitw(w)) - cropw = s -> length(s) > textlim ? 
s[1:prevind(s, textlim)] * "…" : s - - # 1.a top box - " "^(firstcw+1) * "┌" * "─"^((cw + 1) * C - 1) * "┐" |> wline - gt = "Ground Truth" - w = (cw + 1) * C - 1 - length(gt) - sp1, sp2 = splitw(w) - " "^(firstcw+1) * "│" * " "^sp1 * gt * " "^sp2 * "│" |> wline - # 1.b separator - "┌" * "─"^firstcw * "┼" * ("─"^cw * "┬")^(C-1) * "─"^cw * "┤" |> wline - # 2.a description line - pr = "Predicted" - sp1, sp2 = splitfirstcw(pr) - partial = "│" * " "^sp1 * pr * " "^sp2 * "│" - for c in 1:C - # max = 10 - s = cm.labels[c] |> cropw - sp1, sp2 = splitcw(s) - partial *= " "^sp1 * s * " "^sp2 * "│" - end - partial |> wline - # 2.b separating line - "├" * "─"^firstcw * "┼" * ("─"^cw * "┼")^(C-1) * ("─"^cw * "┤") |> wline - # 2.c line by line - for c in 1:C - # line - s = cm.labels[c] |> cropw - sp1, sp2 = splitfirstcw(s) - partial = "│" * " "^sp1 * s * " "^sp2 * "│" - for r in 1:C - e = string(cm[c, r]) - sp1, sp2 = splitcw(e) - partial *= " "^sp1 * e * " "^sp2 * "│" - end - partial |> wline - # separator - if c < C - "├" * "─"^firstcw * "┼" * ("─"^cw * "┼")^(C-1) * ("─"^cw * "┤") |> wline - end - end - # 2.d final line - "└" * "─"^firstcw * "┴" * ("─"^cw * "┴")^(C-1) * ("─"^cw * "┘") |> wline - write(stream, take!(iob)) -end - - -## CONFUSION MATRIX AS MEASURE - -struct ConfusionMatrix <: Aggregated - perm::Union{Nothing,Vector{<:Integer}} -end - -ConfusionMatrix(; perm=nothing) = ConfusionMatrix(perm) - -is_measure(::ConfusionMatrix) = true -is_measure_type(::Type{ConfusionMatrix}) = true -human_name(::Type{<:ConfusionMatrix}) = "confusion matrix" -target_scitype(::Type{ConfusionMatrix}) = - Union{AbstractVector{<:Union{Missing,OrderedFactor}}, - AbstractVector{<:Union{Missing,OrderedFactor}}} -supports_weights(::Type{ConfusionMatrix}) = false -prediction_type(::Type{ConfusionMatrix}) = :deterministic -instances(::Type{<:ConfusionMatrix}) = ["confusion_matrix", "confmat"] -orientation(::Type{ConfusionMatrix}) = :other -reports_each_observation(::Type{ConfusionMatrix}) = false -is_feature_dependent(::Type{ConfusionMatrix}) = false -aggregation(::Type{ConfusionMatrix}) = Sum() - -@create_aliases ConfusionMatrix - -@create_docs(ConfusionMatrix, -body= -""" -If `r` is the return value, then the raw confusion matrix is `r.mat`, -whose rows correspond to predictions, and columns to ground truth. -The ordering follows that of `levels(y)`. - -Use `ConfusionMatrix(perm=[2, 1])` to reverse the class order for binary -data. For more than two classes, specify an appropriate permutation, as in -`ConfusionMatrix(perm=[2, 3, 1])`. - -""", -scitype=DOC_ORDERED_FACTOR_BINARY) - -# calling behaviour: -call(m::ConfusionMatrix, ŷ, y) = _confmat(ŷ, y, perm=m.perm) - -# overloading addition to make aggregation work: -Base.round(m::MLJBase.ConfusionMatrixObject; kws...) 
= m
-function Base.:+(m1::ConfusionMatrixObject, m2::ConfusionMatrixObject)
-    if m1.labels != m2.labels
-        throw(ArgumentError("Confusion matrix labels must agree"))
-    end
-    ConfusionMatrixObject(m1.mat + m2.mat, m1.labels)
-end
diff --git a/src/measures/continuous.jl b/src/measures/continuous.jl
deleted file mode 100644
index 33670216..00000000
--- a/src/measures/continuous.jl
+++ /dev/null
@@ -1,315 +0,0 @@
-const InfiniteArrMissing = Union{
-    AbstractArray{<:Union{Missing,Continuous}},
-    AbstractArray{<:Union{Missing,Count}}}
-
-# -----------------------------------------------------------
-# MeanAbsoluteError
-
-struct MeanAbsoluteError <: Aggregated end
-
-metadata_measure(MeanAbsoluteError;
-                 instances = ["mae", "mav", "mean_absolute_error",
-                              "mean_absolute_value"],
-                 target_scitype = InfiniteArrMissing,
-                 prediction_type = :deterministic,
-                 orientation = :loss)
-
-const MAE = MeanAbsoluteError
-const MAV = MeanAbsoluteError
-@create_aliases MeanAbsoluteError
-
-@create_docs(MeanAbsoluteError,
-body=
-"""
-``\\text{mean absolute error} = n^{-1}∑ᵢ|yᵢ-ŷᵢ|`` or
-``\\text{mean absolute error} = n^{-1}∑ᵢwᵢ|yᵢ-ŷᵢ|``
-""",
-scitype=DOC_INFINITE)
-
-call(::MeanAbsoluteError, ŷ, y) = abs.(ŷ .- y) |> skipinvalid |> mean
-call(::MeanAbsoluteError, ŷ, y, w) = abs.(ŷ .- y) .* w |> skipinvalid |> mean
-
-# ----------------------------------------------------------------
-# RootMeanSquaredError
-
-struct RootMeanSquaredError <: Aggregated end
-
-metadata_measure(RootMeanSquaredError;
-                 instances = ["rms", "rmse",
-                              "root_mean_squared_error"],
-                 target_scitype = InfiniteArrMissing,
-                 prediction_type = :deterministic,
-                 orientation = :loss,
-                 aggregation = RootMeanSquare())
-
-const RMS = RootMeanSquaredError
-@create_aliases RootMeanSquaredError
-
-@create_docs(RootMeanSquaredError,
-body=
-"""
-``\\text{root mean squared error} = \\sqrt{n^{-1}∑ᵢ|yᵢ-ŷᵢ|^2}`` or
-``\\text{root mean squared error} = \\sqrt{\\frac{∑ᵢwᵢ|yᵢ-ŷᵢ|^2}{∑ᵢwᵢ}}``
-""",
-scitype=DOC_INFINITE)
-
-call(::RootMeanSquaredError, ŷ, y) = (y .- ŷ).^2 |> skipinvalid |> mean |> sqrt
-call(::RootMeanSquaredError, ŷ, y, w) = (y .- ŷ).^2 .* w |> skipinvalid |> mean |> sqrt
-
-# -------------------------------------------------------------------------
-# R-squared (coefficient of determination)
-
-struct RSquared <: Aggregated end
-
-metadata_measure(RSquared;
-                 instances = ["rsq", "rsquared"],
-                 target_scitype = InfiniteArrMissing,
-                 prediction_type = :deterministic,
-                 orientation = :score,
-                 supports_weights = false)
-
-const RSQ = RSquared
-@create_aliases RSquared
-
-@create_docs(RSquared,
-body=
-"""
-The R² (also known as R-squared or coefficient of determination) is suitable for
-interpreting linear regression analysis (Chicco et al., [2021](https://doi.org/10.7717/peerj-cs.623)).
-
-Let ``\\overline{y}`` denote the mean of ``y``, then
-
-``\\text{R^2} = 1 - \\frac{∑ (\\hat{y} - y)^2}{∑ (\\overline{y} - y)^2}.``
-""",
-scitype=DOC_INFINITE)
-
-function call(::RSquared, ŷ, y)
-    num = (ŷ .- y).^2 |> skipinvalid |> sum
-    mean_y = mean(y)
-    denom = (mean_y .- y).^2 |> skipinvalid |> sum
-    return 1 - (num / denom)
-end
-
-# -------------------------------------------------------------------
-# LP
-
-struct LPLoss{T<:Real} <: Unaggregated
-    p::T
-end
-
-LPLoss(; p=2.0) = LPLoss(p)
-
-metadata_measure(LPLoss;
-                 instances = ["l1", "l2"],
-                 target_scitype = InfiniteArrMissing,
-                 prediction_type = :deterministic,
-                 orientation = :loss)
-
-const l1 = LPLoss(1)
-const l2 = LPLoss(2)
-
-@create_docs(LPLoss,
-body=
-"""
-Constructor signature: `LPLoss(p=2)`. Reports
-`|ŷ[i] - y[i]|^p` for every index `i`.
-""",
-scitype=DOC_INFINITE)
-
-single(m::LPLoss, ŷ, y) = abs(y - ŷ)^(m.p)
-
-# ----------------------------------------------------------------------------
-# RootMeanSquaredLogError
-
-struct RootMeanSquaredLogError <: Aggregated end
-
-metadata_measure(RootMeanSquaredLogError;
-                 instances = ["rmsl", "rmsle", "root_mean_squared_log_error"],
-                 target_scitype = InfiniteArrMissing,
-                 prediction_type = :deterministic,
-                 orientation = :loss,
-                 aggregation = RootMeanSquare())
-
-const RMSL = RootMeanSquaredLogError
-@create_aliases RootMeanSquaredLogError
-
-@create_docs(RootMeanSquaredLogError,
-body=
-"""
-``\\text{root mean squared log error} =
-\\sqrt{n^{-1}∑ᵢ\\log\\left({yᵢ \\over ŷᵢ}\\right)^2}``
-""",
-footer="See also [`rmslp1`](@ref).",
-scitype=DOC_INFINITE)
-
-call(::RootMeanSquaredLogError, ŷ, y) =
-    (log.(y) - log.(ŷ)).^2 |> skipinvalid |> mean |> sqrt
-call(::RootMeanSquaredLogError, ŷ, y, w) =
-    (log.(y) - log.(ŷ)).^2 .* w |> skipinvalid |> mean |> sqrt
-
-# ---------------------------------------------------------------------------
-# RootMeanSquaredLogProportionalError
-
-struct RootMeanSquaredLogProportionalError{T<:Real} <: Aggregated
-    offset::T
-end
-
-RootMeanSquaredLogProportionalError(; offset=1.0) =
-    RootMeanSquaredLogProportionalError(offset)
-
-metadata_measure(RootMeanSquaredLogProportionalError;
-                 instances = ["rmslp1", ],
-                 target_scitype = InfiniteArrMissing,
-                 prediction_type = :deterministic,
-                 orientation = :loss,
-                 aggregation = RootMeanSquare())
-
-const RMSLP = RootMeanSquaredLogProportionalError
-@create_aliases RootMeanSquaredLogProportionalError
-
-@create_docs(RootMeanSquaredLogProportionalError,
-body=
-"""
-Constructor signature: `RootMeanSquaredLogProportionalError(; offset = 1.0)`.
-
-``\\text{root mean squared log proportional error} =
-\\sqrt{n^{-1}∑ᵢ\\log\\left({yᵢ + \\text{offset} \\over ŷᵢ + \\text{offset}}\\right)^2}``
-""",
-footer="See also [`rmsl`](@ref). 
", -scitype=DOC_INFINITE) - -call(m::RMSLP, ŷ, y) = - (log.(y .+ m.offset) - log.(ŷ .+ m.offset)).^2 |> - skipinvalid |> mean |> sqrt - -call(m::RMSLP, ŷ, y, w) = - (log.(y .+ m.offset) - log.(ŷ .+ m.offset)).^2 .* w |> - skipinvalid |> mean |> sqrt - -# -------------------------------------------------------------------------- -# RootMeanSquaredProportionalError - -struct RootMeanSquaredProportionalError{T<:Real} <: Aggregated - tol::T -end - -RootMeanSquaredProportionalError(; tol=eps()) = - RootMeanSquaredProportionalError(tol) - -metadata_measure(RootMeanSquaredProportionalError; - instances = ["rmsp", ], - target_scitype = InfiniteArrMissing, - prediction_type = :deterministic, - orientation = :loss, - aggregation = RootMeanSquare()) - -const RMSP = RootMeanSquaredProportionalError -@create_aliases RMSP - -@create_docs(RootMeanSquaredProportionalError, -body= -""" -Constructor keyword arguments: `tol` (default = `eps()`). - -``\\text{root mean squared proportional error} = -\\sqrt{m^{-1}∑ᵢ \\left({yᵢ-ŷᵢ \\over yᵢ}\\right)^2}`` - -where the sum is over indices such that `abs(yᵢ) > tol` and `m` is the number -of such indices. - -""", scitype=DOC_INFINITE) - -function call( - m::RootMeanSquaredProportionalError, - ŷ, - y, - w=nothing, - ) - ret = 0 - count = 0 - @inbounds for i in eachindex(y) - (isinvalid(y[i]) || isinvalid(ŷ[i])) && continue - ayi = abs(y[i]) - if ayi > m.tol - dev = ((y[i] - ŷ[i]) / ayi)^2 - ret += dev - ret = _scale(ret, w, i) - count += 1 - end - end - return sqrt(ret / count) -end - -# ----------------------------------------------------------------------- -# MeanAbsoluteProportionalError - -struct MeanAbsoluteProportionalError{T} <: Aggregated - tol::T -end - -MeanAbsoluteProportionalError(; tol=eps()) = MeanAbsoluteProportionalError(tol) - -metadata_measure(MeanAbsoluteProportionalError; - instances = ["mape", ], - target_scitype = InfiniteArrMissing, - prediction_type = :deterministic, - orientation = :loss) - -const MAPE = MeanAbsoluteProportionalError -@create_aliases MAPE - -@create_docs(MeanAbsoluteProportionalError, -body= -""" -Constructor key-word arguments: `tol` (default = `eps()`). - -``\\text{mean absolute proportional error} = m^{-1}∑ᵢ|{(yᵢ-ŷᵢ) \\over yᵢ}|`` - -where the sum is over indices such that `abs(yᵢ) > tol` and `m` is the number -of such indices. -""", scitype=DOC_INFINITE) - -function call( - m::MeanAbsoluteProportionalError, - ŷ, - y, - w=nothing, - ) - ret = 0 - count = 0 - @inbounds for i in eachindex(y) - (isinvalid(y[i]) || isinvalid(ŷ[i])) && continue - ayi = abs(y[i]) - if ayi > m.tol - #if y[i] != zero(eltype(y)) - dev = abs((y[i] - ŷ[i]) / ayi) - ret += dev - ret =_scale(ret, w, i) - count += 1 - end - end - return ret / count -end - -# ------------------------------------------------------------------------- -# LogCoshLoss - -struct LogCoshLoss <: Unaggregated end - -metadata_measure(LogCoshLoss; - instances = ["log_cosh", "log_cosh_loss"], - target_scitype = InfiniteArrMissing, - prediction_type = :deterministic, - orientation = :loss) - -const LogCosh = LogCoshLoss -@create_aliases LogCoshLoss - -@create_docs(LogCoshLoss, - body="Reports ``\\log(\\cosh(ŷᵢ-yᵢ))`` for each index `i`. ", - scitype=DOC_INFINITE) - -_softplus(x::T) where T<:Real = x > zero(T) ? 
x + log1p(exp(-x)) : log1p(exp(x))
-_log_cosh(x::T) where T<:Real = x + _softplus(-2x) - log(convert(T, 2))
-
-single(::LogCoshLoss, ŷ, y) = _log_cosh(ŷ - y)
diff --git a/src/measures/doc_strings.jl b/src/measures/doc_strings.jl
deleted file mode 100644
index 03ed76df..00000000
--- a/src/measures/doc_strings.jl
+++ /dev/null
@@ -1,12 +0,0 @@
-# the following creates doc-strings for the aliases (`instances`) of each measure:
-
-for m in measures()
-    name = m.name
-    for instance in m.instances
-        alias = Symbol(instance)
-        quote
-            @doc "An instance of type [`$($name)`](@ref). "*
-            "Query the [`$($name)`](@ref) doc-string for details. " $alias
-        end |> eval
-    end
-end
diff --git a/src/measures/finite.jl b/src/measures/finite.jl
deleted file mode 100644
index 908525ab..00000000
--- a/src/measures/finite.jl
+++ /dev/null
@@ -1,1247 +0,0 @@
-const FiniteArrMissing{N} = Union{
-    AbstractArray{<:Union{Missing,Multiclass{N}}},
-    AbstractArray{<:Union{Missing,OrderedFactor{N}}}}
-
-# ---------------------------------------------------
-# misclassification rate
-
-struct MisclassificationRate <: Aggregated end
-
-metadata_measure(MisclassificationRate;
-                 instances = ["misclassification_rate", "mcr"],
-                 target_scitype = FiniteArrMissing,
-                 prediction_type = :deterministic,
-                 orientation = :loss)
-
-const MCR = MisclassificationRate
-@create_aliases MCR
-
-@create_docs(MisclassificationRate,
-body=
-"""
-A confusion matrix can also be passed as an argument.
-$INVARIANT_LABEL
-""",
-scitype=DOC_FINITE)
-
-# calling behaviour:
-call(::MCR, ŷ, y) = (y .!= ŷ) |> mean
-call(::MCR, ŷ, y, w) = (y .!= ŷ) .* w |> mean
-(::MCR)(cm::ConfusionMatrixObject) = 1.0 - sum(diag(cm.mat)) / sum(cm.mat)
-
-# -------------------------------------------------------------
-# accuracy
-
-struct Accuracy <: Aggregated end
-
-metadata_measure(Accuracy;
-                 instances = ["accuracy",],
-                 target_scitype = FiniteArrMissing,
-                 prediction_type = :deterministic,
-                 orientation = :score)
-
-@create_aliases Accuracy
-
-@create_docs(Accuracy,
-body=
-"""
-Accuracy is the proportion of predictions `ŷ[i]` that match the
-ground truth `y[i]` observations. $INVARIANT_LABEL
-""",
-scitype=DOC_FINITE)
-
-# calling behaviour:
-call(::Accuracy, args...) = 1.0 - call(misclassification_rate, args...)
-(::Accuracy)(m::ConfusionMatrixObject) = sum(diag(m.mat)) / sum(m.mat)
-
-# -----------------------------------------------------------
-# balanced accuracy
-
-struct BalancedAccuracy <: Aggregated
-    adjusted::Bool
-end
-BalancedAccuracy(; adjusted=false) = BalancedAccuracy(adjusted)
-
-metadata_measure(BalancedAccuracy;
-                 instances = ["balanced_accuracy", "bacc", "bac"],
-                 target_scitype = FiniteArrMissing,
-                 prediction_type = :deterministic,
-                 orientation = :score)
-
-const BACC = BalancedAccuracy
-@create_aliases BACC
-
-@create_docs(BalancedAccuracy,
-body=
-"""
-Balanced accuracy compensates standard [`Accuracy`](@ref) for class imbalance.
-See [https://en.wikipedia.org/wiki/Precision_and_recall#Imbalanced_data](https://en.wikipedia.org/wiki/Precision_and_recall#Imbalanced_data).
-
-Setting `adjusted=true` rescales the score in the way prescribed in
-[L. Mosley (2013): A balanced approach to the multi-class imbalance
-problem. PhD thesis, Iowa State
-University](https://lib.dr.iastate.edu/etd/13537/). In the binary
-case, the adjusted balanced accuracy is also known as *Youden’s J
-statistic*, or *informedness*.
- -$INVARIANT_LABEL -""", -scitype=DOC_FINITE) - -function call(m::BACC, ŷm, ym, wm=nothing) - - ŷ, y, w = _skipinvalid(ŷm, ym, wm) - - if w === nothing - n_given_class = StatsBase.countmap(y) - freq(i) = @inbounds n_given_class[y[i]] - ŵ = 1 ./ freq.(eachindex(y)) - else # following sklearn, which is non-linear - ŵ = similar(w) - @inbounds for i in eachindex(w) - ŵ[i] = w[i] / sum(w .* (y .== y[i])) - end - end - s = sum(ŵ) - score = sum((ŷ .== y) .* ŵ) / sum(ŵ) - if m.adjusted - n_classes = length(levels(y)) - chance = 1 / n_classes - score -= chance - score /= 1 - chance - end - return score -end - -# --------------------------------------------------- -# kappa - -struct Kappa <: Aggregated end - -metadata_measure(Kappa; - instances = ["kappa"], - target_scitype = FiniteArrMissing, - prediction_type = :deterministic, - orientation = :score, - supports_weights = false) - -@create_aliases Kappa - -@create_docs(Kappa, -body= -""" -A metric to measure agreement between predicted labels and the ground truth. -See [https://en.wikipedia.org/wiki/Cohen%27s_kappa](https://en.wikipedia.org/wiki/Cohen%27s_kappa) - -$INVARIANT_LABEL -""", -scitype=DOC_FINITE) - -# calling behaviour: -function (::Kappa)(cm::ConfusionMatrixObject{C}) where C - # relative observed agreement - same as accuracy - p₀ = sum(diag(cm.mat))/sum(cm.mat) - - # probability of agreement due to chance - for each class cᵢ, this - # would be: (#predicted=cᵢ)/(#instances) x (#observed=cᵢ)/(#instances) - rows_sum = sum!(similar(cm.mat, 1, C), cm.mat) # 1 x C matrix - cols_sum = sum!(similar(cm.mat, C, 1), cm.mat) # C X 1 matrix - pₑ = first(rows_sum*cols_sum)/sum(rows_sum)^2 - - # Kappa calculation - κ = (p₀ - pₑ)/(1 - pₑ) - - return κ -end - -call(k::Kappa, ŷ, y) = _confmat(ŷ, y, warn=false) |> k - - -# ================================================================== -## DETERMINISTIC BINARY PREDICTIONS - ORDER-INDEPENDENT - -# ------------------------------------------------------------------ -# Matthew's correlation - -struct MatthewsCorrelation <: Aggregated end - -metadata_measure(MatthewsCorrelation; - instances = ["matthews_correlation", "mcc"], - target_scitype = FiniteArrMissing{2}, - prediction_type = :deterministic, - orientation = :score, - supports_weights = false) -const MCC = MatthewsCorrelation -@create_aliases MCC - -@create_docs(MatthewsCorrelation, -body= -""" -[https://en.wikipedia.org/wiki/Matthews_correlation_coefficient](https://en.wikipedia.org/wiki/Matthews_correlation_coefficient) -$INVARIANT_LABEL -""", -scitype=DOC_FINITE_BINARY) - -# calling behaviour: -function (::MCC)(cm::ConfusionMatrixObject{C}) where C - # http://rk.kvl.dk/introduction/index.html - # NOTE: this is O(C^3), there may be a clever way to - # speed this up though in general this is only used for low C - num = 0 - @inbounds for k in 1:C, l in 1:C, m in 1:C - num += cm[k,k] * cm[l,m] - cm[k,l] * cm[m,k] - end - den1 = 0 - den2 = 0 - @inbounds for k in 1:C - a = sum(cm[k, :]) - b = sum(cm[setdiff(1:C, k), :]) - den1 += a * b - a = sum(cm[:, k]) - b = sum(cm[:, setdiff(1:C, k)]) - den2 += a * b - end - mcc = num / sqrt(float(den1) * float(den2)) - - isnan(mcc) && return 0 - return mcc -end - -call(m::MCC, ŷ, y) = _confmat(ŷ, y, warn=false) |> m - - -# ========================================================================== -# DETERMINISTIC BINARY PREDICTIONS - ORDER DEPENDENT - -const CM2 = ConfusionMatrixObject{2} - -# -------------------------------------------------------------------------- -# FScore - -struct FScore{T<:Real} <: 
Aggregated
-    β::T
-    rev::Union{Nothing,Bool}
-end
-
-FScore(; β=1.0, rev=nothing) = FScore(β, rev)
-
-metadata_measure(FScore;
-                 human_name = "F-Score",
-                 instances = ["f1score",],
-                 target_scitype = FiniteArrMissing{2},
-                 prediction_type = :deterministic,
-                 orientation = :score,
-                 supports_weights = false)
-
-@create_aliases FScore
-
-@create_docs(FScore,
-body=
-"""
-This is the one-parameter generalization, ``F_β``, of the F-measure or
-balanced F-score.
-
-[https://en.wikipedia.org/wiki/F1_score](https://en.wikipedia.org/wiki/F1_score)
-
-Constructor signature: `FScore(; β=1.0, rev=nothing)`.
-
-By default, the second element of `levels(y)` is designated as
-`true`. To reverse roles, specify `rev=true`.
-""",
-scitype=DOC_ORDERED_FACTOR_BINARY,
-footer="Constructor signature: `FScore(; β=1.0, rev=nothing)`. ")
-
-# calling on conf matrix:
-function (score::FScore)(m::CM2)
-    β = score.β
-    β2 = β^2
-    tp = _tp(m)
-    fn = _fn(m)
-    fp = _fp(m)
-    return (1 + β2) * tp / ((1 + β2)*tp + β2*fn + fp)
-end
-
-# calling on arrays:
-call(m::FScore, ŷ, y) = _confmat(ŷ, y; rev=m.rev) |> m
-
-# -------------------------------------------------------------------------
-# TruePositive and its cousins - struct and metadata declarations
-
-const TRUE_POSITIVE_AND_COUSINS =
-    (:TruePositive, :TrueNegative, :FalsePositive, :FalseNegative,
-     :TruePositiveRate, :TrueNegativeRate, :FalsePositiveRate,
-     :FalseNegativeRate, :FalseDiscoveryRate, :Precision,
-     :NegativePredictiveValue)
-
-for M in TRUE_POSITIVE_AND_COUSINS
-    ex = quote
-        struct $M <: Aggregated rev::Union{Nothing,Bool} end
-        $M(; rev=nothing) = $M(rev)
-    end
-    eval(ex)
-end
-
-metadata_measure.((FalsePositive, FalseNegative);
-                  target_scitype = FiniteArrMissing{2},
-                  prediction_type = :deterministic,
-                  orientation = :loss,
-                  aggregation = Sum(),
-                  supports_weights = false)
-
-metadata_measure.((FalsePositiveRate, FalseNegativeRate, FalseDiscoveryRate);
-                  target_scitype = FiniteArrMissing{2},
-                  prediction_type = :deterministic,
-                  orientation = :loss,
-                  supports_weights = false)
-
-metadata_measure.((TruePositive, TrueNegative);
-                  target_scitype = FiniteArrMissing{2},
-                  prediction_type = :deterministic,
-                  orientation = :score,
-                  aggregation = Sum(),
-                  supports_weights = false)
-
-metadata_measure.((TruePositiveRate, TrueNegativeRate, Precision,
-                   NegativePredictiveValue);
-                  target_scitype = FiniteArrMissing{2},
-                  prediction_type = :deterministic,
-                  orientation = :score,
-                  supports_weights = false)
-
-# adjustments:
-instances(::Type{<:TruePositive}) = ["true_positive", "truepositive"]
-human_name(::Type{<:TruePositive}) = "number of true positives"
-
-instances(::Type{<:TrueNegative}) = ["true_negative", "truenegative"]
-human_name(::Type{<:TrueNegative}) = "number of true negatives"
-
-instances(::Type{<:FalsePositive}) = ["false_positive", "falsepositive"]
-human_name(::Type{<:FalsePositive}) = "number of false positives"
-
-instances(::Type{<:FalseNegative}) = ["false_negative", "falsenegative"]
-human_name(::Type{<:FalseNegative}) = "number of false negatives"
-
-instances(::Type{<:TruePositiveRate}) =
-    ["true_positive_rate", "truepositive_rate",
-     "tpr", "sensitivity", "recall", "hit_rate"]
-human_name(::Type{<:TruePositiveRate}) =
-    "true positive rate (a.k.a. recall)"
-
-instances(::Type{<:TrueNegativeRate}) =
-    ["true_negative_rate", "truenegative_rate", "tnr",
-     "specificity", "selectivity"]
-
-instances(::Type{<:FalsePositiveRate}) =
-    ["false_positive_rate", "falsepositive_rate",
-     "fpr", "fallout"]
-
-instances(::Type{<:FalseNegativeRate}) =
-    ["false_negative_rate", "falsenegative_rate", "fnr", "miss_rate"]
-
-instances(::Type{<:FalseDiscoveryRate}) =
-    ["false_discovery_rate", "falsediscovery_rate", "fdr"]
-
-instances(::Type{<:NegativePredictiveValue}) =
-    ["negative_predictive_value", "negativepredictive_value", "npv"]
-
-instances(::Type{<:Precision}) =
-    ["positive_predictive_value", "ppv", "positivepredictive_value", "precision"]
-human_name(::Type{<:Precision}) =
-    "precision (a.k.a. positive predictive value)"
-
-
-# ---------------------------------------------------------------------
-# TruePositive and its cousins - doc-string building and alias creation
-
-for M in TRUE_POSITIVE_AND_COUSINS
-    eval(quote
-        $M == Precision || @create_aliases $M # precision handled separately
-
-        @create_docs($M,
-        body=
-        """
-        Assigns `false` to the first element of `levels(y)`. To reverse roles,
-        use `$(name($M))(rev=true)`.
-        """,
-        scitype=DOC_ORDERED_FACTOR_BINARY)
-    end)
-end
-
-# type aliases:
-const TNR = TrueNegativeRate
-const Specificity = TrueNegativeRate
-const TPR = TruePositiveRate
-const Recall = TPR
-const FPR = FalsePositiveRate
-const FNR = FalseNegativeRate
-const FDR = FalseDiscoveryRate
-const NPV = NegativePredictiveValue
-const PPV = Precision
-
-# special case of precision; cannot generate aliases automatically due
-# to conflict with Base.precision:
-const positive_predictive_value = Precision()
-const ppv = Precision()
-const positivepredictive_value = Precision()
-
-# ----------------------------------------------------------------------
-# TruePositive and its cousins - helper functions for confusion matrices
-
-_tp(m::CM2) = m[2,2]
-_tn(m::CM2) = m[1,1]
-_fp(m::CM2) = m[2,1]
-_fn(m::CM2) = m[1,2]
-
-_tpr(m::CM2) = _tp(m) / (_tp(m) + _fn(m))
-_tnr(m::CM2) = _tn(m) / (_tn(m) + _fp(m))
-_fpr(m::CM2) = 1 - _tnr(m)
-_fnr(m::CM2) = 1 - _tpr(m)
-
-_fdr(m::CM2) = _fp(m) / (_tp(m) + _fp(m))
-_npv(m::CM2) = _tn(m) / (_tn(m) + _fn(m))
-
-# ----------------------------------------------------------------------
-# TruePositive and its cousins - calling behaviour
-
-# NOTE: here we assume the CM was constructed a priori with the
-# proper ordering so the field `rev` in the measure is ignored
-
-# on confusion matrices:
-(::TruePositive)(m::CM2) = _tp(m)
-(::TrueNegative)(m::CM2) = _tn(m)
-(::FalsePositive)(m::CM2) = _fp(m)
-(::FalseNegative)(m::CM2) = _fn(m)
-(::TPR)(m::CM2) = _tpr(m)
-(::TNR)(m::CM2) = _tnr(m)
-(::FPR)(m::CM2) = _fpr(m)
-(::FNR)(m::CM2) = _fnr(m)
-(::FDR)(m::CM2) = _fdr(m)
-(::NPV)(m::CM2) = _npv(m)
-(::Precision)(m::CM2) = 1.0 - _fdr(m)
-
-# on arrays (ŷ, y):
-for M_ex in TRUE_POSITIVE_AND_COUSINS
-    @eval call(m::$M_ex, ŷ, y) = _confmat(ŷ, y; rev=m.rev) |> m
-end
-
-# since Base.precision exists (as single argument function) we
-# manually overload Base.precision:
-Base.precision(m::CM2) = m |> Precision()
-function Base.precision(ŷ, y)
-    _check(Precision(), ŷ, y)
-    call(Precision(), ŷ, y)
-end
-
-
-# =================================================================
-# MULTICLASS AND ORDER INDEPENDENT
-
-const CM = ConfusionMatrixObject{N} where N
-
-abstract type MulticlassAvg end
-struct MacroAvg <: MulticlassAvg end
-struct MicroAvg <: MulticlassAvg end
-struct NoAvg <: MulticlassAvg end
-
-const macro_avg = MacroAvg()
-const micro_avg = MicroAvg()
-const no_avg = NoAvg()
-
-const DS_AVG_RET = "Options for `average` are: `no_avg`, `macro_avg` "*
-                   "(default) and `micro_avg`. 
Options for `return_type`, "* - "applying in the `no_avg` case, are: `LittleDict` (default) or "* - "`Vector`. " - -const DS_RET = "Options for `return_type` are: "* - "`LittleDict`(default) or "* - "`Vector`. " - -const CLASS_W = "An optional `AbstractDict`, denoted `class_w` above, "* - "keyed on `levels(y)`, specifies class weights. It applies if "* - "`average=macro_avg` or `average=no_avg`." - -""" - MulticlassFScore(; β=1.0, average=macro_avg, return_type=LittleDict) - -One-parameter generalization, ``F_β``, of the F-measure or balanced F-score for -multiclass observations. - - MulticlassFScore()(ŷ, y) - MulticlassFScore()(ŷ, y, class_w) - -Evaluate the default score on multiclass observations, `ŷ`, given -ground truth values, `y`. $DS_AVG_RET $CLASS_W - -For more information, run `info(MulticlassFScore)`. - -""" -struct MulticlassFScore{T<:Real, - M<:MulticlassAvg, - U<:Union{Vector, LittleDict}} <:Aggregated - β::T - average::M - return_type::Type{U} -end - -MulticlassFScore(; β=1.0, average=macro_avg, return_type=LittleDict) = - MulticlassFScore(β, average, return_type) - -metadata_measure(MulticlassFScore; - instances = ["macro_f1score", "micro_f1score", - "multiclass_f1score"], - target_scitype = FiniteArrMissing, - prediction_type = :deterministic, - orientation = :score, - supports_weights = false, - supports_class_weights = true) - -MLJModelInterface.docstring(::Type{<:MulticlassFScore}) = - "Multiclass F_β score; aliases: " * - "`macro_f1score=MulticlassFScore()`, "* - "`multiclass_f1score=MulticlassFScore()` " * - "`micro_f1score=MulticlassFScore(average=micro_avg)`." - -const micro_f1score = MulticlassFScore(average=micro_avg) -const macro_f1score = MulticlassFScore(average=macro_avg) -const multiclass_f1score = MulticlassFScore(average=macro_avg) - -for M in (:MulticlassTruePositive, :MulticlassTrueNegative, - :MulticlassFalsePositive, :MulticlassFalseNegative) - ex = quote - struct $M{U<:Union{Vector, LittleDict}} <: Aggregated - return_type::Type{U} - end -# $M(return_type::Type{U}) where {U} = $M(return_type) - $M(; return_type=LittleDict) = $M(return_type) - end - eval(ex) -end - -const _mtp_vec = MulticlassTruePositive(return_type=Vector) -const _mfn_vec = MulticlassFalseNegative(return_type=Vector) -const _mfp_vec = MulticlassFalsePositive(return_type=Vector) -const _mtn_vec = MulticlassTrueNegative(return_type=Vector) - -for M in (:MulticlassTruePositiveRate, :MulticlassTrueNegativeRate, - :MulticlassFalsePositiveRate, :MulticlassFalseNegativeRate, - :MulticlassFalseDiscoveryRate, :MulticlassPrecision, - :MulticlassNegativePredictiveValue) - ex = quote - struct $M{T<:MulticlassAvg, U<:Union{Vector, LittleDict}} <: Aggregated - average::T - return_type::Type{U} - end - $M(; average=macro_avg, return_type=LittleDict) = $M(average, return_type) - end - eval(ex) -end - -metadata_measure.((MulticlassFalsePositive, MulticlassFalseNegative); - target_scitype = FiniteArrMissing, - prediction_type = :deterministic, - orientation = :loss, - aggregation = Sum(), - is_feature_dependent = false, - supports_weights = false, - supports_class_weights = false) - -metadata_measure.((MulticlassFalsePositiveRate, MulticlassFalseNegativeRate, - MulticlassFalseDiscoveryRate); - target_scitype = FiniteArrMissing, - prediction_type = :deterministic, - orientation = :loss, - is_feature_dependent = false, - supports_weights = false, - supports_class_weights = true) - -metadata_measure.((MulticlassTruePositive, MulticlassTrueNegative); - target_scitype = FiniteArrMissing, - prediction_type = 
:deterministic, - orientation = :score, - aggregation = Sum(), - is_feature_dependent = false, - supports_weights = false, - supports_class_weights = false) - -metadata_measure.((MulticlassTrueNegativeRate, MulticlassNegativePredictiveValue); - target_scitype = FiniteArrMissing, - prediction_type = :deterministic, - orientation = :score, - is_feature_dependent = false, - supports_weights = false, - supports_class_weights = true) - -metadata_measure.((MulticlassTruePositiveRate, MulticlassPrecision); - target_scitype = FiniteArrMissing, - prediction_type = :deterministic, - orientation = :score, - is_feature_dependent = false, - supports_weights = false, - supports_class_weights = true) - -MMI.docstring(::Type{<:MulticlassTruePositive}) = - "Number of true positives; " * - "aliases: `multiclass_true_positive`, `multiclass_truepositive`." -instances(::Type{<:MulticlassTruePositive}) = - ["multiclass_true_positive", "multiclass_truepositive"] -MMI.docstring(::Type{<:MulticlassTrueNegative}) = - "Number of true negatives; " * - "aliases: `multiclass_true_negative`, `multiclass_truenegative`." -instances(::Type{<:MulticlassTrueNegative}) = - ["multiclass_true_negative", "multiclass_truenegative"] -MMI.docstring(::Type{<:MulticlassFalsePositive}) = - "Number of false positives; " * - "aliases: `multiclass_false_positive`, `multiclass_falsepositive`." -instances(::Type{<:MulticlassFalsePositive}) = - ["multiclass_false_positive", "multiclass_falsepositive"] -MMI.docstring(::Type{<:MulticlassFalseNegative}) = - "Number of false negatives; " * - "aliases: `multiclass_false_negative`, `multiclass_falsenegative`." -instances(::Type{<:MulticlassFalseNegative}) = - ["multiclass_false_negative", "multiclass_falsenegative"] - -MMI.docstring(::Type{<:MulticlassTruePositiveRate}) = - "multiclass true positive rate; aliases: " * - "`multiclass_true_positive_rate`, `multiclass_tpr`, " * - "`multiclass_sensitivity`, `multiclass_recall`, " * - "`multiclass_hit_rate`, `multiclass_truepositive_rate`, " -instances(::Type{<:MulticlassTruePositiveRate}) = - ["multiclass_true_positive_rate", "multiclass_tpr", - "multiclass_sensitivity", "multiclass_recall", - "multiclass_hit_rate", "multiclass_truepositive_rate"] -MMI.docstring(::Type{<:MulticlassTrueNegativeRate}) = - "multiclass true negative rate; aliases: " * - "`multiclass_true_negative_rate`, `multiclass_tnr` " * - " `multiclass_specificity`, `multiclass_selectivity`, " * - "`multiclass_truenegative_rate`." -instances(::Type{<:MulticlassTrueNegativeRate}) = - ["multiclass_true_negative_rate", "multiclass_tnr", - "multiclass_specificity", "multiclass_selectivity", - "multiclass_truenegative_rate"] -MMI.docstring(::Type{<:MulticlassFalsePositiveRate}) = - "multiclass false positive rate; aliases: " * - "`multiclass_false_positive_rate`, `multiclass_fpr` " * - "`multiclass_fallout`, `multiclass_falsepositive_rate`." -instances(::Type{<:MulticlassFalsePositiveRate}) = - ["multiclass_false_positive_rate", "multiclass_fpr", - "multiclass_fallout", "multiclass_falsepositive_rate"] -MMI.docstring(::Type{<:MulticlassFalseNegativeRate}) = - "multiclass false negative rate; aliases: " * - "`multiclass_false_negative_rate`, `multiclass_fnr`, " * - "`multiclass_miss_rate`, `multiclass_falsenegative_rate`." 
-instances(::Type{<:MulticlassFalseNegativeRate}) = - ["multiclass_false_negative_rate", "multiclass_fnr", - "multiclass_miss_rate", "multiclass_falsenegative_rate"] -MMI.docstring(::Type{<:MulticlassFalseDiscoveryRate}) = - "multiclass false discovery rate; "* - "aliases: `multiclass_false_discovery_rate`, " * - "`multiclass_falsediscovery_rate`, `multiclass_fdr`." -instances(::Type{<:MulticlassFalseDiscoveryRate}) = - ["multiclass_falsediscovery_rate", "multiclass_fdr", - "multiclass_false_discovery_rate"] -MMI.docstring(::Type{<:MulticlassNegativePredictiveValue}) = - "multiclass negative predictive value; aliases: " * - "`multiclass_negative_predictive_value`, " * - "`multiclass_negativepredictive_value`, `multiclass_npv`." -instances(::Type{<:MulticlassNegativePredictiveValue}) = - ["multiclass_negative_predictive_value", - "multiclass_negativepredictive_value", "multiclass_npv"] -MMI.docstring(::Type{<:MulticlassPrecision}) = - "multiclass positive predictive value (aka precision);"* - " aliases: `multiclass_positive_predictive_value`, `multiclass_ppv`, " * - "`multiclass_positivepredictive_value`, " * - "`multiclass_precision`." -instances(::Type{<:MulticlassPrecision}) = - ["multiclass_positive_predictive_value", "multiclass_ppv", - "multiclass_positivepredictive_value", "multiclass_precision"] - -const W_KEY_MISMATCH = "Encountered target with levels different from the " * - "keys of user-specified dictionary of class weights." -const W_PROMOTE_WARN = "Using macro averaging instead of micro averaging, as "* - "class weights specified. " - - -# ---------------------------------------------------- -# MulticlassTruePositive - -""" - MulticlassTruePositive(; return_type=LittleDict) - -$(docstring(MulticlassTruePositive())) - - MulticlassTruePositive()(ŷ, y) - -Number of true positives for multiclass observations `ŷ` and ground -truth `y`, using default return type. $DS_RET - -For more information, run `info(MulticlassTruePositive)`. - -""" -function MulticlassTruePositive end -const multiclass_true_positive = MulticlassTruePositive() -const multiclass_truepositive = MulticlassTruePositive() -const mtp = MulticlassTruePositive() - - -# ---------------------------------------------------- -# MulticlassTrueNegative - -""" - MulticlassTrueNegative(; return_type=LittleDict) - -$(docstring(MulticlassTrueNegative())) - - MulticlassTrueNegative()(ŷ, y) - -Number of true negatives for multiclass observations `ŷ` and ground truth -`y`, using default return type. $DS_RET - -For more information, run `info(MulticlassTrueNegative)`. - -""" -function MulticlassTrueNegative end -const multiclass_true_negative = MulticlassTrueNegative() -const multiclass_truenegative = MulticlassTrueNegative() -const mtn = MulticlassTrueNegative() - - -# ---------------------------------------------------- -# MulticlassFalsePositive - -""" - MulticlassFalsePositive(; return_type=LittleDict) - -$(docstring(MulticlassFalsePositive())) - - MulticlassFalsePositive()(ŷ, y) - -Number of false positives for multiclass observations `ŷ` and ground -truth `y`, using default return type. $DS_RET - -For more information, run `info(MulticlassFalsePositive)`. 
- -""" -function MulticlassPositive end -const multiclass_false_positive = MulticlassFalsePositive() -const multiclass_falsepositive = MulticlassFalsePositive() -const mfp = MulticlassFalsePositive() - - -# ---------------------------------------------------- -# MulticlassFalseNegative - -""" - MulticlassFalseNegative(; return_type=LittleDict) - -$(docstring(MulticlassFalseNegative())) - - MulticlassFalseNegative()(ŷ, y) - -Number of false negatives for multiclass observations `ŷ` and ground -truth `y`, using default return type. $DS_RET - -For more information, run `info(MulticlassFalseNegative)`. - -""" -function MulticlassNegative end -const multiclass_false_negative = MulticlassFalseNegative() -const multiclass_falsenegative = MulticlassFalseNegative() -const mfn = MulticlassFalseNegative() - - -# ---------------------------------------------------- -# MulticlassTruePositiveRate - -""" - MulticlassTruePositiveRate(; average=macro_avg, return_type=LittleDict) - -$(docstring(MulticlassTruePositiveRate())) - - MulticlassTruePositiveRate(ŷ, y) - MulticlassTruePositiveRate(ŷ, y, class_w) - -True positive rate (a.k.a. sensitivity, recall, hit rate) for -multiclass observations `ŷ` and ground truth `y`, using default -averaging and return type. $DS_AVG_RET $CLASS_W - -For more information, run `info(MulticlassTruePositiveRate)`. - -""" -function MulticlassTruePositiveRate end -const multiclass_true_positive_rate = MulticlassTruePositiveRate() -const multiclass_truepositive_rate = MulticlassTruePositiveRate() -const multiclass_tpr = MulticlassTruePositiveRate() -const multiclass_sensitivity = MulticlassTruePositiveRate() -const multiclass_hit_rate = MulticlassTruePositiveRate() -const MTPR = MulticlassTruePositiveRate -const multiclass_recall = MulticlassTruePositiveRate() -const MulticlassRecall = MulticlassTruePositiveRate - - -# ---------------------------------------------------- -# MulticlassTrueNegativeRate - -""" - MulticlassTrueNegativeRate(; average=macro_avg, return_type=LittleDict) - -$(docstring(MulticlassTrueNegativeRate())) - - MulticlassTrueNegativeRate()(ŷ, y) - MulticlassTrueNegativeRate()(ŷ, y, class_w) - -True negative rate for multiclass observations `ŷ` and ground truth -`y`, using default averaging and return type. $DS_AVG_RET $CLASS_W - -For more information, run `info(MulticlassTrueNegativeRate)`. - -""" -function MulticlassTrueNegativeRate end -const multiclass_true_negative_rate = MulticlassTrueNegativeRate() -const multiclass_truenegative_rate = MulticlassTrueNegativeRate() -const multiclass_tnr = MulticlassTrueNegativeRate() -const multiclass_specificity = MulticlassTrueNegativeRate() -const multiclass_selectivity = MulticlassTrueNegativeRate() -const MulticlassSpecificity = MulticlassTrueNegativeRate -const MTNR = MulticlassTrueNegativeRate - - -# ---------------------------------------------------- -# MulticlassFalsePositiveRate - -""" - MulticlassFalsePositiveRate(; average=macro_avg, return_type=LittleDict) - -$(docstring(MulticlassFalsePositiveRate())) - - MulticlassFalsePositiveRate()(ŷ, y) - MulticlassFalsePositiveRate()(ŷ, y, class_w) - -False positive rate for multiclass observations `ŷ` and ground truth -`y`, using default averaging and return type. $DS_AVG_RET $CLASS_W - -For more information, run `info(MulticlassFalsePositiveRate)`. 
- -""" -function MulticlassFalsePositiveRate end -const multiclass_false_positive_rate = MulticlassFalsePositiveRate() -const multiclass_falsepositive_rate = MulticlassFalsePositiveRate() -const multiclass_fpr = MulticlassFalsePositiveRate() -const MFPR = MulticlassFalsePositiveRate -const multiclass_fallout = MFPR() - - -# ---------------------------------------------------- -# MulticlassFalseNegativeRate - -""" - MulticlassFalseNegativeRate(; average=macro_avg, return_type=LittleDict) - -$(docstring(MulticlassFalseNegativeRate())) - - MulticlassFalseNegativeRate()(ŷ, y) - MulticlassFalseNegativeRate()(ŷ, y, class_w) - -False negative rate for multiclass observations `ŷ` and ground truth -`y`, using default averaging and return type. $DS_AVG_RET $CLASS_W - -For more information, run `info(MulticlassFalseNegativeRate)`. - -""" -function MulticlassFalseNegativeRate end -const multiclass_false_negative_rate = MulticlassFalseNegativeRate() -const multiclass_falsenegative_rate = MulticlassFalseNegativeRate() -const multiclass_fnr = MulticlassFalseNegativeRate() -const MFNR = MulticlassFalseNegativeRate -const multiclass_miss_rate = MFNR() - - -# ---------------------------------------------------- -# MulticlassFalseDiscoveryRate - -""" - MulticlassFalseDiscoveryRate(; average=macro_avg, return_type=LittleDict) - -$(docstring(MulticlassFalseDiscoveryRate())) - - MulticlassFalseDiscoveryRate()(ŷ, y) - MulticlassFalseDiscoveryRate()(ŷ, y, class_w) - -False discovery rate for multiclass observations `ŷ` and ground truth -`y`, using default averaging and return type. $DS_AVG_RET $CLASS_W - -For more information, run `info(MulticlassFalseDiscoveryRate)`. - -""" -function MulticlassFalseDiscoveryRate end -const multiclass_false_discovery_rate = MulticlassFalseDiscoveryRate() -const multiclass_falsediscovery_rate = MulticlassFalseDiscoveryRate() -const multiclass_fdr = MulticlassFalseDiscoveryRate() -const MFDR = MulticlassFalseDiscoveryRate - - -# ---------------------------------------------------- -# MulticlassPrecision - -""" - MulticlassPrecision(; average=macro_avg, return_type=LittleDict) - -$(docstring(MulticlassPrecision())) - - MulticlassPrecision()(ŷ, y) - MulticlassPrecision()(ŷ, y, class_w) - -Precision for multiclass observations `ŷ` and ground truth `y`, using -default averaging and return type. $DS_AVG_RET $CLASS_W - -For more information, run `info(MulticlassPrecision)`. - -""" -function MulticlassPrecision end -const multiclass_precision = MulticlassPrecision() -const multiclass_ppv = MulticlassPrecision() -const multiclass_positive_predictive_value = MulticlassPrecision() -const multiclass_positivepredictive_value = MulticlassPrecision() -const MPPV = MulticlassPrecision - - -# ---------------------------------------------------- -# MulticlassNegativePredictiveValue - -""" - MulticlassNegativePredictiveValue(; average=macro_avg, return_type=LittleDict) - -$(docstring(MulticlassNegativePredictiveValue())) - - MulticlassNegativePredictiveValue()(ŷ, y) - MulticlassNegativePredictiveValue()(ŷ, y, class_w) - -Negative predictive value for multiclass observations `ŷ` and ground truth -`y`, using default averaging and return type. $DS_AVG_RET $CLASS_W - -For more information, run `info(MulticlassNegativePredictiveValue)`. 
- -""" -function MulticlassNegativePredictiveValue end -const multiclass_npv = MulticlassNegativePredictiveValue() -const multiclass_negative_predictive_value = MulticlassNegativePredictiveValue() -const multiclass_negativepredictive_value = MulticlassNegativePredictiveValue() -const MNPV = MulticlassNegativePredictiveValue - - -# ----------------------------------------------------- -## INTERNAL FUNCTIONS ON MULTICLASS CONFUSION MATRIX - -_mtp(m::CM, return_type::Type{Vector}) = diag(m.mat) -_mtp(m::CM, return_type::Type{LittleDict}) = - LittleDict(m.labels, diag(m.mat)) - -_mfp(m::CM, return_type::Type{Vector}) = - (col_sum = vec(sum(m.mat, dims=2)); col_sum .-= diag(m.mat)) - -_mfp(m::CM, return_type::Type{LittleDict}) = - (col_sum = vec(sum(m.mat, dims=2)); col_sum .-= diag(m.mat); - LittleDict(m.labels, col_sum)) - -_mfn(m::CM, return_type::Type{Vector}) = - (row_sum = vec(collect(transpose(sum(m.mat, dims=1)))); - row_sum .-= diag(m.mat)) - -_mfn(m::CM, return_type::Type{LittleDict}) = - (row_sum = vec(collect(transpose(sum(m.mat, dims=1)))); - row_sum .-= diag(m.mat); LittleDict(m.labels, row_sum)) - -function _mtn(m::CM, return_type::Type{Vector}) - _sum = sum(m.mat, dims=2) - _sum .= sum(m.mat) .- (_sum .+= sum(m.mat, dims=1)'.- diag(m.mat)) - return vec(_sum) -end - -function _mtn(m::CM, return_type::Type{LittleDict}) - _sum = sum(m.mat, dims=2) - _sum .= sum(m.mat) .- (_sum .+= sum(m.mat, dims=1)'.- diag(m.mat)) - return LittleDict(m.labels, vec(_sum)) -end - -@inline _mean(x::Arr{<:Real}) = mean(skipnan(x)) # defined in src/data/data.jl - -@inline function _class_w(level_m::Arr{<:String}, - class_w::AbstractDict{<:Any, <:Real}) - class_w_labels = levels(keys(class_w)) - string.(class_w_labels) == level_m || throw(ArgumentError(W_KEY_MISMATCH)) - return [class_w[l] for l in class_w_labels] -end - -@inline function _mc_helper(m::CM, a::Arr{<:Real}, b::Arr{<:Real}, - average::NoAvg, return_type::Type{Vector}) - return vec(a ./ (a + b)) -end - -@inline function _mc_helper(m::CM, a::Arr{<:Real}, b::Arr{<:Real}, - average::NoAvg, return_type::Type{LittleDict}) - return LittleDict(m.labels, _mc_helper(m, a, b, average, Vector)) -end - -@inline function _mc_helper(m::CM, a::Arr{<:Real}, b::Arr{<:Real}, - average::MacroAvg, return_type) - return _mean(_mc_helper(m, a, b, no_avg, Vector)) -end - -@inline function _mc_helper(m::CM, a::Arr{<:Real}, b::Arr{<:Real}, - average::MicroAvg, return_type) - a_sum, b_sum = sum(a), sum(b) - return a_sum / (a_sum + b_sum) -end - -@inline function _mc_helper(m::CM, a::Arr{<:Real}, b::Arr{<:Real}, - class_w::AbstractDict{<:Any, <:Real}, - average::NoAvg, return_type::Type{Vector}) - level_w = _class_w(m.labels, class_w) - return _mc_helper(m, a, b, no_avg, return_type) .* level_w -end - -@inline function _mc_helper(m::CM, a::Arr{<:Real}, b::Arr{<:Real}, - class_w::AbstractDict{<:Any, <:Real}, - average::MacroAvg, return_type::Type{Vector}) - return _mean(_mc_helper(m, a, b, class_w, no_avg, return_type)) -end - -@inline function _mc_helper(m::CM, a::Arr{<:Real}, b::Arr{<:Real}, - class_w::AbstractDict{<:Any, <:Real}, - average::MicroAvg, return_type) - @warn W_PROMOTE_WARN - return _mc_helper(m, a, b, class_w, macro_avg, Vector) -end - -@inline function _mc_helper_b(m::CM, helper_name, - class_w::AbstractDict{<:Any, <:Real}, - average::NoAvg, return_type::Type{Vector}) - level_w = _class_w(m.labels, class_w) - return (1 .- helper_name(m, no_avg, return_type)) .* level_w -end - -@inline function _mc_helper_b(m::CM, helper_name, - 
class_w::AbstractDict{<:Any, <:Real}, - average::NoAvg, return_type::Type{LittleDict}) - level_w = _class_w(m.labels, class_w) - return LittleDict(m.labels, ((1 .- helper_name(m, no_avg, Vector)) .* level_w)) -end - -@inline function _mc_helper_b(m::CM, helper_name, - class_w::AbstractDict{<:Any, <:Real}, - average::MacroAvg, return_type) - return _mean(_mc_helper_b(m, helper_name, class_w, no_avg, Vector)) -end - -@inline function _mc_helper_b(m::CM, helper_name, - class_w::AbstractDict{<:Any, <:Real}, - average::MicroAvg, return_type) - @warn W_PROMOTE_WARN - return _mc_helper_b(m, helper_name, class_w, macro_avg, Vector) -end - -@inline function _mc_helper_b(m::CM, helper_name, average::NoAvg, - return_type::Type{LittleDict}) - return LittleDict(m.labels, 1.0 .- helper_name(m, average, Vector)) -end - -@inline function _mc_helper_b(m::CM, helper_name, average::NoAvg, - return_type::Type{Vector}) - return 1.0 .- helper_name(m, average, Vector) -end - -@inline function _mc_helper_b(m::CM, helper_name, average::MacroAvg, - return_type) - return 1.0 .- helper_name(m, average, Vector) -end - -@inline function _mc_helper_b(m::CM, helper_name, average::MicroAvg, - return_type) - return 1.0 .- helper_name(m, average, Vector) -end - -@inline function _mc_helper(m::CM, a::Arr{<:Real}, b::Arr{<:Real}, - class_w::AbstractDict{<:Any, <:Real}, - average::NoAvg, return_type::Type{LittleDict}) - level_w = _class_w(m.labels, class_w) - return LittleDict(m.labels, _mc_helper(m, a, b, class_w, no_avg, Vector)) -end - -@inline function _mc_helper(m::CM, a::Arr{<:Real}, b::Arr{<:Real}, - class_w::AbstractDict{<:Any, <:Real}, - average::MacroAvg, return_type::Type{U}) where U - return _mean(_mc_helper(m, a, b, class_w, no_avg, Vector)) -end - -@inline function _mc_helper(m::CM, a::Arr{<:Real}, b::Arr{<:Real}, - class_w::AbstractDict{<:Any, <:Real}, - average::MicroAvg, return_type::Type{U}) where U - @warn W_PROMOTE_WARN - return _mc_helper(m, a, b, class_w, macro_avg, return_type) -end - -function _mtpr(m::CM, average::A, return_type::Type{U}) where {A, U} - mtp_val, mfn_val = _mtp_vec(m), _mfn_vec(m) - return _mc_helper(m, mtp_val, mfn_val, average, return_type) -end - -function _mtpr(m::CM, class_w::AbstractDict{<:Any, <:Real}, average::A, - return_type::Type{U}) where {A, U} - mtp_val, mfn_val = _mtp_vec(m), _mfn_vec(m) - return _mc_helper(m, mtp_val, mfn_val, class_w, average, return_type) -end - -function _mtnr(m::CM, average::A, return_type::Type{U}) where {A, U} - mtn_val, mfp_val = _mtn_vec(m), _mfp_vec(m) - return _mc_helper(m, mtn_val, mfp_val, average, return_type) -end - -function _mtnr(m::CM, class_w::AbstractDict{<:Any, <:Real}, average::A, - return_type::Type{U}) where {A, U} - mtn_val, mfp_val = _mtn_vec(m), _mfp_vec(m) - return _mc_helper(m, mtn_val, mfp_val, class_w, average, return_type) -end - -_mfpr(m::CM, average::A, return_type::Type{U}) where {A, U} = - _mc_helper_b(m, _mtnr, average, return_type) - -function _mfpr(m::CM, class_w::AbstractDict{<:Any, <:Real}, average::A, - return_type::Type{U}) where {A, U} - return _mc_helper_b(m, _mtnr, class_w, average, return_type) -end - -_mfnr(m::CM, average::A, return_type::Type{U}) where {A, U} = - _mc_helper_b(m, _mtpr, average, return_type) - -function _mfnr(m::CM, class_w::AbstractDict{<:Any, <:Real}, average::A, - return_type::Type{U}) where {A, U} - return _mc_helper_b(m, _mtpr, class_w, average, return_type) -end - -function _mfdr(m::CM, average::A, return_type::Type{U}) where {A, U} - mfp_val, mtp_val = _mfp_vec(m), _mtp_vec(m) - 
return _mc_helper(m, mfp_val, mtp_val, average, return_type) -end - -function _mfdr(m::CM, class_w::AbstractDict{<:Any, <:Real}, average::A, - return_type::Type{U}) where {A, U} - mfp_val, mtp_val = _mfp_vec(m), _mtp_vec(m) - return _mc_helper(m, mfp_val, mtp_val, class_w, average, return_type) -end - -function _mnpv(m::CM, average::A, return_type::Type{U}) where {A, U} - mtn_val, mfn_val = _mtn_vec(m), _mfn_vec(m) - return _mc_helper(m, mtn_val, mfn_val, average, return_type) -end - -function _mnpv(m::CM, class_w::AbstractDict{<:Any, <:Real}, average::A, - return_type::Type{U}) where {A, U} - mtn_val, mfn_val = _mtn_vec(m), _mfn_vec(m) - return _mc_helper(m, mtn_val, mfn_val, class_w, average, return_type) -end - -## CALLABLES ON MULTICLASS CONFUSION MATRIX - -(p::MulticlassTruePositive)(m::CM) = _mtp(m, p.return_type) -(n::MulticlassTrueNegative)(m::CM) = _mtn(m, n.return_type) -(p::MulticlassFalsePositive)(m::CM) = _mfp(m, p.return_type) -(n::MulticlassFalseNegative)(m::CM) = _mfn(m, n.return_type) - -(r::MTPR)(m::CM) = _mtpr(m, r.average, r.return_type) -(r::MTPR)(m::CM, w::AbstractDict{<:Any, <:Real}) = - _mtpr(m, w, r.average, r.return_type) - -(r::MTNR)(m::CM) = _mtnr(m, r.average, r.return_type) -(r::MTNR)(m::CM, w::AbstractDict{<:Any, <:Real}) = - _mtnr(m, w, r.average, r.return_type) - -(r::MFPR)(m::CM) = _mfpr(m, r.average, r.return_type) -(r::MFPR)(m::CM, w::AbstractDict{<:Any, <:Real}) = - _mfpr(m, w, r.average, r.return_type) - -(r::MFNR)(m::CM) = _mfnr(m, r.average, r.return_type) -(r::MFNR)(m::CM, w::AbstractDict{<:Any, <:Real}) = - _mfnr(m, w, r.average, r.return_type) - -(r::MFDR)(m::CM) = _mfdr(m, r.average, r.return_type) -(r::MFDR)(m::CM, w::AbstractDict{<:Any, <:Real}) = - _mfdr(m, w, r.average, r.return_type) - -(v::MNPV)(m::CM) = _mnpv(m, v.average, v.return_type) -(v::MNPV)(m::CM, w::AbstractDict{<:Any, <:Real}) = - _mnpv(m, w, v.average, v.return_type) - -(p::MulticlassPrecision)(m::CM) = - _mc_helper_b(m, _mfdr, p.average, p.return_type) -(p::MulticlassPrecision)(m::CM, class_w::AbstractDict{<:Any, <:Real}) = - _mc_helper_b(m, _mfdr, class_w, p.average, p.return_type) - -@inline function _fs_helper(m::CM, β::Real, mtp_val::Arr{<:Real}, mfp_val::Arr{<:Real}, mfn_val::Arr{<:Real}, - average::NoAvg, return_type::Type{LittleDict}) - β2 = β^2 - return LittleDict(m.labels, (1 + β2) * mtp_val ./ ((1 + β2) * mtp_val + β2 * mfn_val + mfp_val)) -end - -@inline function _fs_helper(m::CM, β::Real, mtp_val::Arr{<:Real}, mfp_val::Arr{<:Real}, mfn_val::Arr{<:Real}, - average::NoAvg, return_type::Type{Vector}) - β2 = β^2 - return (1 + β2) * mtp_val ./ ((1 + β2) * mtp_val + β2 * mfn_val + mfp_val) -end - -@inline function _fs_helper(m::CM, β::Real, mtp_val::Arr{<:Real}, mfp_val::Arr{<:Real}, mfn_val::Arr{<:Real}, - average::MacroAvg, return_type::Type{U}) where U - return _mean(_fs_helper(m, β, mtp_val, mfp_val, mfn_val, no_avg, Vector)) -end - -function (f::MulticlassFScore)(m::CM) - f.average == micro_avg && return MulticlassRecall(; average=micro_avg, return_type=f.return_type)(m) - mtp_val = _mtp(m, Vector) - mfp_val = _mfp(m, Vector) - mfn_val = _mfn(m, Vector) - return _fs_helper(m, f.β, mtp_val, mfp_val, mfn_val, f.average, f.return_type) -end - -@inline function _fs_helper(m::CM, w::AbstractDict{<:Any, <:Real}, β::Real, - average::NoAvg, return_type::Type{LittleDict}) - level_w = _class_w(m.labels, w) - return LittleDict(m.labels, - MulticlassFScore(β=β, - average=no_avg, - return_type=Vector)(m) .* level_w) -end - -@inline function _fs_helper(m::CM, 
w::AbstractDict{<:Any, <:Real}, β::Real, - average::NoAvg, return_type::Type{Vector}) - level_w = _class_w(m.labels, w) - return MulticlassFScore(β=β, - average=no_avg, - return_type=Vector)(m) .* level_w -end - -@inline function _fs_helper(m::CM, w::AbstractDict{<:Any, <:Real}, β::Real, - average::MacroAvg, return_type::Type{U}) where U - return _mean(_fs_helper(m, w, β, no_avg, Vector)) -end - -@inline function _fs_helper(m::CM, w::AbstractDict{<:Any, <:Real}, β::Real, - average::MicroAvg, return_type::Type{U}) where U - @warn W_PROMOTE_WARN - return _fs_helper(m, w, β, macro_avg, return_type) -end - -function (f::MulticlassFScore)(m::CM, class_w::AbstractDict{<:Any, <:Real}) - return _fs_helper(m, class_w, f.β, f.average, f.return_type) -end - -## Callables on arrays - -for M_ex in (:MulticlassTruePositive, :MulticlassTrueNegative, - :MulticlassFalsePositive, :MulticlassFalseNegative) - @eval call(m::$M_ex, ŷ, y) = m(_confmat(ŷ, y, warn=false)) -end - -for M_ex in (:MTPR, :MTNR, :MFPR, :MFNR, :MFDR, :MulticlassPrecision, :MNPV, - :MulticlassFScore) - @eval call(m::$M_ex, ŷ, y) = m(_confmat(ŷ, y, warn=false)) - @eval call(m::$M_ex, ŷ, y, class_w::AbstractDict{<:Any, <:Real}) = - m(_confmat(ŷ, y, warn=false), class_w) -end diff --git a/src/measures/loss_functions_interface.jl b/src/measures/loss_functions_interface.jl deleted file mode 100644 index 5d7d6125..00000000 --- a/src/measures/loss_functions_interface.jl +++ /dev/null @@ -1,208 +0,0 @@ -# implementation of MLJ measure interface for LossFunctions.jl - -function naked(T::Type) - without_module_name = split(string(T), '.') |> last - without_type_parameters = split(without_module_name, '{') |> first - return Symbol(without_type_parameters) -end - -const WITHOUT_PARAMETERS = - setdiff(LOSS_FUNCTIONS, WITH_PARAMETERS) - -## WRAPPER - -abstract type SupervisedLoss <: Unaggregated end - - -struct MarginLoss{L<:LossFunctions.MarginLoss} <: SupervisedLoss - loss::L -end - -struct DistanceLoss{L<:LossFunctions.DistanceLoss} <: SupervisedLoss - loss::L -end - -# INTERFACE FOR EXTRACTING PARAMETERS - -# LossFunctions.jl does not have a uniform interface for extracting -# parameters, and hence: - -_parameter(loss::LossFunctions.DWDMarginLoss) = loss.q -_parameter(loss::LossFunctions.SmoothedL1HingeLoss) = loss.gamma -_parameter(loss::LossFunctions.HuberLoss) = loss.d -_parameter(loss::LossFunctions.L1EpsilonInsLoss) = loss.ε -_parameter(loss::LossFunctions.L2EpsilonInsLoss) = loss.ε -_parameter(::LossFunctions.LPDistLoss{P}) where P = P -_parameter(::LossFunctions.L1DistLoss) = 1 -_parameter(::LossFunctions.L2DistLoss) = 2 -_parameter(loss::LossFunctions.QuantileLoss) = loss.τ - - -## CONSTRUCTORS AND CALLING BEHAVIOUR - -err_wrap(n) = ArgumentError("Bad @wrap syntax: $n. 
") - -# We define amacro to wrap a concrete `LossFunctions.SupervisedLoss` -# type and define its constructor, and to define property access in -# case of parameters; the macro also defines calling behaviour: -macro wrap_loss(ex) - ex.head == :call || throw(err_wrap(1)) - Loss_ex = ex.args[1] - Loss_str = string(Loss_ex) - if Loss_ex in MARGIN_LOSSES - T = :MarginLoss - else - T = :DistanceLoss - end - - # bind name to wrapped version of LossFunctions loss: - program = quote - const $Loss_ex = $T{<:LossFunctions.$Loss_ex} - name(M::Type{<:$Loss_ex}) = $Loss_str - end - - # defined instances - alias = snakecase(string(Loss_ex)) - push!(program.args, quote - instances(::Type{<:$Loss_ex}) = [$alias, ] - end) - - # define kw constructor and expose any parameter as a property: - if length(ex.args) == 1 - push!(program.args, quote - $Loss_ex() = $T(LossFunctions.$Loss_ex()) - Base.propertynames(::$Loss_ex) = () - end) - elseif length(ex.args) > 1 - sub_ex = ex.args[2] - sub_ex.head == :parameters || throw(err_wrap(2)) - length(sub_ex.args) == 1 || throw(err_wrap("Only 1 kwarg supported")) - sub_ex.args[1].head == :kw || throw(err_wrap(3)) - var_ex = sub_ex.args[1].args[1] - var_str = string(var_ex) - val_ex = sub_ex.args[1].args[2] - push!(program.args, quote - $Loss_ex(; $var_ex=$val_ex) = - $T(LossFunctions.$Loss_ex($var_ex)) - $Loss_ex(p) = $Loss_ex($var_ex=p) - Base.propertynames(::$Loss_ex) = (Symbol($var_str), ) - function Base.getproperty(wrapper::$Loss_ex, name::Symbol) - if name === Symbol($var_str) - return _parameter(getfield(wrapper, :loss)) # see below - end - error("type $($Loss_ex) has no property $name") - end - end) - else - throw(err_wrap(4)) - end - - esc(program) -end - -for Loss in WITHOUT_PARAMETERS - eval(:(@wrap_loss $Loss())) -end - -@wrap_loss DWDMarginLoss(; q=1.0) -@wrap_loss SmoothedL1HingeLoss(; gamma=1.0) -@wrap_loss HuberLoss(; d=1.0) -@wrap_loss L1EpsilonInsLoss(; ε=1.0) -@wrap_loss L2EpsilonInsLoss(; ε=1.0) -@wrap_loss LPDistLoss(; P=2) -@wrap_loss QuantileLoss(; τ=0.7) - - -## GENERIC TRAITS - -const LossFunctions = LossFunctions -is_measure_type(::Type{<:SupervisedLoss}) = true -orientation(::Type{<:SupervisedLoss}) = :loss -reports_each_observation(::Type{<:SupervisedLoss}) = true -is_feature_dependent(::Type{<:SupervisedLoss}) = false -supports_weights(::Type{<:SupervisedLoss}) = true -docstring(M::Type{<:SupervisedLoss}) = name(M) - - -## CALLING - DISTANCE BASED LOSS FUNCTIONS - -MMI.prediction_type(::Type{<:DistanceLoss}) = :deterministic -MMI.target_scitype(::Type{<:DistanceLoss}) = Union{Vec{Continuous},Vec{Count}} - -call(measure::DistanceLoss, yhat, y) = - (getfield(measure, :loss)).(yhat, y) - -function call(measure::DistanceLoss, yhat, y, w::AbstractArray) - return w .* call(measure, yhat, y) -end - - -## CALLING - MARGIN BASED LOSS FUNCTIONS - -MMI.prediction_type(::Type{<:MarginLoss}) = :probabilistic -MMI.target_scitype(::Type{<:MarginLoss}) = AbstractArray{<:Finite{2}} - -# rescale [0, 1] -> [-1, 1]: -_scale(p) = 2p - 1 - -function call(measure::MarginLoss, yhat, y) - probs_of_observed = broadcast(pdf, yhat, y) - loss = getfield(measure, :loss) - return loss.(_scale.(probs_of_observed), 1) -end - -call(measure::MarginLoss, yhat, y, w::AbstractArray) = - w .* call(measure, yhat, y) - - -## ADJUSTMENTS - -human_name(::Type{<:L1EpsilonInsLoss}) = "l1 ϵ-insensitive loss" -human_name(::Type{<:L2EpsilonInsLoss}) = "l2 ϵ-insensitive loss" -human_name(::Type{<:DWDMarginLoss}) = "distance weighted discrimination loss" - -_signature(::Any) = "" 
-_signature(::Type{<:HuberLoss}) = "`HuberLoss(; d=1.0)`" -_signature(::Type{<:DWDMarginLoss}) = "`DWDMarginLoss(; q=1.0)`" -_signature(::Type{<:SmoothedL1HingeLoss}) = "`SmoothedL1HingeLoss(; gamma=1.0)`" -_signature(::Type{<:L1EpsilonInsLoss}) = "`L1EpsilonInsLoss(; ε=1.0)`" -_signature(::Type{<:L2EpsilonInsLoss}) = "`L2EpsilonInsLoss(; ε=1.0)`" -_signature(::Type{<:LPDistLoss}) = "`LPDistLoss(; P=2)`" -_signature(::Type{<:QuantileLoss}) = "`QuantileLoss(; τ=0.7)`" - - -## ALIASES AND DOCSTRINGS - -const DOC_LOSS_FUNCTIONS = -""" -For more detail, see the original LossFunctions.jl documentation *but -note differences in the signature.* - -Losses from LossFunctions.jl do not support `missing` values. To use -with `missing` values, replace `(ŷ, y)` with `skipinvalid(ŷ, y)...`. -""" - -for Loss_ex in DISTANCE_LOSSES - eval(quote - sig = _signature($Loss_ex) - isempty(sig) || (sig = "Constructor signature: "*sig) - @create_aliases $Loss_ex - @create_docs($Loss_ex, - typename = name($Loss_ex), - body=DOC_LOSS_FUNCTIONS, - footer=sig) - end) -end - -for Loss_ex in MARGIN_LOSSES - eval(quote - sig = _signature($Loss_ex) - isempty(sig) || (sig = "Constructor signature: "*sig) - @create_aliases $Loss_ex - @create_docs($Loss_ex, - typename = name($Loss_ex), - body=DOC_LOSS_FUNCTIONS, - scitype=DOC_FINITE_BINARY, - footer=sig) - end) -end diff --git a/src/measures/measure_search.jl b/src/measures/measure_search.jl deleted file mode 100644 index bd813009..00000000 --- a/src/measures/measure_search.jl +++ /dev/null @@ -1,65 +0,0 @@ -const LOCAL_MEASURE_TYPES = filter(x->x != SupervisedLoss, - vcat(subtypes(MLJBase.Unaggregated), - subtypes(MLJBase.Aggregated))) - -const LOSS_FUNCTIONS_MEASURE_TYPES = - [eval(:($Loss)) for Loss in LOSS_FUNCTIONS] - -const MEASURE_TYPES = vcat(LOCAL_MEASURE_TYPES, LOSS_FUNCTIONS_MEASURE_TYPES) - -const MeasureProxy = NamedTuple{Tuple(MEASURE_TRAITS)} - -function Base.show(stream::IO, p::MeasureProxy) - instances = "["*join(p.instances, ", ")*"]" - print(stream, "(name = $(p.name), instances = $instances, ...)") -end - -function Base.show(stream::IO, ::MIME"text/plain", p::MeasureProxy) - printstyled(IOContext(stream, :color=> MLJBase.SHOW_COLOR[]), - p.docstring, bold=false, color=:magenta) - println(stream) - MLJBase.fancy_nt(stream, p) -end - -""" - measures() - -List all measures as named-tuples keyed on measure traits. - - measures(filters...) - -List all measures `m` for which `filter(m)` is true, for each `filter` -in `filters` (each a Boolean-valued function on measure metadata). - - measures(needle::Union{AbstractString,Regex}) - -List all measures with `needle` in a measure's `name`, `instances`, or -`docstring`. - - -### Example - -Find all classification measures supporting sample weights: - - measures(m -> m.target_scitype <: AbstractVector{<:Finite} && - m.supports_weights) - -Find all measures in the "rms" family: - - measures("rms") - -""" -function measures(conditions...)
- all_measures = map(info, MEASURE_TYPES) - return filter(all_measures) do measure - all(c(measure) for c in conditions) - end -end - -function measures(needle::Union{AbstractString,Regex}) - f = m -> occursin(needle, m.name) || - occursin(needle, m.docstring) || - occursin(needle, join(m.instances, " ")) - return MLJBase.measures(f) -end - -measures() = measures(x->true) diff --git a/src/measures/measures.jl b/src/measures/measures.jl deleted file mode 100644 index 3c23a4f9..00000000 --- a/src/measures/measures.jl +++ /dev/null @@ -1,302 +0,0 @@ -const PROPER_SCORING_RULES = "[Gneiting and Raftery (2007), \"Strictly "* -"Proper Scoring Rules, Prediction, and Estimation\""* -"](https://doi.org/10.1198/016214506000001437)" -const DOC_FINITE = - "`AbstractArray{<:Union{Finite,Missing}}` (multiclass classification)" -const DOC_FINITE_BINARY = - "`AbstractArray{<:Union{Finite{2},Missing}}` (binary classification)" -const DOC_ORDERED_FACTOR = - "`AbstractArray{<:Union{OrderedFactor,Missing}}` (classification of ordered target)" -const DOC_ORDERED_FACTOR_BINARY = - "`AbstractArray{<:Union{OrderedFactor{2},Missing}}` "* - "(binary classification where choice of \"true\" affects the measure)" -const DOC_CONTINUOUS = "`AbstractArray{<:Union{Continuous,Missing}}` (regression)" -const DOC_COUNT = "`AbstractArray{<:Union{Count,Missing}}`" -const DOC_MULTI = "`AbstractArray{<:Union{Missing,T}}` where `T` is `Continuous` "* - "or `Count` (for respectively continuous or discrete Distributions.jl objects in "* - "`ŷ`) or `OrderedFactor` or `Multiclass` "* - "(for `UnivariateFinite` distributions in `ŷ`)" - -const DOC_INFINITE = "`AbstractArray{<:Union{Infinite,Missing}}`" -const INVARIANT_LABEL = - "This metric is invariant to class reordering." -const VARIANT_LABEL = - "This metric is *not* invariant to class re-ordering." - -is_measure_type(::Any) = false - -# Each of the following traits, with fallbacks defined in -# StatisticalTraits.jl, makes sense for some or all measures: - -const MEASURE_TRAITS = [ - :name, - :instances, - :human_name, - :target_scitype, - :supports_weights, - :supports_class_weights, - :prediction_type, - :orientation, - :reports_each_observation, - :aggregation, - :is_feature_dependent, - :docstring, - :distribution_type -] - -# # FOR BUILT-IN MEASURES (subtyping Measure) - -abstract type Measure <: MLJType end -abstract type Aggregated <: Measure end -abstract type Unaggregated <: Measure end - -StatisticalTraits.reports_each_observation(::Type{<:Aggregated}) = false -StatisticalTraits.reports_each_observation(::Type{<:Unaggregated}) = true - - -# # FALLBACK CHECKS -extra_check(::Measure, args...) 
= nothing -function _check(measure::Measure, yhat, y) - check_dimensions(yhat, y) - extra_check(measure, yhat, y) -end -function _check(measure::Measure, yhat, y, w) - check_dimensions(yhat, y) - extra_check(measure, yhat, y, w) -end -function _check(measure::Measure, yhat, y, w::Arr) - check_dimensions(yhat, y) - check_dimensions(y, w) - extra_check(measure, yhat, y, w) -end -function _check(measure::Measure, yhat::Arr{<:UnivariateFinite}, y) - check_dimensions(yhat, y) - check_pools(yhat, y) - extra_check(measure, yhat, y) -end - -function _check( - measure::Measure, - yhat::Arr{<:UnivariateFinite}, - y, - w::Arr -) - check_dimensions(yhat, y) - check_pools(yhat, y) - extra_check(measure, yhat, y, w) -end - -function _check( - measure::Measure, - yhat::Arr{<:UnivariateFinite}, - y, - w::AbstractDict -) - check_dimensions(yhat, y) - check_pools(yhat, y) - check_pools(yhat, w) - extra_check(measure, yhat, y, w) -end - -# # METHODS TO EVALUATE MEASURES - -# See measures/README.md for details - -# `robust_single` can accept `missing` observations/predictions but is never overloaded; -# `single` is overloaded but does not need to handle missings. This factoring allows us -# to avoid method ambiguities which are cumbersome to avoid with only one function. - -robust_single(args...) = single(args...) -robust_single(m, ::Missing, ::Missing) = missing -robust_single(m, ::Missing, η) = missing -robust_single(m, η̂, ::Missing) = missing - -const Label = Union{CategoricalValue, Number, AbstractString, Symbol, AbstractChar} - -# closure for broadcasting: -robust_single(measure::Measure) = (ηhat, η) -> robust_single(measure, ηhat, η) - -call(measure::Unaggregated, yhat, y) = broadcast(robust_single(measure), yhat, y) -function call(measure::Unaggregated, yhat, y, w::AbstractArray) - unweighted = broadcast(robust_single(measure), yhat, y) - return w .* unweighted -end -function call(measure::Unaggregated, yhat, y, weight_given_class::AbstractDict) - unweighted = broadcast(robust_single(measure), yhat, y) - w = @inbounds broadcast(η -> weight_given_class[η], y) - return w .* unweighted -end - -# ## Top level -function (measure::Measure)(args...) - _check(measure, args...) - call(measure, args...) -end - -# # TRAITS - -# user-bespoke measures will subtype `Measure` directly and the -# following will therefore not apply: -StatisticalTraits.supports_weights(::Type{<:Union{Aggregated, Unaggregated}}) = true - -is_measure_type(::Type{<:Measure}) = true -is_measure(m) = is_measure_type(typeof(m)) - -# docstring fall-back: -_decorate(s::AbstractString) = "`$s`" -_decorate(v::Vector{<:AbstractString}) = join(_decorate.(v), ", ") -function MMI.docstring(M::Type{<:Measure}) - list = _decorate(instances(M)) - ret = "`$(name(M))` - $(human_name(M)) type" - isempty(list) || (ret *= " with instances $list") - ret *= ". 
" - return ret -end - -# display: -show_as_constructed(::Type{<:Measure}) = true - -# info -function StatisticalTraits.info(M::Type{<:Measure}) - values = Tuple(@eval($trait($M)) for trait in MEASURE_TRAITS) - return NamedTuple{Tuple(MEASURE_TRAITS)}(values) -end - -StatisticalTraits.info(m::Measure) = StatisticalTraits.info(typeof(m)) - - -# # AGGREGATION - -(::Sum)(v) = sum(skipinvalid(v)) -(::Sum)(v::LittleDict) = sum(values(v)) - -(::Mean)(v) = mean(skipinvalid(v)) -(::Mean)(v::LittleDict) = mean(values(v)) - -(::RootMeanSquare)(v) = sqrt(mean(skipinvalid(v).^2)) - -aggregate(v, measure) = aggregation(measure)(v) - -# aggregation is no-op on scalars: -const MeasureValue = Union{Real,Tuple{<:Real,<:Real}} # number or interval -aggregate(x::MeasureValue, measure) = x - - -# # UNIVERSAL CALLING SYNTAX - -# yhat - predictions (point or probabilisitic) -# X - features -# y - target observations -# w - per-observation weights - -function value(measure, yhat, X, y, w) - vfdep = Val(is_feature_dependent(measure)) - vsweights = Val(supports_weights(measure) || - supports_class_weights(measure)) - return value(measure, yhat, X, y, w, vfdep, vsweights) -end - -# # UNIVERSAL CALLING INTERFACE - -# is feature independent, weights not supported: -value(m, yhat, X, y, w, ::Val{false}, ::Val{false}) = m(yhat, y) - -# is feature dependent:, weights not supported: -value(m, yhat, X, y, w, ::Val{true}, ::Val{false}) = m(yhat, X, y) - -# is feature independent, weights supported: -value(m, yhat, X, y, w, ::Val{false}, ::Val{true}) = m(yhat, y, w) -value(m, yhat, X, y, ::Nothing, ::Val{false}, ::Val{true}) = m(yhat, y) - -# is feature dependent, weights supported: -value(m, yhat, X, y, w, ::Val{true}, ::Val{true}) = m(yhat, X, y, w) -value(m, yhat, X, y, ::Nothing, ::Val{true}, ::Val{true}) = m(yhat, X, y) - -# # helpers - -_scale(x, w::Arr, i) = x*w[i] -_scale(x, ::Nothing, i::Any) = x - -function check_pools(ŷ, y) - levels(y) == levels(ŷ[1]) || - error("Conflicting categorical pools found "* - "in observations and predictions. ") - return nothing -end - -function check_pools(ŷ, w::AbstractDict) - Set(levels(ŷ[1])) == Set(keys(w)) || - error("Conflicting categorical pools found "* - "in class weights and predictions. 
") - return nothing -end - -# # INCLUDE SPECIFIC MEASURES AND TOOLS - -include("meta_utilities.jl") -include("roc.jl") -include("confusion_matrix.jl") -include("continuous.jl") -include("finite.jl") -include("probabilistic.jl") -include("loss_functions_interface.jl") - - -# # DEFAULT MEASURES - -default_measure(T, S) = _default_measure(T, nonmissingtype(S)) - -_default_measure(T, S) = nothing - -# Deterministic + Continuous / Count ==> RMS -function _default_measure( - ::Type{<:Deterministic}, - ::Type{<:Union{Vec{<:Continuous}, Vec{<:Count}}}, -) - return rms -end - -# Deterministic + Finite ==> Misclassification rate -function _default_measure( - ::Type{<:Deterministic}, - ::Type{<:Vec{<:Finite}}, -) - return misclassification_rate -end - -# Probabilistic + Finite / Count ==> log loss -function _default_measure( - ::Type{<:Probabilistic}, - ::Type{<:Union{Vec{<:Finite},Vec{<:Count}}}, -) - return log_loss -end - -# Probabilistic + Continuous ==> Log loss -function _default_measure( - ::Type{<:Probabilistic}, - ::Type{<:Vec{<:Continuous}}, -) - return log_loss -end - -function _default_measure( - ::Type{<:MMI.ProbabilisticDetector}, - ::Type{<:Vec{<:OrderedFactor{2}}}, -) - return area_under_curve -end - -function _default_measure( - ::Type{<:MMI.DeterministicDetector}, - ::Type{<:Vec{<:OrderedFactor{2}}}, -) - return balanced_accuracy -end - -# Fallbacks -default_measure(M::Type{<:Supervised}) = default_measure(M, target_scitype(M)) -default_measure(::M) where M <: Supervised = default_measure(M) - -default_measure(M::Type{<:Annotator}) = _default_measure(M, target_scitype(M)) -default_measure(::M) where M <: Annotator = default_measure(M) diff --git a/src/measures/meta_utilities.jl b/src/measures/meta_utilities.jl deleted file mode 100644 index 3b0de197..00000000 --- a/src/measures/meta_utilities.jl +++ /dev/null @@ -1,233 +0,0 @@ -const DOC_OBSERVATIONS = - "on predictions `ŷ`, "* - "given ground truth observations `y`. " -const DOC_WEIGHTS = - "Optionally specify per-sample weights, `w`. " -const DOC_CLASS_WEIGHTS = - "An optional `AbstractDict`, denoted `class_w` above, "* - "keyed on `levels(y)`, specifies class weights. " - -macro create_aliases(M_ex) - esc(quote - M = $M_ex - for alias in Symbol.(instances(M)) - # isdefined(parentmodule(M), alias) || eval(:(const $alias = $M())) - eval(:(const $alias = $M())) - end - end) -end - -function detailed_doc_string(M; typename="", body="", footer="", scitype="") - - _instances = _decorate(instances(M)) - human_name = MLJBase.human_name(M) - if isempty(scitype) - scitype = "`$(target_scitype(M))`" - end - - if isempty(typename) - ret = " $M\n\n" - else - ret = " MLJBase.$typename\n\n" - end - - ret *= "A measure type for $(human_name)" - isempty(_instances) || - (ret *= ", which includes the instance(s): "* - "$_instances") - ret *= ".\n\n" - ret *= " $(name(M))()(ŷ, y)\n" - supports_weights(M) && - (ret *= " $(name(M))()(ŷ, y, w)\n") - supports_class_weights(M) && - (ret *= " $(name(M))()(ŷ, y, class_w)\n") - ret *= "\n" - if isempty(fieldnames(M)) - ret *= "Evaluate the $(human_name) " - else - ret *= "Evaluate the default instance of $(name(M)) " - end - ret *= "$DOC_OBSERVATIONS" - supports_weights(M) && - (ret *= DOC_WEIGHTS) - supports_class_weights(M) && - (ret *= DOC_CLASS_WEIGHTS) - ret *= "\n\n" - isempty(body) || (ret *= "$body\n\n") - ret *= "Requires `scitype(y)` to be a subtype of $scitype; " - ret *= "`ŷ` must be an array of `$(prediction_type(M))` predictions. 
" - isempty(footer) ||(ret *= "\n\n$footer") - ret *= "\n\n" - ret *= "For more information, run `info($(name(M)))`. " - return ret -end - - -_err_create_docs() = error( - "@create_docs syntax error. Usage: \n"* - "@create_docs(MeasureType, typename=..., body=..., scitype=..., footer=...") -macro create_docs(M_ex, exs...) - M_ex isa Symbol || _err_create_docs() - t = "" - b = "" - s = "" - f = "" - for ex in exs - ex.head == :(=) || _err_create_docs() - ex.args[1] == :typename && (t = ex.args[2]) - ex.args[1] == :body && (b = ex.args[2]) - ex.args[1] == :scitype && (s = ex.args[2]) - ex.args[1] == :footer && (f = ex.args[2]) - end - esc(quote - "$(detailed_doc_string($M_ex, typename=$t, body=$b, scitype=$s, footer=$f))" - function $M_ex end - end) -end - -# TODO: I wonder why this is not a macro? - -""" - metadata_measure(T; kw...) - -Helper function to write the metadata (trait definitions) for a single -measure. - -### Compulsory keyword arguments - -- `target_scitype`: The allowed scientific type of `y` in `measure(ŷ, - y, ...)`. This is typically some abstract array. E.g, in single - target variable regression this is typically - `AbstractArray{<:Union{Missing,Continuous}}`. For a binary - classification metric insensitive to class order, this would - typically be `Union{AbstractArray{<:Union{Missing,Multiclass{2}}}, - AbstractArray{<:Union{Missing,OrderedFactor{2}}}}`, which has the - alias `FiniteArrMissing`. - -- `orientation`: Orientation of the measure. Use `:loss` when lower is - better and `:score` when higher is better. For example, set - `:loss` for root mean square and `:score` for area under the ROC - curve. - -- `prediction_type`: Refers to `ŷ` in `measure(ŷ, y, ...)` and should - be one of: `:deterministic` (`ŷ` has same type as `y`), - `:probabilistic` or `:interval`. - - -#### Optional keyword arguments - -The following have meaningful defaults but may still require -overloading: - -- `instances`: A vector of strings naming the built-in instances of - the measurement type provided by the implementation, which are - usually just common aliases for the default instance. E.g., for - `RSquared` has the `instances = ["rsq", "rsquared"]` which are both - defined as `RSquared()` in the implementation. `MulticlassFScore` - has the `instances = ["macro_f1score", "micro_f1score", - "multiclass_f1score"]`, where `micro_f1score = - MulticlassFScore(average=micro_avg)`, etc. Default is `String[]`. - -- `aggregation`: Aggregation method for measurements, typically - `Mean()` (for, e.g., mean absolute error) or `Sum()` (for number - of true positives). Default is `Mean()`. Must subtype - `StatisticalTraits.AggregationMode`. It is used to: - - - aggregate measurements in resampling (e.g., cross-validation) - - - aggregating per-observation measurements returned by `single` in - the fallback definition of `call` for `Unaggregated` measures - (such as area under the ROC curve). - -- `supports_weights`: Whether the measure can be called with - per-observation weights `w`, as in `l2(ŷ, y, w)`. Default is `true`. - -- `supports_class_weights`: Whether the measure can be called with a - class weight dictionary `w`, as in `micro_f1score(ŷ, y, w)`. Default - is `true`. Default is `false`. - -- `human_name`: Ordinary name of measure. Used in the full - auto-generated docstring, which begins "A measure type for - \$human_name ...". Eg, the `human_name` for `TruePositive` is `number - of true positives. 
Default is snake-case version of type name, with - underscores replaced by spaces; so `MeanAbsoluteError` becomes "mean - absolute error". - -- `docstring`: An abbreviated docstring, displayed by - `info(measure)`. Fallback uses `human_name` and lists the - `instances`. - -""" -function metadata_measure(T; name::String="", - human_name="", - instances::Vector{String}=String[], - target_scitype=Unknown, - prediction_type::Symbol=:unknown, - orientation::Symbol=:unknown, - aggregation=Mean(), - is_feature_dependent::Bool=false, - supports_weights::Bool=true, - supports_class_weights::Bool=false, - docstring::String="", - distribution_type=Unknown) - pred_str = "$prediction_type" - orientation_str = "$orientation" -# dist = ifelse(ismissing(distribution_type), missing, "$distribution_type") - ex = quote - - # traits common with models: - if !isempty($name) - StatisticalTraits.name(::Type{<:$T}) = $name - end - if !isempty($docstring) - StatisticalTraits.docstring(::Type{<:$T}) = $docstring - end - StatisticalTraits.target_scitype(::Type{<:$T}) = $target_scitype - StatisticalTraits.prediction_type(::Type{<:$T}) = Symbol($pred_str) - StatisticalTraits.supports_weights(::Type{<:$T}) = $supports_weights - - # traits specific to measures: - if !isempty($instances) - StatisticalTraits.instances(::Type{<:$T}) = $instances - end - if !isempty($human_name) - StatisticalTraits.human_name(::Type{<:$T}) = $human_name - end - StatisticalTraits.orientation(::Type{<:$T}) = Symbol($orientation_str) - StatisticalTraits.aggregation(::Type{<:$T}) = $aggregation - StatisticalTraits.is_feature_dependent(::Type{<:$T}) = - $is_feature_dependent - StatisticalTraits.supports_class_weights(::Type{<:$T}) = - $supports_class_weights - StatisticalTraits.distribution_type(::Type{<:$T}) = $distribution_type - - end - parentmodule(T).eval(ex) -end - -""" - - measures_for_export() - -Return a list of the symbolic representation of all: - -- measure types (subtypes of `Aggregated` and `Unaggregated`) - -- type aliases (as defined by the constant - `MLJBase.MEASURE_TYPE_ALIASES`) - -- all built-in measure instances (as declared by `instances` trait) - -""" -function measures_for_export() - ret = MLJBase.MEASURE_TYPE_ALIASES - for m in measures() - name = m.name |> Symbol - push!(ret, name) - for instance in m.instances - alias = Symbol(instance) - push!(ret, alias) - end - end - return ret -end diff --git a/src/measures/probabilistic.jl b/src/measures/probabilistic.jl deleted file mode 100644 index 11c3bcdf..00000000 --- a/src/measures/probabilistic.jl +++ /dev/null @@ -1,423 +0,0 @@ -const DOC_DISTRIBUTIONS = -""" -In the case that the predictions `ŷ` are continuous probability -distributions, such as `Distributions.Normal`, replace the above sum -with an integral, and interpret `p` as the probability density -function. In case of discrete distributions over the integers, such as -`Distributions.Poisson`, sum over all integers instead of `C`. 
-""" -const WITH_L2NORM_CONTINUOUS = - [@eval(Distributions.$d) for d in [ - :Chisq, - :Gamma, - :Beta, - :Chi, - :Cauchy, - :Normal, - :Uniform, - :Logistic, - :Exponential]] - -const WITH_L2NORM_COUNT = - [@eval(Distributions.$d) for d in [ - :Poisson, - :DiscreteUniform, - :DiscreteNonParametric]] - -const WITH_L2NORM = vcat([UnivariateFinite, ], - WITH_L2NORM_CONTINUOUS, - WITH_L2NORM_COUNT) - -const UD = Distributions.UnivariateDistribution - -# ======================================================== -# AGGREGATED MEASURES - -# --------------------------------------------------------- -# AreaUnderCurve - -# Implementation based on the Mann-Whitney U statistic. -# see https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve -# and https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test#Area_under_curve_(AUC)_statistic_for_ROC_curves - - -struct AreaUnderCurve <: Aggregated end - -metadata_measure(AreaUnderCurve; - human_name = "area under the ROC", - instances = ["area_under_curve", "auc"], - target_scitype = FiniteArrMissing{2}, - prediction_type = :probabilistic, - orientation = :score, - supports_weights = false, - distribution_type = UnivariateFinite) - -const AUC = AreaUnderCurve -@create_aliases AreaUnderCurve - -@create_docs(AreaUnderCurve, -body= -""" -Returns the area under the ROC ([receiver operator -characteristic](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)) - -If `missing` or `NaN` values are present, use `auc(skipinvalid(yhat, y)...)`. - -$INVARIANT_LABEL -""", -scitpye = DOC_FINITE_BINARY) - -# core algorithm: -function _auc(ŷ, y) - lab_pos = classes(ŷ)[2] # 'positive' label - scores = pdf.(ŷ, lab_pos) # associated scores - ranks = StatsBase.tiedrank(scores) - n = length(y) - n_neg = 0 # to keep of the number of negative preds - T = eltype(ranks) - R_pos = zero(T) # sum of positive ranks - @inbounds for (i,j) in zip(eachindex(y), eachindex(ranks)) - if y[i] == lab_pos - R_pos += ranks[j] - else - n_neg += 1 - end - end - n_pos = n - n_neg # number of positive predictions - U = R_pos - T(0.5)*n_pos*(n_pos + 1) # Mann-Whitney U statistic - return U / (n_neg * n_pos) -end - -# Missing values not supported, but allow `Missing` in eltype, because -# `skipinvalid(yhat, y)` does not tighten the type. See doc string above. - -call(::AUC, ŷ, y) = _auc(ŷ, y) - -# ======================================================== -# UNAGGREGATED MEASURES - -# --------------------------------------------------------------------- -# LogScore - -struct LogScore{R <: Real} <: Unaggregated - tol::R -end -LogScore(;eps=eps(), tol=eps) = LogScore(tol) - -metadata_measure(LogScore; - instances = ["log_score", ], - target_scitype = Union{ - Arr{<:Union{Missing,Multiclass}}, - Arr{<:Union{Missing,OrderedFactor}}, - Arr{<:Union{Missing,Continuous}}, - Arr{<:Union{Missing,Count}}}, - prediction_type = :probabilistic, - orientation = :score, - distribution_type = Union{WITH_L2NORM...}) - -@create_aliases LogScore - -@create_docs(LogScore, -body= -""" -Since the score is undefined in the case that the true observation is -predicted to occur with probability zero, probablities are clamped -between `tol` and `1-tol`, where `tol` is a constructor key-word -argument. 
- -If `p` is the predicted probability mass or density function -corresponding to a *single* ground truth observation `η`, then the -score for that example is - - log(clamp(p(η), tol, 1 - tol)) - -For example, for a binary target with "yes"/"no" labels, and -predicted probability of "yes" equal to 0.8, an observation of "no" -scores `log(0.2)`. - -The predictions `ŷ` should be an array of `UnivariateFinite` -distributions in the case of `Finite` target `y`, and otherwise a -supported `Distributions.UnivariateDistribution` such as `Normal` or -`Poisson`. - -See also [`LogLoss`](@ref), which differs only in sign. -""", -scitype=DOC_MULTI) - -# for single finite observation: -single(c::LogScore, d::UnivariateFinite, η) = - log(clamp(pdf(d, η), c.tol, 1 - c.tol)) - -# for a single infinite observation: -single(c::LogScore, d::Distributions.UnivariateDistribution, η) = - log(clamp(pdf(d, η), c.tol, 1 - c.tol)) - -# to resolve method ambiguities: -single(::LogScore, ::UnivariateFinite, ::Missing) = missing -single(::LogScore, ::Distributions.UnivariateDistribution, ::Missing) = missing -single(::LogScore, ::Missing, ::Missing) = missing - -# performant broadcasting in case of UnivariateFiniteArray: -call(c::LogScore, ŷ::UnivariateFiniteArray, y) = - log.(clamp.(broadcast(pdf, ŷ, y), c.tol, 1 - c.tol)) -call(c::LogScore, ŷ::UnivariateFiniteArray, y, w::AbstractArray) = call(c, ŷ, y) .* w - -# --------------------------------------------------------------------- -# LogLoss - -struct LogLoss{R <: Real} <: Unaggregated - tol::R -end -LogLoss(;eps=eps(), tol=eps) = LogLoss(tol) - -metadata_measure(LogLoss; - instances = ["log_loss", "cross_entropy"], - target_scitype = Union{ - Arr{<:Union{Missing,Multiclass}}, - Arr{<:Union{Missing,OrderedFactor}}, - Arr{<:Union{Missing,Continuous}}, - Arr{<:Union{Missing,Count}}}, - prediction_type = :probabilistic, - orientation = :loss, - distribution_type = Union{WITH_L2NORM...}) - -const CrossEntropy = LogLoss -@create_aliases LogLoss - -@create_docs(LogLoss, -body= -""" -For details, see [`LogScore`](@ref), which differs only by a sign. -""", -scitype=DOC_MULTI) - -# for single observation: -single(c::LogLoss, d, η) = -single(LogScore(tol=c.tol), d, η) - -# to get performant broadcasting in case of UnivariateFiniteArray: -call(c::LogLoss, ŷ::UnivariateFiniteArray, y) = - -call(LogScore(tol=c.tol), ŷ, y) -call(c::LogLoss, ŷ::UnivariateFiniteArray, y, w::AbstractArray) = - -call(LogScore(tol=c.tol), ŷ, y, w) - - -# ----------------------------------------------------- -# BrierScore - -struct BrierScore <: Unaggregated end - -metadata_measure(BrierScore; - human_name = "Brier score (a.k.a. quadratic score)", - instances = ["brier_score",], - target_scitype = Union{ - Arr{<:Union{Missing,Multiclass}}, - Arr{<:Union{Missing,OrderedFactor}}, - Arr{<:Union{Missing,Continuous}}, - Arr{<:Union{Missing,Count}}}, - prediction_type = :probabilistic, - orientation = :score, - distribution_type = Union{WITH_L2NORM...}) - -@create_aliases BrierScore - -@create_docs(BrierScore, -body= -""" -Convention as in $PROPER_SCORING_RULES - -*Finite case.* If `p` is the predicted probability mass function for a -*single* observation `η`, and `C` all possible classes, then the -corresponding score for that observation is given by - -``2p(η) - \\left(\\sum_{c ∈ C} p(c)^2\\right) - 1`` - -*Warning.* `BrierScore()` is a "score" in the sense that bigger is -better (with `0` optimal, and all other values negative). 
In Brier's -original 1950 paper, and many other places, it has the opposite sign, -despite the name. Moreover, the present implementation does not treat -the binary case as special, so that the score may differ in the binary -case by a factor of two from usage elsewhere. - -*Infinite case.* Replacing the sum above with an integral does *not* -lead to the formula adopted here in the case of `Continuous` or -`Count` target `y`. Rather the convention in the paper cited above is -adopted, which means returning a score of - -``2p(η) - ∫ p(t)^2 dt`` - -in the `Continuous` case (`p` the probability density function) or - -``2p(η) - ∑_t p(t)^2`` - -in the `Count` case (`p` the probability mass function). -""", -scitype=DOC_MULTI) - -# calling on single finite observation: -function single(::BrierScore, - d::UnivariateFinite, - η) - levels = classes(d) - pvec = broadcast(pdf, d, levels) - offset = 1 + sum(pvec.^2) - return 2 * pdf(d, η) - offset -end - -# calling on a single infinite observation: -single(::BrierScore, d::Distributions.UnivariateDistribution, η) = - 2*pdf(d, η) - Distributions.pdfsquaredL2norm(d) - -# To get performant broadcasted version in case of UnivariateFiniteArray: -function call( - ::BrierScore, - ŷ::UnivariateFiniteArray, - y - ) - - probs = pdf(ŷ, classes(first(ŷ))) - offset = 1 .+ vec(sum(probs.^2, dims=2)) - - 2 .* broadcast(pdf, ŷ, y) .- offset -end -call(m::BrierScore, ŷ::UnivariateFiniteArray, y, w::AbstractArray) = call(m, ŷ, y) .* w - - -# ----------------------------------------------------- -# BrierLoss - -struct BrierLoss <: Unaggregated end - -metadata_measure(BrierLoss; - human_name = "Brier loss (a.k.a. quadratic loss)", - instances = ["brier_loss",], - target_scitype = Union{ - Arr{<:Union{Missing,Multiclass}}, - Arr{<:Union{Missing,OrderedFactor}}, - Arr{<:Union{Missing,Continuous}}, - Arr{<:Union{Missing,Count}}}, - prediction_type = :probabilistic, - orientation = :loss, - distribution_type = Union{WITH_L2NORM...}) - -@create_aliases BrierLoss - -@create_docs(BrierLoss, -body= -""" -For details, see [`BrierScore`](@ref), which differs only by a sign. -""", -scitype=DOC_MULTI) - -# calling on single observation: -single(::BrierLoss, d, η) = - single(BrierScore(), d, η) - -# to get performant broadcasting in case of UnivariateFiniteArray: -call(m::BrierLoss, ŷ::UnivariateFiniteArray, y) = - -call(BrierScore(), ŷ, y) -call(m::BrierLoss, ŷ::UnivariateFiniteArray, y, w::AbstractArray) = - -call(BrierScore(), ŷ, y, w) - - -# ----------------------------------------------------- -# SphericalScore - -struct SphericalScore{T<:Real} <: Unaggregated - alpha::T -end -SphericalScore(; alpha=2) = SphericalScore(alpha) - -metadata_measure(SphericalScore; - human_name = "Spherical score", - instances = ["spherical_score",], - target_scitype = Union{ - Arr{<:Union{Missing,Multiclass}}, - Arr{<:Union{Missing,OrderedFactor}}, - Arr{<:Union{Missing,Continuous}}, - Arr{<:Union{Missing,Count}}}, - prediction_type = :probabilistic, - orientation = :score, - distribution_type = Union{WITH_L2NORM...}) - -@create_aliases SphericalScore - -@create_docs(SphericalScore, -body= -""" -Convention as in $PROPER_SCORING_RULES: If `η` takes on a finite -number of classes `C` and ``p(η)`` is the predicted probability for a -*single* observation `η`, then the corresponding score for that -observation is given by - -``\\left(p(η) / \\|p\\|_α\\right)^{α - 1}, \\qquad \\|p\\|_α = \\left(\\sum_{c ∈ C} p(c)^α\\right)^{1/α},`` - -where `α` is the measure parameter `alpha`. 
- -$DOC_DISTRIBUTIONS - -""", -scitype=DOC_MULTI) - -# calling on single observations: -function single(s::SphericalScore, d::UnivariateFinite, η) - α = s.alpha - levels = classes(d) - pvec = broadcast(pdf, d, levels) - return (pdf(d, η)/norm(pvec, α))^(α - 1) -end - -single(s::SphericalScore, d::Distributions.UnivariateDistribution, η) = - pdf(d, η)/sqrt(Distributions.pdfsquaredL2norm(d)) - -# to compute the α-norm along last dimension: -_norm(A::AbstractArray{<:Any,N}, α) where N = - sum(x -> x^α, A, dims=N).^(1/α) - -# To get performant version in case of UnivariateFiniteArray: -function call( - s::SphericalScore, - ŷ::UnivariateFiniteArray, - y - ) - α = s.alpha - alphanorm(A) = _norm(A, α) - - predicted_probs = pdf(ŷ, classes(first(ŷ))) - - (broadcast(pdf, ŷ, y) ./ alphanorm(predicted_probs)).^(α - 1) -end -call(s::SphericalScore, ŷ::UnivariateFiniteArray, y, w::AbstractArray) = - call(s, ŷ, y) .* w - - -# --------------------------------------------------------------------------- -# Extra check for L2 norm based proper scoring rules - -err_l2_norm(m) = ArgumentError( - "Distribution not supported by $m. "* - "Supported distributions are "* - join(string.(map(s->"`$s`", WITH_L2NORM)), ", ", ", and ")) - -const ERR_UNSUPPORTED_ALPHA = ArgumentError( - "Only `alpha = 2` is supported, unless scoring a `Finite` target. ") - -# not for export: -const L2ProperScoringRules = Union{LogScore, - LogLoss, - BrierScore, - BrierLoss, - SphericalScore} - -function extra_check(measure::L2ProperScoringRules, yhat, args...) - - D = nonmissing(eltype(yhat)) - D <: Distributions.Distribution || D <: UnivariateFinite || - (D = typeof(findfirst(x->!isinvalid(x), yhat))) - D <: Union{Nothing, WITH_L2NORM...} || - throw(err_l2_norm(measure)) - - if measure isa SphericalScore - measure.alpha == 2 || throw(ERR_UNSUPPORTED_ALPHA) - end - - return nothing -end diff --git a/src/measures/roc.jl b/src/measures/roc.jl deleted file mode 100644 index 8614b00e..00000000 --- a/src/measures/roc.jl +++ /dev/null @@ -1,91 +0,0 @@ -## ROC COMPUTATION - -""" - _idx_unique_sorted(v) - -Internal function to return the index of unique elements in `v` under the -assumption that the vector `v` is sorted in decreasing order. -""" -function _idx_unique_sorted(v::Vec{<:Real}) - n = length(v) - idx = ones(Int, n) - p, h = 1, 1 - cur = v[1] - @inbounds while h < n - h += 1 # head position - cand = v[h] # candidate value - cand < cur || continue # is it new? otherwise skip - p += 1 # if new store it - idx[p] = h - cur = cand # and update the last seen value - end - p < n && deleteat!(idx, p+1:n) - return idx -end - -""" - fprs, tprs, ts = roc_curve(ŷ, y) = roc(ŷ, y) - -Return the ROC curve for a two-class probabilistic prediction `ŷ` given the -ground truth `y`. The true positive rates, false positive rates over a range -of thresholds `ts` are returned. Note that if there are `k` unique scores, -there are correspondingly `k` thresholds and `k+1` "bins" over which the FPR -and TPR are constant: - -* `[0.0 - thresh[1]]` -* `[thresh[1] - thresh[2]]` -* ... -* `[thresh[k] - 1]` - -consequently, `tprs` and `fprs` are of length `k+1` if `ts` is of length `k`. - -To draw the curve using your favorite plotting backend, do `plot(fprs, tprs)`. 
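For example, as a hedged sketch (the second level, "pos", is treated as
positive; rows of the `UnivariateFinite` array are observations):

    y = categorical(["neg", "neg", "pos", "pos"])
    ŷ = UnivariateFinite(levels(y), [0.9 0.1; 0.6 0.4; 0.3 0.7; 0.2 0.8], pool=y)
    fprs, tprs, ts = roc_curve(ŷ, y)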
-""" -function roc_curve(ŷm, ym) - ŷ, y = skipinvalid(ŷm, ym) - length(classes(ŷ)) == 2 || throw( - ArgumentError("`ŷ` must be a two-class probabilistic prediction") - ) - length(levels(y)) == 2 || throw( - ArgumentError("`y` must be a categorical vector with two-levels.") - ) - n = length(y) - lab_pos = levels(y)[2] - scores = pdf.(ŷ, lab_pos) - ranking = sortperm(scores, rev=true) - - scores_sort = scores[ranking] - y_sort_bin = (y[ranking] .== lab_pos) - - idx_unique = _idx_unique_sorted(scores_sort) - thresholds = scores_sort[idx_unique] - - # detailed computations with example: - # y = [ 1 0 0 1 0 0 1] - # s = [0.5 0.5 0.2 0.2 0.1 0.1 0.1] thresh are 0.5 0.2, 0.1 // idx [1, 3, 5] - # ŷ = [ 0 0 0 0 0 0 0] (0.5 - 1.0] # no pos pred - # ŷ = [ 1 1 0 0 0 0 0] (0.2 - 0.5] # 2 pos pred - # ŷ = [ 1 1 1 1 0 0 0] (0.1 - 0.2] # 4 pos pred - # ŷ = [ 1 1 1 1 1 1 1] [0.0 - 0.1] # all pos pre - - idx_unique_2 = idx_unique[2:end] # [3, 5] - n_ŷ_pos = idx_unique_2 .- 1 # [2, 4] implicit [0, 2, 4, 7] - - cs = cumsum(y_sort_bin) # [1, 1, 1, 2, 2, 2, 3] - n_tp = cs[n_ŷ_pos] # [1, 2] implicit [0, 1, 2, 3] - n_fp = n_ŷ_pos .- n_tp # [1, 2] implicit [0, 1, 2, 4] - - # add end points - P = sum(y_sort_bin) # total number of true positives - N = n - P # total number of true negatives - - n_tp = [0, n_tp..., P] # [0, 1, 2, 3] - n_fp = [0, n_fp..., N] # [0, 1, 2, 4] - - tprs = n_tp ./ P # [0/3, 1/3, 2/3, 1] - fprs = n_fp ./ N # [0/4, 1/4, 2/4, 1] - - return fprs, tprs, thresholds -end - -const roc = roc_curve diff --git a/src/resampling.jl b/src/resampling.jl index 43483cc3..6b055951 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -14,8 +14,6 @@ const PREDICT_OPERATIONS_STRING = begin join(strings, ", ", ", or ") end const PROG_METER_DT = 0.1 -const ERR_WEIGHTS_REAL = - ArgumentError("`weights` must be a `Real` vector. ") const ERR_WEIGHTS_LENGTH = DimensionMismatch("`weights` and target "* "have different lengths. ") @@ -32,19 +30,41 @@ const ERR_INVALID_OPERATION = ArgumentError( "Invalid `operation` or `operations`. "* "An operation must be one of these: $PREDICT_OPERATIONS_STRING. ") _ambiguous_operation(model, measure) = - "`prediction_type($measure) == $(prediction_type(measure))` but "* - "`prediction_type($model) == $(prediction_type(model))`." + "`$measure` does not support a `model` with "* + "`prediction_type(model) == :$(prediction_type(model))`. " err_ambiguous_operation(model, measure) = ArgumentError( _ambiguous_operation(model, measure)* - "\nUnable to deduce an appropriate operation for $measure. "* + "\nUnable to infer an appropriate operation for `$measure`. "* "Explicitly specify `operation=...` or `operations=...`. ") err_incompatible_prediction_types(model, measure) = ArgumentError( _ambiguous_operation(model, measure)* - "If your model really is making probabilistic predictions, try explicitly "* + "If your model is truly making probabilistic predictions, try explicitly "* "specifiying operations. For example, for "* "`measures = [area_under_curve, accuracy]`, try "* "`operations=[predict, predict_mode]`. ") - +const LOG_AVOID = "\nTo override measure checks, set check_measure=false. " +const LOG_SUGGESTION1 = + "\nPerhaps you want to set `operation="* + "predict_mode` or need to "* + "specify multiple operations, "* + "one for each measure. " +const LOG_SUGGESTION2 = + "\nPerhaps you want to set `operation="* + "predict_mean` or `operation=predict_median`, or "* + "specify multiple operations, "* + "one for each measure. 
" +ERR_MEASURES_OBSERVATION_SCITYPE(measure, T_measure, T) = ArgumentError( + "\nobservation scitype of target = `$T` but ($measure) only supports "* + "`$T_measure`."*LOG_AVOID +) +ERR_MEASURES_PROBABILISTIC(measure, suggestion) = ArgumentError( + "The model subtypes `Probabilistic`, and so is not supported by "* + "`$measure`. $suggestion"*LOG_AVOID +) +ERR_MEASURES_DETERMINISTIC(measure) = ArgumentError( + "The model subtypes `Deterministic`, "* + "and so is not supported by `$measure`. "*LOG_AVOID +) # ================================================================== ## MODEL TYPES THAT CAN BE EVALUATED @@ -345,7 +365,7 @@ For example, if you run `replace!(y, 'a' => 'b', 'b' => 'a')` and then re-run `train_test_pairs`, the returned `(train, test)` pairs will be the same. Pre-shuffling of `rows` is controlled by `rng` and `shuffle`. If `rng` -is an integer, then the `StratifedCV` keyword constructor resets it to +is an integer, then the `StratifedCV` keywod constructor resets it to `MersenneTwister(rng)`. Otherwise some `AbstractRNG` object is expected. @@ -448,72 +468,68 @@ end """ PerformanceEvaluation -Type of object returned by [`evaluate`](@ref) (for models plus data) -or [`evaluate!`](@ref) (for machines). Such objects encode estimates -of the performance (generalization error) of a supervised model or -outlier detection model. - -When `evaluate`/`evaluate!` is called, a number of train/test pairs -("folds") of row indices are generated, according to the options -provided, which are discussed in the [`evaluate!`](@ref) -doc-string. Rows correspond to observations. The generated train/test -pairs are recorded in the `train_test_rows` field of the -`PerformanceEvaluation` struct, and the corresponding estimates, -aggregated over all train/test pairs, are recorded in `measurement`, a -vector with one entry for each measure (metric) recorded in `measure`. - -When displayed, a `PerformanceEvalution` object includes a value under -the heading `1.96*SE`, derived from the standard error of the `per_fold` -entries. This value is suitable for constructing a formal 95% -confidence interval for the given `measurement`. Such intervals should -be interpreted with caution. See, for example, Bates et al. -[(2021)](https://arxiv.org/abs/2104.00673). +Type of object returned by [`evaluate`](@ref) (for models plus data) or +[`evaluate!`](@ref) (for machines). Such objects encode estimates of the performance +(generalization error) of a supervised model or outlier detection model. + +When `evaluate`/`evaluate!` is called, a number of train/test pairs ("folds") of row +indices are generated, according to the options provided, which are discussed in the +[`evaluate!`](@ref) doc-string. Rows correspond to observations. The generated train/test +pairs are recorded in the `train_test_rows` field of the `PerformanceEvaluation` struct, +and the corresponding estimates, aggregated over all train/test pairs, are recorded in +`measurement`, a vector with one entry for each measure (metric) recorded in `measure`. + +When displayed, a `PerformanceEvalution` object includes a value under the heading +`1.96*SE`, derived from the standard error of the `per_fold` entries. This value is +suitable for constructing a formal 95% confidence interval for the given +`measurement`. Such intervals should be interpreted with caution. See, for example, Bates +et al. [(2021)](https://arxiv.org/abs/2104.00673). ### Fields -These fields are part of the public API of the `PerformanceEvaluation` -struct. 
+These fields are part of the public API of the `PerformanceEvaluation` struct.

- `model`: model used to create the performance evaluation. In the case a tuning
  model, this is the best model found.

- `measure`: vector of measures (metrics) used to evaluate performance

-- `measurement`: vector of measurements - one for each element of
-  `measure` - aggregating the performance measurements over all
-  train/test pairs (folds). The aggregation method applied for a given
-  measure `m` is `aggregation(m)` (commonly `Mean` or `Sum`)
+- `measurement`: vector of measurements - one for each element of `measure` - aggregating
+  the performance measurements over all train/test pairs (folds). The aggregation method
+  applied for a given measure `m` is
+  `StatisticalMeasuresBase.external_aggregation_mode(m)` (commonly `Mean()` or `Sum()`)

-- `operation` (e.g., `predict_mode`): the operations applied for each
-  measure to generate predictions to be evaluated. Possibilities are:
-  $PREDICT_OPERATIONS_STRING.
+- `operation` (e.g., `predict_mode`): the operations applied for each measure to generate
+  predictions to be evaluated. Possibilities are: $PREDICT_OPERATIONS_STRING.

-- `per_fold`: a vector of vectors of individual test fold evaluations
-  (one vector per measure). Useful for obtaining a rough estimate of
-  the variance of the performance estimate.
+- `per_fold`: a vector of vectors of individual test fold evaluations (one vector per
+  measure). Useful for obtaining a rough estimate of the variance of the performance
+  estimate.

-- `per_observation`: a vector of vectors of individual observation
-  evaluations of those measures for which
-  `reports_each_observation(measure)` is true, which is otherwise
-  reported `missing`. Useful for some forms of hyper-parameter
-  optimization.
+- `per_observation`: a vector of vectors of vectors containing individual per-observation
+  measurements: for an evaluation `e`, `e.per_observation[m][f][i]` is the measurement for
+  the `i`th observation in the `f`th test fold, evaluated using the `m`th measure. Useful
+  for some forms of hyper-parameter optimization. Note that an aggregated measurement
+  for some measure `measure` is repeated across all observations in a fold if
+  `StatisticalMeasures.can_report_unaggregated(measure) == false`. If `e` has been computed
+  with the `per_observation=false` option, then `e.per_observation` is a vector of
+  `missings`.

-- `fitted_params_per_fold`: a vector containing `fitted params(mach)`
-  for each machine `mach` trained during resampling - one machine per
-  train/test pair. Use this to extract the learned parameters for each
-  individual training event.
+- `fitted_params_per_fold`: a vector containing `fitted_params(mach)` for each machine
+  `mach` trained during resampling - one machine per train/test pair. Use this to extract
+  the learned parameters for each individual training event.

-- `report_per_fold`: a vector containing `report(mach)` for each
-  machine `mach` training in resampling - one machine per train/test
-  pair.
+- `report_per_fold`: a vector containing `report(mach)` for each machine `mach` trained
+  during resampling - one machine per train/test pair.

-- `train_test_rows`: a vector of tuples, each of the form `(train, test)`,
-  where `train` and `test` are vectors of row (observation) indices for
-  training and evaluation respectively.
+- `train_test_rows`: a vector of tuples, each of the form `(train, test)`, where `train`
+  and `test` are vectors of row (observation) indices for training and evaluation
+  respectively.

- `resampling`: the resampling strategy used to generate the train/test pairs.

- `repeats`: the number of times the resampling strategy was repeated.

+
"""
struct PerformanceEvaluation{M,
                             Measure,
@@ -617,48 +633,37 @@ end

function _check_measure(measure, operation, model, y)

-    T = scitype(y)
+    # get observation scitype:
+    T = MLJBase.guess_observation_scitype(y)
+
+    # get type supported by measure:
+    T_measure = StatisticalMeasuresBase.observation_scitype(measure)

    T == Unknown && (return true)
-    target_scitype(measure) == Unknown && (return true)
-    prediction_type(measure) == :unknown && (return true)
+    T_measure == Union{} && (return true)
+    isnothing(StatisticalMeasuresBase.kind_of_proxy(measure)) && (return true)

-    avoid = "\nTo override measure checks, set check_measure=false. "

-    T <: target_scitype(measure) ||
-        throw(ArgumentError(
-            "\nscitype of target = $T but target_scitype($measure) = "*
-            "$(target_scitype(measure))."*avoid))
+    T <: T_measure || throw(ERR_MEASURES_OBSERVATION_SCITYPE(measure, T_measure, T))

    incompatible = model isa Probabilistic &&
        operation == predict &&
-        prediction_type(measure) != :probabilistic
+        StatisticalMeasuresBase.kind_of_proxy(measure) != LearnAPI.Distribution()

    if incompatible
-        if target_scitype(measure) <:
-            AbstractVector{<:Union{Missing,Finite}}
-            suggestion = "\nPerhaps you want to set `operation="*
-                "predict_mode` or need to "*
-                "specify multiple operations, "*
-                "one for each measure. "
-        elseif target_scitype(measure) <:
-            AbstractVector{<:Union{Missing,Continuous}}
-            suggestion = "\nPerhaps you want to set `operation="*
-                "predict_mean` or `operation=predict_median`, or "*
-                "specify multiple operations, "*
-                "one for each measure. "
+        if T <: Union{Missing,Finite}
+            suggestion = LOG_SUGGESTION1
+        elseif T <: Union{Missing,Infinite}
+            suggestion = LOG_SUGGESTION2
        else
            suggestion = ""
        end
-        throw(ArgumentError(
-            "\n$model <: Probabilistic but prediction_type($measure) = "*
-            ":$(prediction_type(measure)). "*suggestion*avoid))
+        throw(ERR_MEASURES_PROBABILISTIC(measure, suggestion))
    end

-    model isa Deterministic && prediction_type(measure) != :deterministic &&
-        throw(ArgumentError("$model <: Deterministic but "*
-                            "prediction_type($measure) ="*
-                            ":$(prediction_type(measure))."*avoid))
+    model isa Deterministic &&
+        StatisticalMeasuresBase.kind_of_proxy(measure) != LearnAPI.LiteralTarget() &&
+        throw(ERR_MEASURES_DETERMINISTIC(measure))

    return true

@@ -682,13 +687,14 @@ function _actual_measures(measures, model)
        _measures = measures
    end

-    return _measures
+    # wrap in `robust_measure` to allow unsupported weights to be silently treated as
+    # uniform when invoked; `_check_measure` will throw appropriate errors unless
+    # checks are explicitly suppressed.
+    return StatisticalMeasuresBase.robust_measure.(_measures)

end

function _check_weights(weights, nrows)
-    weights isa AbstractVector{<:Real} ||
-        throw(ERR_WEIGHTS_REAL)
    length(weights) == nrows ||
        throw(ERR_WEIGHTS_LENGTH)
    return true
@@ -741,21 +747,35 @@ function _actual_operations(operation::Nothing,
                            verbosity)
    map(measures) do m

-        prediction_type = MLJBase.prediction_type(m)
-        target_scitype = MLJBase.target_scitype(m)
+        # `kind_of_proxy` is the measure trait corresponding to the `prediction_type`
+        # model trait.
+        # But its values are instances of `LearnAPI.KindOfProxy`, rather than
+        # symbols:
+        #
+        # `LearnAPI.LiteralTarget()` ~ `:deterministic` (`model isa Deterministic`)
+        # `LearnAPI.Distribution()` ~ `:probabilistic` (`model isa Probabilistic`)
+        #
+        kind_of_proxy = StatisticalMeasuresBase.kind_of_proxy(m)

-        if prediction_type === :unknown
-            return predict
-        end
+        # `observation_scitype` is the measure trait which we need to match against the
+        # model `target_scitype`; but the latter refers to the whole target `y`, not a
+        # single observation.
+        observation_scitype = StatisticalMeasuresBase.observation_scitype(m)
+
+        # One day, models will implement LearnAPI and will get their own `kind_of_proxy`
+        # trait, replacing `prediction_type`, and their own `observation_scitype` trait,
+        # replacing `target_scitype`.
+
+        isnothing(kind_of_proxy) && (return predict)

        if MLJBase.prediction_type(model) === :probabilistic
-            if prediction_type === :probabilistic
+            if kind_of_proxy === LearnAPI.Distribution()
                return predict
-            elseif prediction_type === :deterministic
-                if target_scitype <: AbstractArray{<:Union{Missing,Finite}}
+            elseif kind_of_proxy === LearnAPI.LiteralTarget()
+                if observation_scitype <: Union{Missing,Finite}
                    return predict_mode
-                elseif target_scitype <:
-                    AbstractArray{<:Union{Missing,Continuous,Count}}
+                elseif observation_scitype <: Union{Missing,Infinite}
                    return predict_mean
                else
                    throw(err_ambiguous_operation(model, m))
@@ -764,19 +784,21 @@ function _actual_operations(operation::Nothing,
                throw(err_ambiguous_operation(model, m))
            end
        elseif MLJBase.prediction_type(model) === :deterministic
-            if prediction_type === :probabilistic
+            if kind_of_proxy === LearnAPI.Distribution()
                throw(err_incompatible_prediction_types(model, m))
-            elseif prediction_type === :deterministic
+            elseif kind_of_proxy === LearnAPI.LiteralTarget()
                return predict
            else
                throw(err_ambiguous_operation(model, m))
            end
-        else
-            if prediction_type === :interval
+        elseif MLJBase.prediction_type(model) === :interval
+            if kind_of_proxy === LearnAPI.ConfidenceInterval()
                return predict
            else
                throw(err_ambiguous_operation(model, m))
            end
+        else
+            throw(err_ambiguous_operation(model, m))
        end
    end
end
@@ -820,158 +842,123 @@ _process_accel_settings(accel) = throw(ArgumentError("unsupported" *

# --------------------------------------------------------------
# User interface points: `evaluate!` and `evaluate`

+const RESAMPLING_STRATEGIES = subtypes(ResamplingStrategy)
+const RESAMPLING_STRATEGIES_LIST =
+    join(
+        map(RESAMPLING_STRATEGIES) do s
+            name = split(string(s), ".") |> last
+            "`$name`"
+        end,
+        ", ",
+        " and ",
+    )
+
"""
    log_evaluation(logger, performance_evaluation)

-Log a performance evaluation to `logger`, an object specific to some logging
-platform, such as mlflow. If `logger=nothing` then no logging is performed.
-The method is called at the end of every call to `evaluate/evaluate!` using
-the logger provided by the `logger` keyword argument.
+
+Log a performance evaluation to `logger`, an object specific to some logging platform,
+such as mlflow. If `logger=nothing` then no logging is performed. The method is called at
+the end of every call to `evaluate/evaluate!` using the logger provided by the `logger`
+keyword argument.
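+
+The overloading pattern described under "Implementations for new logging platforms" below
+might look like the following sketch, in which `MyLogger` is a purely hypothetical
+logger type:
+
+    struct MyLogger
+        io::IO
+    end
+
+    function MLJBase.log_evaluation(logger::MyLogger, performance_evaluation)
+        # write one line per measure, e.g. "LogLoss() = 0.35":
+        for (measure, value) in zip(
+            performance_evaluation.measure,
+            performance_evaluation.measurement,
+        )
+            println(logger.io, string(measure, " = ", value))
+        end
+        return nothing
+    end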
+ # Implementations for new logging platforms -# -Julia interfaces to workflow logging platforms, such as mlflow (provided by -the MLFlowClient.jl interface) should overload -`log_evaluation(logger::LoggerType, performance_evaluation)`, -where `LoggerType` is a platform-specific type for logger objects. For an -example, see the implementation provided by the MLJFlow.jl package. + +Julia interfaces to workflow logging platforms, such as mlflow (provided by the +MLFlowClient.jl interface) should overload `log_evaluation(logger::LoggerType, +performance_evaluation)`, where `LoggerType` is a platform-specific type for logger +objects. For an example, see the implementation provided by the MLJFlow.jl package. + """ log_evaluation(logger, performance_evaluation) = nothing """ - evaluate!(mach, - resampling=CV(), - measure=nothing, - rows=nothing, - weights=nothing, - class_weights=nothing, - operation=nothing, - repeats=1, - acceleration=default_resource(), - force=false, - verbosity=1, - check_measure=true, - logger=nothing) - -Estimate the performance of a machine `mach` wrapping a supervised -model in data, using the specified `resampling` strategy (defaulting -to 6-fold cross-validation) and `measure`, which can be a single -measure or vector. - -Do `subtypes(MLJ.ResamplingStrategy)` to obtain a list of available -resampling strategies. If `resampling` is not an object of type -`MLJ.ResamplingStrategy`, then a vector of tuples (of the form -`(train_rows, test_rows)` is expected. For example, setting + evaluate!(mach; resampling=CV(), measure=nothing, options...) + +Estimate the performance of a machine `mach` wrapping a supervised model in data, using +the specified `resampling` strategy (defaulting to 6-fold cross-validation) and `measure`, +which can be a single measure or vector. Returns a [`PerformanceEvaluation`](@ref) +object. + +Available resampling strategies are $RESAMPLING_STRATEGIES_LIST. If `resampling` is not an +instance of one of these, then a vector of tuples of the form `(train_rows, test_rows)` +is expected. For example, setting resampling = [((1:100), (101:200)), ((101:200), (1:100))] gives two-fold cross-validation using the first 200 rows of data. -The type of operation (`predict`, `predict_mode`, etc) to be -associated with `measure` is automatically inferred from measure -traits where possible. For example, `predict_mode` will be used for a -`Multiclass` target, if `model` is probabilistic but `measure` is -deterministic. The operations applied can be inspected from the -`operation` field of the object returned. Alternatively, operations -can be explicitly specified using `operation=...`. If `measure` is a -vector, then `operation` must be a single operation, which will be -associated with all measures, or a vector of the same length as -`measure`. - -The resampling strategy is applied repeatedly (Monte Carlo resampling) -if `repeats > 1`. For example, if `repeats = 10`, then `resampling = -CV(nfolds=5, shuffle=true)`, generates a total of 50 `(train, test)` -pairs for evaluation and subsequent aggregation. +Any measure conforming to the +[StatisticalMeasuresBase.jl](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/) +API can be provided, assuming it can consume multiple observations. -If `resampling isa MLJ.ResamplingStrategy` then one may optionally -restrict the data used in evaluation by specifying `rows`. +Although `evaluate!` is mutating, `mach.model` and `mach.args` are not mutated. 
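+
+A minimal usage sketch, assuming a machine `mach` already wraps a probabilistic
+classifier, with measures provided by StatisticalMeasures.jl:
+
+    e = evaluate!(
+        mach;
+        resampling=CV(nfolds=3, shuffle=true),
+        measures=[log_loss, accuracy],
+        operations=[predict, predict_mode],
+    )
+    e.measurement  # aggregated results, one entry per measure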
-An optional `weights` vector may be passed for measures that support
-sample weights (`MLJ.supports_weights(measure) == true`), which is
-ignored by those that don't. These weights are not to be confused with
-any weights `w` bound to `mach` (as in `mach = machine(model, X,
-y, w)`). To pass these to the performance evaluation measures you must
-explictly specify `weights=w` in the `evaluate!` call.
+# Additional keyword options

-Additionally, optional `class_weights` dictionary may be passed
-for measures that support class weights
-(`MLJ.supports_class_weights(measure) == true`), which is
-ignored by those that don't. These weights are not to be confused with
-any weights `class_w` bound to `mach` (as in `mach = machine(model, X,
-y, class_w)`). To pass these to the performance evaluation measures you
-must explictly specify `class_weights=w` in the `evaluate!` call.
+- `rows` - vector of observation indices from which both train and test folds are
+  constructed (default is all observations)

-User-defined measures are supported; see the manual for details.
+- `operation`/`operations=nothing` - One of $PREDICT_OPERATIONS_STRING, or a vector of
+  these of the same length as `measure`/`measures`. Automatically inferred if left
+  unspecified. For example, `predict_mode` will be used for a `Multiclass` target, if
+  `model` is a probabilistic predictor, but `measure` expects literal (point) target
+  predictions. Operations actually applied can be inspected from the `operation` field of
+  the object returned.

-If no measure is specified, then `default_measure(mach.model)` is
-used, unless this default is `nothing` and an error is thrown.
+- `weights` - per-sample `Real` weights for measures that support them (not to be confused
+  with weights used in training, such as the `w` in `mach = machine(model, X, y, w)`).

-The `acceleration` keyword argument is used to specify the compute resource (a
-subtype of `ComputationalResources.AbstractResource`) that will be used to
-accelerate/parallelize the resampling operation.
+- `class_weights` - dictionary of `Real` per-class weights for use with measures that
+  support these, in classification problems (not to be confused
+  with weights used in training, such as the `w` in `mach = machine(model, X, y, w)`).

-Although `evaluate!` is mutating, `mach.model` and `mach.args` are
-untouched.
+- `repeats::Int=1`: set to a higher value for repeated (Monte Carlo)
+  resampling. For example, if `repeats = 10`, then `resampling = CV(nfolds=5,
+  shuffle=true)` generates a total of 50 `(train, test)` pairs for evaluation and
+  subsequent aggregation.

-### Summary of key-word arguments
+- `acceleration=CPU1()`: acceleration/parallelization option; can be any instance of
+  `CPU1` (single-threaded computation), `CPUThreads` (multi-threaded computation) or
+  `CPUProcesses` (multi-process computation); default is `default_resource()`. These types
+  are owned by ComputationalResources.jl.
-
-- `resampling` - resampling strategy (default is `CV(nfolds=6)`)
-
-- `measure`/`measures` - measure or vector of measures (losses, scores, etc)
-
-- `rows` - vector of observation indices from which both train and
-  test folds are constructed (default is all observations)
-
-- `weights` - per-sample weights for measures that support them (not
-  to be confused with weights used in training)
-
-- `class_weights` - dictionary of per-class weights for use with
-  measures that support these, in classification problems (not to be
-  confused with per-sample `weights` or with class weights used in
-  training)
-
-- `operation`/`operations` - One of $PREDICT_OPERATIONS_STRING, or a
-  vector of these of the same length as
-  `measure`/`measures`. Automatically inferred if left unspecified.
-
-- `repeats` - default is 1; set to a higher value for repeated
-  (Monte Carlo) resampling
-
-- `acceleration` - parallelization option; currently supported
-  options are instances of `CPU1` (single-threaded computation)
-  `CPUThreads` (multi-threaded computation) and `CPUProcesses`
-  (multi-process computation); default is `default_resource()`.
-
-- `force` - default is `false`; set to `true` for force cold-restart
+- `force=false`: set to `true` to force cold-restart of each training event

-- `verbosity` level, an integer defaulting to 1.
-
-- `check_measure` - default is `true`
+- `verbosity::Int=1`: logging level; can be negative

-- `logger` - a logger object (see [`MLJBase.log_evaluation`](@ref))
+- `check_measure=true`: whether to screen measures for possible incompatibility with the
+  model. Will not catch all incompatibilities.

+- `per_observation=true`: whether to calculate estimates for individual observations; if
+  `false` the `per_observation` field of the returned object is populated with
+  `missing`s. Setting to `false` may reduce compute time and allocations.

-### Return value
+- `logger` - a logger object (see [`MLJBase.log_evaluation`](@ref))

-A [`PerformanceEvaluation`](@ref) object. See
-[`PerformanceEvaluation`](@ref) for details.
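+
+For example, continuing the earlier sketch, and keeping the default
+`per_observation=true`:
+
+    e = evaluate!(mach; resampling=CV(nfolds=3), measure=log_loss)
+    e.per_observation[1][2]  # `log_loss` for each observation in the second test fold
+    e.per_fold[1]            # `log_loss` aggregated within each of the three folds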
+See also [`evaluate`](@ref), [`PerformanceEvaluation`](@ref) """ -function evaluate!(mach::Machine{<:Measurable}; - resampling=CV(), - measures=nothing, - measure=measures, - weights=nothing, - class_weights=nothing, - operations=nothing, - operation=operations, - acceleration=default_resource(), - rows=nothing, - repeats=1, - force=false, - check_measure=true, - verbosity=1, - logger=nothing) + +function evaluate!( + mach::Machine{<:Measurable}; + resampling=CV(), + measures=nothing, + measure=measures, + weights=nothing, + class_weights=nothing, + operations=nothing, + operation=operations, + acceleration=default_resource(), + rows=nothing, + repeats=1, + force=false, + check_measure=true, + per_observation=true, + verbosity=1, + logger=nothing, + ) # this method just checks validity of options, preprocess the # weights, measures, operations, and dispatches a @@ -1005,26 +992,52 @@ function evaluate!(mach::Machine{<:Measurable}; verbosity, check_measure) - _warn_about_unsupported(supports_weights, - "Sample", _measures, weights, verbosity) - _warn_about_unsupported(supports_class_weights, - "Class", _measures, class_weights, verbosity) + _warn_about_unsupported( + StatisticalMeasuresBase.supports_weights, + "Sample", + _measures, + weights, + verbosity, + ) + _warn_about_unsupported( + StatisticalMeasuresBase.supports_class_weights, + "Class", + _measures, + class_weights, + verbosity, + ) _acceleration= _process_accel_settings(acceleration) - evaluate!(mach, resampling, weights, class_weights, rows, verbosity, - repeats, _measures, _operations, _acceleration, force, logger, - resampling) - + evaluate!( + mach, + resampling, + weights, + class_weights, + rows, + verbosity, + repeats, + _measures, + _operations, + _acceleration, + force, + per_observation, + logger, + resampling, + ) end """ - evaluate(model, data...; cache=true, kw_options...) + evaluate(model, data...; cache=true, options...) Equivalent to `evaluate!(machine(model, data..., cache=cache); -wk_options...)`. See the machine version `evaluate!` for the complete +options...)`. See the machine version `evaluate!` for the complete list of options. +Returns a [`PerformanceEvaluation`](@ref) object. + +See also [`evaluate!`](@ref). + """ evaluate(model::Measurable, args...; cache=true, kwargs...) = evaluate!(machine(model, args...; cache=cache); kwargs...) @@ -1173,30 +1186,32 @@ const AbstractRow = Union{AbstractVector{<:Integer}, Colon} const TrainTestPair = Tuple{AbstractRow, AbstractRow} const TrainTestPairs = AbstractVector{<:TrainTestPair} -# helper: -_feature_dependencies_exist(measures) = - !all(m->!(is_feature_dependent(m)), measures) - -# helper: -function measure_specific_weights(measure, weights, class_weights, test) - supports_weights(measure) && supports_class_weights(measure) && - error("Encountered a measure that simultaneously supports "* - "(per-sample) weights and class weights. 
") - if supports_weights(measure) - weights === nothing && return nothing - return weights[test] - end - supports_class_weights(measure) && return class_weights - return nothing -end +_view(::Nothing, rows) = nothing +_view(weights, rows) = view(weights, rows) # Evaluation when `resampling` is a TrainTestPairs (CORE EVALUATOR): -# `user_resampling` keyword argument is the user defined resampling strategy -function evaluate!(mach::Machine, resampling, weights, class_weights, rows, - verbosity, repeats, measures, operations, acceleration, - force, logger, user_resampling) +function evaluate!( + mach::Machine, + resampling, + weights, + class_weights, + rows, + verbosity, + repeats, + measures, + operations, + acceleration, + force, + per_observation_flag, + logger, + user_resampling, + ) + + # Note: `user_resampling` keyword argument is the user-defined resampling strategy, + # while `resampling` is always a `TrainTestPairs`. - # Note: `rows` and `repeats` are ignored here + # Note: `rows` and `repeats` are only passed to the final `PeformanceEvaluation` + # object to be returned and are not otherwise used here. if !(resampling isa TrainTestPairs) error("`resampling` must be an "* @@ -1206,12 +1221,21 @@ function evaluate!(mach::Machine, resampling, weights, class_weights, rows, X = mach.args[1]() y = mach.args[2]() + nrows = MLJBase.nrows(y) nfolds = length(resampling) + test_fold_sizes = map(resampling) do train_test_pair + test = last(train_test_pair) + test isa Colon && (return nrows) + length(test) + end - nmeasures = length(measures) + # weights used to aggregate per-fold measurements, which depends on a measures + # external mode of aggregation: + fold_weights(mode) = nfolds .* test_fold_sizes ./ sum(test_fold_sizes) + fold_weights(::StatisticalMeasuresBase.Sum) = nothing - feature_dependencies_exist = _feature_dependencies_exist(measures) + nmeasures = length(measures) function fit_and_extract_on_fold(mach, k) train, test = resampling[k] @@ -1220,21 +1244,27 @@ function evaluate!(mach::Machine, resampling, weights, class_weights, rows, # that appear (`predict`, `predict_mode`, etc): yhat_given_operation = Dict(op=>op(mach, rows=test) for op in unique(operations)) - if feature_dependencies_exist - Xtest = selectrows(X, test) - else - Xtest = nothing - end - ytest = selectrows(y, test) - measurements = map(measures, operations) do m, op - wtest = measure_specific_weights( - m, - weights, - class_weights, - test - ) - value(m, yhat_given_operation[op], Xtest, ytest, wtest) + ytest = selectrows(y, test) + if per_observation_flag + measurements = map(measures, operations) do m, op + StatisticalMeasuresBase.measurements( + m, + yhat_given_operation[op], + ytest, + _view(weights, test), + class_weights, + ) + end + else + measurements = map(measures, operations) do m, op + m( + yhat_given_operation[op], + ytest, + _view(weights, test), + class_weights, + ) + end end fp = fitted_params(mach) @@ -1267,27 +1297,38 @@ function evaluate!(mach::Machine, resampling, weights, class_weights, rows, measurements_flat = vcat(measurements_vector_of_vectors...) - # in the following rows=folds, columns=measures: + # In the `measurements_matrix` below, rows=folds, columns=measures; each element of + # the matrix is: + # + # - a vector of meausurements, one per observation within a fold, if + # - `per_observation_flag = true`; or + # + # - a single measurment for the whole fold, if `per_observation_flag = false`. 
+    #
    measurements_matrix = permutedims(
        reshape(collect(measurements_flat), (nmeasures, nfolds))
    )

    # measurements for each observation:
-    per_observation = map(1:nmeasures) do k
-        m = measures[k]
-        if reports_each_observation(m)
-            measurements_matrix[:,k]
-        else
-            missing
-        end
+    per_observation = if per_observation_flag
+        map(1:nmeasures) do k
+            measurements_matrix[:,k]
+        end
+    else
+        fill(missing, nmeasures)
    end

    # measurements for each fold:
-    per_fold = map(1:nmeasures) do k
-        m = measures[k]
-        if reports_each_observation(m)
-            broadcast(MLJBase.aggregate, per_observation[k], [m,])
-        else
+    per_fold = if per_observation_flag
+        map(1:nmeasures) do k
+            m = measures[k]
+            mode = StatisticalMeasuresBase.external_aggregation_mode(m)
+            map(per_observation[k]) do v
+                StatisticalMeasuresBase.aggregate(v; mode)
+            end
+        end
+    else
+        map(1:nmeasures) do k
            measurements_matrix[:,k]
        end
    end
@@ -1295,7 +1336,12 @@ function evaluate!(mach::Machine, resampling, weights, class_weights, rows,
    # overall aggregates:
    per_measure = map(1:nmeasures) do k
        m = measures[k]
-        MLJBase.aggregate(per_fold[k], m)
+        mode = StatisticalMeasuresBase.external_aggregation_mode(m)
+        StatisticalMeasuresBase.aggregate(
+            per_fold[k];
+            mode,
+            weights=fold_weights(mode),
+        )
    end

    evaluation = PerformanceEvaluation(
@@ -1358,39 +1404,36 @@ end
        repeats = 1,
        acceleration=default_resource(),
        check_measure=true,
-        logger=nothing
+        per_observation=true,
+        logger=nothing,
    )

-Resampling model wrapper, used internally by the `fit` method of
-`TunedModel` instances and `IteratedModel` instances. See
-[`evaluate!](@ref) for options. Not intended for general use.
+Resampling model wrapper, used internally by the `fit` method of `TunedModel` instances
+and `IteratedModel` instances. See [`evaluate!`](@ref) for options. Not intended for use
+by the general user, who will ordinarily use [`evaluate!`](@ref) directly.

-Given a machine `mach = machine(resampler, args...)` one obtains a
-performance evaluation of the specified `model`, performed according
-to the prescribed `resampling` strategy and other parameters, using
-data `args...`, by calling `fit!(mach)` followed by
-`evaluate(mach)`.
+Given a machine `mach = machine(resampler, args...)` one obtains a performance evaluation
+of the specified `model`, performed according to the prescribed `resampling` strategy and
+other parameters, using data `args...`, by calling `fit!(mach)` followed by
+`evaluate(mach)`.

-On subsequent calls to `fit!(mach)` new train/test pairs of row
-indices are only regenerated if `resampling`, `repeats` or `cache`
-fields of `resampler` have changed. The evolution of an RNG field of
-`resampler` does *not* constitute a change (`==` for `MLJType` objects
-is not sensitive to such changes; see [`is_same_except'](@ref)).
+On subsequent calls to `fit!(mach)` new train/test pairs of row indices are only
+regenerated if `resampling`, `repeats` or `cache` fields of `resampler` have changed. The
+evolution of an RNG field of `resampler` does *not* constitute a change (`==` for
+`MLJType` objects is not sensitive to such changes; see [`is_same_except`](@ref)).

-If there is single train/test pair, then warm-restart behavior of the
-wrapped model `resampler.model` will extend to warm-restart behaviour
-of the wrapper `resampler`, with respect to mutations of the wrapped
-model.
+If there is a single train/test pair, then warm-restart behavior of the wrapped model
+`resampler.model` will extend to warm-restart behaviour of the wrapper `resampler`, with
+respect to mutations of the wrapped model.
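+
+A minimal usage sketch, assuming `model`, `X` and `y` are already defined and `log_loss`
+is provided by StatisticalMeasures.jl:
+
+    resampler = Resampler(model=model, resampling=CV(nfolds=3), measure=log_loss)
+    mach = machine(resampler, X, y)
+    fit!(mach)
+    evaluate(mach)  # the `PerformanceEvaluation` computed during `fit!`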
-The sample `weights` are passed to the specified performance measures
-that support weights for evaluation. These weights are not to be
-confused with any weights bound to a `Resampler` instance in a
-machine, used for training the wrapped `model` when supported.
+The sample `weights` are passed to the specified performance measures that support weights
+for evaluation. These weights are not to be confused with any weights bound to a
+`Resampler` instance in a machine, used for training the wrapped `model` when supported.

-The sample `class_weights` are passed to the specified performance
-measures that support per-class weights for evaluation. These weights
-are not to be confused with any weights bound to a `Resampler` instance
-in a machine, used for training the wrapped `model` when supported.
+The sample `class_weights` are passed to the specified performance measures that support
+per-class weights for evaluation. These weights are not to be confused with any weights
+bound to a `Resampler` instance in a machine, used for training the wrapped `model` when
+supported.

"""
mutable struct Resampler{S, L} <: Model
@@ -1404,6 +1447,7 @@ mutable struct Resampler{S, L} <: Model
    check_measure::Bool
    repeats::Int
    cache::Bool
+    per_observation::Bool
    logger::L
end

@@ -1433,18 +1477,21 @@ function MLJModelInterface.clean!(resampler::Resampler)
    return warning
end

-function Resampler(;
-    model=nothing,
+function Resampler(
+    ;model=nothing,
    resampling=CV(),
-    measure=nothing,
+    measures=nothing,
+    measure=measures,
    weights=nothing,
    class_weights=nothing,
-    operation=predict,
+    operations=predict,
+    operation=operations,
    acceleration=default_resource(),
    check_measure=true,
    repeats=1,
    cache=true,
-    logger=nothing
+    per_observation=true,
+    logger=nothing,
)
    resampler = Resampler(
        model,
@@ -1457,7 +1504,8 @@ function Resampler(;
        check_measure,
        repeats,
        cache,
-        logger
+        per_observation,
+        logger,
    )
    message = MLJModelInterface.clean!(resampler)
    isempty(message) || @warn message
@@ -1503,8 +1551,9 @@ function MLJModelInterface.fit(resampler::Resampler, verbosity::Int, args...)
        _operations,
        _acceleration,
        false,
+        resampler.per_observation,
        resampler.logger,
-        resampler.resampling
+        resampler.resampling,
    )

    fitresult = (machine = mach, evaluation = e)
@@ -1568,8 +1617,9 @@ function MLJModelInterface.update(
        operations,
        acceleration,
        false,
+        resampler.per_observation,
        resampler.logger,
-        resampler.resampling
+        resampler.resampling,
    )
    report = (evaluation = e, )
    fitresult = (machine=mach2, evaluation=e)
diff --git a/src/utilities.jl b/src/utilities.jl
index 66dd62b7..7288c30e 100644
--- a/src/utilities.jl
+++ b/src/utilities.jl
@@ -469,3 +469,93 @@ end

generate_name!(model, existing_names; kwargs...) =
    generate_name!(typeof(model), existing_names; kwargs...)
+
+
+# # OBSERVATION VS CONTAINER HACKING TOOLS
+
+# The following tools are used to bridge the gap between the old paradigm of prescribing
+# the scitype of containers of observations, and the LearnAPI.jl paradigm of prescribing
+# only the scitype of the observations themselves. This is needed because measures are
+# now taken from StatisticalMeasures.jl which follows the LearnAPI.jl paradigm, but model
+# `target_scitype` refers to containers.
+
+"""
+    observation(S)
+
+*Private method.*
+
+Tries to infer the per-observation scitype from the scitype of `S`, when `S` is known to
+be the scitype of some container with multiple observations; here we view the scitype for
+one row of a table to be the scitype of the row converted to a vector. Return `Unknown` if
+unable to draw reliable inference.
+
+"""
+observation(::Type) = Unknown
+observation(::Type{AbstractVector{S}}) where S = S
+observation(::Type{AbstractArray{S,N}}) where {S,N} = AbstractArray{S,N-1}
+for T in [:Continuous, :Count, :Finite, :Infinite, :Multiclass, :OrderedFactor]
+    TM = "Union{Missing,$T}" |> Meta.parse
+    for S in [T, TM]
+        quote
+            observation(::Type{AbstractVector{<:$S}}) = $S
+            observation(::Type{AbstractArray{<:$S,N}}) where N = AbstractArray{<:$S,N-1}
+            observation(::Type{Table{<:AbstractVector{<:$S}}}) = AbstractVector{<:$S}
+        end |> eval
+    end
+end
+# note that in Julia `f(::Type{AbstractVector{<:T}}) where T = T` does not have a
+# well-formed left-hand side
+
+"""
+    guess_observation_scitype(y)
+
+*Private method.*
+
+If `y` is an `AbstractArray`, return the scitype of `y[:, :, ..., :, 1]`. If `y` is a
+table, return the scitype of the first row, converted to a vector, unless this row has
+`missing` elements, in which case return `Unknown`.
+
+In all other cases, `Unknown`.
+
+```
+julia> guess_observation_scitype([missing, 1, 2, 3])
+Union{Missing, Count}
+
+julia> guess_observation_scitype(rand(3, 2))
+AbstractVector{Continuous}
+
+julia> guess_observation_scitype((x=rand(3), y=rand(Bool, 3)))
+AbstractVector{Union{Continuous, Count}}
+
+julia> guess_observation_scitype((x=[missing, 1, 2], y=[1, 2, 3]))
+Unknown
+```
+"""
+guess_observation_scitype(y) = guess_observation_scitype(y, Val(Tables.istable(y)))
+guess_observation_scitype(y, ::Any) = Unknown
+guess_observation_scitype(y::AbstractArray, ::Val{false}) = observation(scitype(y))
+function guess_observation_scitype(table, ::Val{true})
+    row = Tables.subset(table, 1, viewhint=false) |> collect
+    E = eltype(row)
+    nonmissingtype(E) == E || return Unknown
+    scitype(row)
+end
+
+"""
+    guess_model_target_observation_scitype(model)
+
+*Private method*
+
+Try to infer a least upper bound on the scitype of target observations acceptable to
+`model`, by inspecting `target_scitype(model)`. Return `Unknown` if unable to draw
+reliable inference.
+
+The observation scitype for a table is here understood as the scitype of a row converted
+to a vector.
+
+"""
+guess_model_target_observation_scitype(model) = observation(target_scitype(model))
diff --git a/test/composition/learning_networks/deprecated_machines.jl b/test/composition/learning_networks/deprecated_machines.jl
index 19b580d6..bad68bd2 100644
--- a/test/composition/learning_networks/deprecated_machines.jl
+++ b/test/composition/learning_networks/deprecated_machines.jl
@@ -9,6 +9,7 @@ using MLJBase
using Tables
using StableRNGs
using Serialization
+using StatisticalMeasures

rng = StableRNG(616161)

# A dummy clustering model:
diff --git a/test/composition/learning_networks/nodes.jl b/test/composition/learning_networks/nodes.jl
index 1f175d45..e79cec9d 100644
--- a/test/composition/learning_networks/nodes.jl
+++ b/test/composition/learning_networks/nodes.jl
@@ -6,6 +6,7 @@ using MLJBase
using ..Models
using ..TestUtilities
using CategoricalArrays
+using StatisticalMeasures

import Random.seed!
seed!(1234)
diff --git a/test/composition/learning_networks/signatures.jl b/test/composition/learning_networks/signatures.jl
index 08785b40..019a9cd5 100644
--- a/test/composition/learning_networks/signatures.jl
+++ b/test/composition/learning_networks/signatures.jl
@@ -7,6 +7,7 @@ using Tables
using Test
using MLJModelInterface
using OrderedCollections
+using StatisticalMeasures

@testset "signatures - accessor functions" begin
    a = source(:a)
diff --git a/test/composition/models/network_composite.jl b/test/composition/models/network_composite.jl
index 87e064df..df00f201 100644
--- a/test/composition/models/network_composite.jl
+++ b/test/composition/models/network_composite.jl
@@ -1,4 +1,4 @@
-module TestNetowrkComposite
+module TestNetworkComposite

using Test
using MLJBase
@@ -9,6 +9,7 @@ using Tables
using MLJModelInterface
using CategoricalArrays
using OrderedCollections
+using StatisticalMeasures
using Serialization

const MMI = MLJModelInterface
diff --git a/test/composition/models/stacking.jl b/test/composition/models/stacking.jl
index 6cbe6588..ca973775 100644
--- a/test/composition/models/stacking.jl
+++ b/test/composition/models/stacking.jl
@@ -2,11 +2,11 @@ module TestStacking

using Test
using MLJBase
+using StatisticalMeasures
using MLJModelInterface
using ..Models
using Random
using StableRNGs
-
import Distributions

rng = StableRNGs.StableRNG(1234)
@@ -31,7 +31,7 @@ function test_internal_evaluation(internalreport, std_evaluation, modelnames)
        @test model_ev isa PerformanceEvaluation
        @test model_ev.per_fold == std_ev.per_fold
        @test model_ev.measurement == std_ev.measurement
-        @test model_ev.per_observation[1] === std_ev.per_observation[1] === missing
+        @test model_ev.per_observation[1] == std_ev.per_observation[1]
        @test model_ev.per_observation[2] == std_ev.per_observation[2]
        @test model_ev.operation == std_ev.operation
        @test model_ev.report_per_fold == std_ev.report_per_fold
diff --git a/test/composition/models/static_transformers.jl b/test/composition/models/static_transformers.jl
index c0162950..072dcbca 100644
--- a/test/composition/models/static_transformers.jl
+++ b/test/composition/models/static_transformers.jl
@@ -5,6 +5,7 @@ using Test
using MLJBase
using ..Models
using CategoricalArrays
+using StatisticalMeasures

import Random.seed!
seed!(1234) diff --git a/test/default_measures.jl b/test/default_measures.jl new file mode 100644 index 00000000..28a28b5d --- /dev/null +++ b/test/default_measures.jl @@ -0,0 +1,42 @@ +mutable struct DRegressor <: Deterministic end +MLJBase.target_scitype(::Type{<:DRegressor}) = + AbstractVector{<:Union{Missing,Continuous}} + +mutable struct D2Regressor <: Deterministic end +MLJBase.target_scitype(::Type{<:D2Regressor}) = + AbstractVector{<:Union{Missing,Continuous}} + +mutable struct DClassifier <: Deterministic end +MLJBase.target_scitype(::Type{<:DClassifier}) = + AbstractVector{<:Union{Missing,Finite}} + +mutable struct DClassifierWeird <: Deterministic end +MLJBase.target_scitype(::Type{<:DClassifierWeird}) = + AbstractVector{<:Textual} + +mutable struct PClassifier <: Probabilistic end +MLJBase.target_scitype(::Type{<:PClassifier}) = + AbstractVector{<:Union{Missing,Finite}} + +mutable struct PRegressor <: Probabilistic end +MLJBase.target_scitype(::Type{<:PRegressor}) = + AbstractVector{<:Union{Missing,Continuous}} + +mutable struct PCountRegressor <: Probabilistic end +MLJBase.target_scitype(::Type{<:PCountRegressor}) = + AbstractVector{<:Union{Missing,Count}} + + + +@testset "default_measure" begin + @test MLJBase.default_measure(DRegressor()) == l2 + @test MLJBase.default_measure(D2Regressor()) == l2 + @test MLJBase.default_measure(DClassifier()) == misclassification_rate + @test MLJBase.default_measure(PClassifier()) == log_loss + @test MLJBase.default_measure(PRegressor()) == log_loss + @test MLJBase.default_measure(PCountRegressor()) == log_loss + @test isnothing(MLJBase.default_measure(DClassifierWeird())) + @test isnothing(MLJBase.default_measure("junk")) +end + +true diff --git a/test/interface/model_api.jl b/test/interface/model_api.jl index 9bf3e0bf..8966f70f 100644 --- a/test/interface/model_api.jl +++ b/test/interface/model_api.jl @@ -2,6 +2,7 @@ module TestModelAPI using Test using MLJBase +using StatisticalMeasures import MLJModelInterface using ..Models using Distributions @@ -77,7 +78,7 @@ UnivariateFiniteFitter(;alpha=1.0) = UnivariateFiniteFitter(alpha) yhat = predict(mach, nothing) # single UnivariateFinite distribution @test cross_entropy(fill(yhat, 3), ytest) ≈ - [-log(1/2), -log(1/2), -log(1/4)] + mean([-log(1/2), -log(1/2), -log(1/4)]) end diff --git a/test/machines.jl b/test/machines.jl index 16655d26..7d0845c2 100644 --- a/test/machines.jl +++ b/test/machines.jl @@ -7,6 +7,7 @@ using ..Models using StableRNGs using Serialization using ..TestUtilities +using StatisticalMeasures const MLJModelInterface = MLJBase.MLJModelInterface const MMI = MLJModelInterface diff --git a/test/measures/confusion_matrix.jl b/test/measures/confusion_matrix.jl deleted file mode 100644 index 3e7d9b7f..00000000 --- a/test/measures/confusion_matrix.jl +++ /dev/null @@ -1,116 +0,0 @@ -using Test -using MLJBase -include(joinpath("..", "..", "test", "_models", "models.jl")) -using .Models - -@testset "_categorical" begin - a = [1, 1, 2, 3] - b = [3, 3, 4, 5] - c = [missing, a...] - d = [missing, b...] 
- e = categorical(a) - f = categorical(b) - g = categorical(c) - h = categorical(d) - j = CategoricalArrays.CategoricalValue{Int64, UInt32}[e[1], e[1], e[1], e[1]] - k = CategoricalArrays.CategoricalValue{Int64, UInt32}[e[4], e[4], e[4], e[4]] - rhs = (Set(1:5), Set(1:5)) - @test Set.(levels.(MLJBase._categorical(a, b))) == rhs - @test Set.(levels.(MLJBase._categorical(a, d))) == rhs - @test Set.(levels.(MLJBase._categorical(c, b))) == rhs - @test Set.(levels.(MLJBase._categorical(c, d))) == rhs - @test Set.(levels.(MLJBase._categorical(a, f))) == rhs - @test Set.(levels.(MLJBase._categorical(a, h))) == rhs - @test Set.(levels.(MLJBase._categorical(b, a))) == rhs - @test Set.(levels.(MLJBase._categorical(d, a))) == rhs - @test Set.(levels.(MLJBase._categorical(b, c))) == rhs - @test Set.(levels.(MLJBase._categorical(d, c))) == rhs - @test Set.(levels.(MLJBase._categorical(f, a))) == rhs - @test Set.(levels.(MLJBase._categorical(h, a))) == rhs - - @test Set.(levels.(MLJBase._categorical(j, k))) == (Set(1:3), Set(1:3)) - - # case of ordinary vector with CategoricalValue eltype: - acv = CategoricalArrays.CategoricalVector -end - -@testset "basics" begin - yraw = ['m', 'm', 'f', 'n', missing, 'f', 'm', 'n', 'n', 'm', 'f'] - ŷraw = [missing, 'f', 'f', 'm', 'f', 'f', 'n', 'm', 'n', 'm', 'f'] - y = categorical(yraw) - ŷ = categorical(ŷraw) - l = levels(y) # f, m, n - cm = MLJBase._confmat(ŷ, y; warn=false) - ŷ_clean, y_clean = MLJBase.skipinvalid(ŷ, y) - ee(l,i,j) = sum((ŷ_clean .== l[i]) .& (y_clean .== l[j])) - for i in 1:3, j in 1:3 - @test cm[i,j] == ee(l,i,j) - end - - cm2 = @test_logs (:warn, r"The classes are") MLJBase._confmat(ŷraw, yraw) - @test cm2.mat == cm.mat - - perm = [3, 1, 2] - l2 = l[perm] - cm2 = @test_logs MLJBase._confmat(ŷ, y; perm=perm) - m = ConfusionMatrix(perm=perm) - for i in 1:3, j in 1:3 - @test cm2[i,j] == ee(l2,i,j) - end - @test_logs (:warn, r"The classes are un") MLJBase._confmat(ŷ, y) - ŷc = coerce(ŷ, Union{Missing,OrderedFactor}) - yc = coerce(y, Union{Missing,OrderedFactor}) - @test MLJBase._confmat(ŷc, yc).mat == cm.mat - - y = categorical(['a','b','a','b']) - ŷ = categorical(['b','b','a','a']) - @test_logs (:warn, r"The classes are un") MLJBase._confmat(ŷ, y) - - # more tests for coverage - y = categorical([1,2,3,1,2,3,1,2,3]) - ŷ = categorical([1,2,3,1,2,3,1,2,3]) - @test_throws ArgumentError MLJBase._confmat(ŷ, y, rev=true) - - # silly test for display - ŷ = coerce(y, OrderedFactor) - y = coerce(y, OrderedFactor) - iob = IOBuffer() - Base.show(iob, MIME("text/plain"), MLJBase._confmat(ŷ, y)) - siob = String(take!(iob)) - @test strip(siob) == strip(""" - ┌──────────────┐ - │ Ground Truth │ - ┌─────────┼────┬────┬────┤ - │Predicted│ 1 │ 2 │ 3 │ - ├─────────┼────┼────┼────┤ - │ 1 │ 3 │ 0 │ 0 │ - ├─────────┼────┼────┼────┤ - │ 2 │ 0 │ 3 │ 0 │ - ├─────────┼────┼────┼────┤ - │ 3 │ 0 │ 0 │ 3 │ - └─────────┴────┴────┴────┘""") -end - -@testset "ConfusionMatrix measure" begin - - @test info(confmat).orientation == :other - model = DeterministicConstantClassifier() - - X = (x=rand(10),) - long = categorical(collect("abbaacaabbbbababcbac"), ordered=true) - y = long[1:10] - yhat =long[11:20] - - @test confmat(yhat, y).mat == [1 2 0; 3 1 1; 1 1 0] - @test ConfusionMatrix(perm=[2, 1, 3])(yhat, y).mat == - MLJBase._confmat(yhat, y, perm=[2, 1, 3]).mat - - MLJBase.value(confmat, yhat, X, y, nothing) - - e = evaluate(model, X, y, - measures=[misclassification_rate, confmat], - resampling=Holdout(fraction_train=0.5)) - cm = e.measurement[2] - @test cm.labels == ["a", "b", 
"c"] - @test cm.mat == [2 2 1; 0 0 0; 0 0 0] -end diff --git a/test/measures/continuous.jl b/test/measures/continuous.jl deleted file mode 100644 index 3e645845..00000000 --- a/test/measures/continuous.jl +++ /dev/null @@ -1,31 +0,0 @@ -rng = StableRNG(666899) - -@testset "regressor measures" begin - y = [1, 42, 2, 3, missing, 4] - yhat = [4, NaN, 3, 2, 42, 1] - w = [1, 42, 2, 4, 42, 3] - y = [1, 2, 3, 4] - yhat = [4, 3, 2, 1] - w = [1, 2, 4, 3] - @test isapprox(mae(yhat, y), 2) - @test isapprox(mae(yhat, y, w), (1*3 + 2*1 + 4*1 + 3*3)/4) - @test isapprox(rms(yhat, y), sqrt(5)) - @test isapprox(rms(yhat, y, w), sqrt((1*3^2 + 2*1^2 + 4*1^2 + 3*3^2)/4)) - @test rsq(yhat, y) == -3 - @test isapprox(mean(skipinvalid(l1(yhat, y))), 2) - @test isapprox(mean(skipinvalid(l1(yhat, y, w))), mae(yhat, y, w)) - @test isapprox(mean(skipinvalid(l2(yhat, y))), 5) - @test isapprox(mean(skipinvalid(l2(yhat, y, w))), rms(yhat, y, w)^2) - @test isapprox(mean(skipinvalid(log_cosh(yhat, y))), 1.3715546675) - - y = [1, 42, 2, 3, missing, 4] - yhat = [2, NaN, 3, 4, 42, 5] - @test isapprox(rmsl(yhat, y), - sqrt((log(1/2)^2 + log(2/3)^2 + log(3/4)^2 + log(4/5)^2)/4)) - @test isapprox(rmslp1(yhat, y), - sqrt((log(2/3)^2 + log(3/4)^2 + log(4/5)^2 + log(5/6)^2)/4)) - @test isapprox(rmsp(yhat, y), sqrt((1 + 1/4 + 1/9 + 1/16)/4)) - @test isapprox(mape(yhat, y), (1/1 + 1/2 + 1/3 + 1/4)/4) -end - -true diff --git a/test/measures/doc_strings.jl b/test/measures/doc_strings.jl deleted file mode 100644 index 1cbf96c4..00000000 --- a/test/measures/doc_strings.jl +++ /dev/null @@ -1,9 +0,0 @@ -using MLJBase - -docstring = (Base.Docs.doc)((Base.Docs.Binding)(Main, :multiclass_recall)) - -@test string(docstring) == "An instance of type "* - "[`MulticlassTruePositiveRate`](@ref). Query the "* - "[`MulticlassTruePositiveRate`](@ref) doc-string for details. 
\n" - -true diff --git a/test/measures/finite.jl b/test/measures/finite.jl deleted file mode 100644 index f06266c3..00000000 --- a/test/measures/finite.jl +++ /dev/null @@ -1,609 +0,0 @@ -rng = StableRNG(51803) - -const Vec = AbstractVector - -@testset "misclassification_rate" begin - y = categorical(collect("asdfasdfaaassdd")) - yhat = categorical(collect("asdfaadfaasssdf")) - w = 1:15 - ym = vcat(y, [missing,]) - yhatm = vcat(yhat, [missing,]) - wm = 1:16 - @test misclassification_rate(yhat, y) ≈ 0.2 - @test misclassification_rate(yhatm, ym) ≈ 0.2 - @test misclassification_rate(yhat, y, w) ≈ (6*1 + 11*1 + 15*1) / 15 - @test misclassification_rate(yhatm, ym, wm) ≈ (6*1 + 11*1 + 15*1) / 15 -end - -@testset "mcr, acc, bacc, mcc" begin - y = categorical(['m', 'f', 'n', 'f', 'm', 'n', 'n', 'm', 'f']) - ŷ = categorical(['f', 'f', 'm', 'f', 'n', 'm', 'n', 'm', 'f']) - @test accuracy(ŷ, y) == 1-mcr(ŷ,y) == - accuracy(MLJBase._confmat(ŷ, y, warn=false)) == - 1-mcr(MLJBase._confmat(ŷ, y, warn=false)) - w = randn(rng,length(y)) - @test accuracy(ŷ, y, w) == 1-mcr(ŷ,y,w) - - ## balanced accuracy - y = categorical([ - 3, 4, 1, 1, 1, 4, 1, 3, 3, 1, 2, 3, 1, 3, 3, 3, 2, 4, 3, 2, 1, 3, - 3, 1, 1, 1, 2, 4, 1, 4, 4, 4, 1, 1, 4, 4, 3, 1, 2, 2, 3, 4, 2, 1, - 2, 2, 3, 2, 2, 3, 1, 2, 3, 4, 1, 2, 4, 2, 1, 4, 3, 2, 3, 3, 3, 1, - 3, 1, 4, 3, 1, 2, 3, 1, 2, 2, 4, 4, 1, 3, 2, 1, 4, 3, 3, 1, 3, 1, - 2, 2, 2, 2, 2, 3, 2, 1, 1, 4, 2, 2]) - ŷ = categorical([ - 2, 3, 2, 1, 2, 2, 3, 3, 2, 4, 2, 3, 2, 4, 3, 4, 4, 2, 1, 3, 3, 3, - 3, 3, 2, 4, 4, 3, 4, 4, 1, 2, 3, 2, 4, 1, 2, 3, 1, 4, 2, 2, 1, 2, - 3, 2, 2, 4, 3, 2, 2, 2, 1, 2, 2, 1, 3, 1, 4, 1, 2, 1, 2, 4, 3, 2, - 4, 3, 2, 4, 4, 2, 4, 3, 2, 3, 1, 2, 1, 2, 1, 2, 3, 1, 1, 3, 4, 2, - 4, 4, 2, 1, 3, 2, 2, 4, 1, 1, 4, 1]) - w = [ - 0.5, 1.4, 0.6, 1. , 0.1, 0.5, 1.2, 0.2, 1.8, 0.3, 0.6, 2.2, 0.1, - 1.4, 0.2, 0.4, 0.6, 2.1, 0.7, 0.2, 0.9, 0.4, 0.7, 0.3, 0.1, 1.7, - 0.2, 0.7, 1.2, 1. , 0.9, 0.4, 0.5, 0.5, 0.5, 1. , 0.3, 0.1, 0.2, - 0. , 2.2, 0.8, 0.9, 0.8, 1.3, 0.2, 0.4, 0.7, 1. , 0.7, 1.7, 0.7, - 1.1, 1.8, 0.1, 1.2, 1.8, 1. , 0.1, 0.5, 0.6, 0.7, 0.6, 1.2, 0.6, - 1.2, 0.5, 0.5, 0.8, 0.2, 0.6, 1. , 0.3, 1. , 0.2, 1.1, 1.1, 1.1, - 0.6, 1.4, 1.2, 0.3, 1.1, 0.2, 0.5, 1.6, 0.3, 1. , 0.3, 0.9, 0.9, - 0. , 0.6, 0.6, 0.4, 0.5, 0.4, 0.2, 0.9, 0.4] - sk_bacc = 0.17493386243386244 # note: sk-learn reverses ŷ and y - @test bacc(ŷ, y) ≈ sk_bacc - sk_adjusted_bacc = -0.10008818342151675 - @test BalancedAccuracy(adjusted=true)(ŷ, y) ≈ sk_adjusted_bacc - sk_bacc_w = 0.1581913163016446 - @test bacc(ŷ, y, w) ≈ sk_bacc_w - sk_adjusted_bacc_w = -0.1224115782644738 - @test BalancedAccuracy(adjusted=true)(ŷ, y, w) ≈ sk_adjusted_bacc_w - - ## matthews correlation - sk_mcc = -0.09759509982785947 - @test mcc(ŷ, y) == matthews_correlation(ŷ, y) ≈ sk_mcc - # invariance with respect to permutation ? 
-    cm = MLJBase._confmat(ŷ, y, perm=[3, 1, 2, 4])
-    @test mcc(cm) ≈ sk_mcc
-
-    # Issue #381
-    cm = MLJBase.ConfusionMatrixObject([29488 13017; 12790 29753], ["0.0", "1.0"])
-    @test mcc(cm) ≈ 0.39312321239417797
-end
-
-@testset "kappa" begin
-    # Binary case
-    y_b = categorical([2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2])
-    ŷ_b = categorical([1, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2])
-    cm_b = MLJBase._confmat(y_b, ŷ_b, warn=false)
-    p0_b = (4+10)/30
-    pe_b = (13*11 + 17*19)/(30*30)
-
-    # Multiclass case
-    y_m = categorical([5, 5, 3, 5, 4, 4, 2, 2, 3, 2, 5, 2, 4, 3, 2, 1, 1, 5, 1, 4, 2, 5, 4, 5, 2, 3, 3, 4, 2, 4])
-    ŷ_m = categorical([1, 1, 1, 5, 4, 2, 1, 3, 4, 4, 2, 5, 4, 4, 1, 5, 5, 2, 3, 3, 1, 3, 2, 5, 5, 2, 3, 2, 5, 3])
-    cm_m = MLJBase._confmat(ŷ_m, y_m, warn=false)
-    p0_m = 5/30
-    pe_m = (3*6 + 8*6 + 5*6 + 7*5 + 7*7)/(30*30)
-
-    # Tests
-    @test kappa(y_m, ŷ_m) ≈ (p0_m - pe_m)/(1 - pe_m)
-    @test kappa(y_b, ŷ_b) ≈ (p0_b - pe_b)/(1 - pe_b)
-    @test kappa(cm_m) == kappa(y_m, ŷ_m)
-    @test kappa(cm_b) == kappa(y_b, ŷ_b)
-    @test kappa(ŷ_m, y_m) == kappa(y_m, ŷ_m)
-    @test kappa(ŷ_b, y_b) == kappa(y_b, ŷ_b)
-    @test kappa(y_m, y_m) == 1.0
-    @test kappa(y_b, y_b) == 1.0
-end
-
-@testset "confusion matrix {2}" begin
-    # the first class (here 1) is assumed negative, the second positive
-    y = categorical([1, 2, 1, 2, 1, 1, 2])
-    ŷ = categorical([1, 2, 2, 2, 2, 1, 2])
-    cm = MLJBase._confmat(ŷ, y, warn=false)
-    TN = sum(ŷ .== y .== 1) # pred and true = - (1)
-    TP = sum(ŷ .== y .== 2) # pred and true = + (2)
-    FP = sum(ŷ .!= y .== 1) # pred + (2) and true - (1)
-    FN = sum(ŷ .!= y .== 2) # pred - (1) and true + (2)
-    @test cm[1,1] == TN
-    @test cm[2,2] == TP
-    @test cm[1,2] == FN
-    @test cm[2,1] == FP
-
-    ym = categorical([1, missing, 2, 1, 2, 1, 1, 1, 2])
-    ŷm = categorical([1, 2, 2, 2, 2, missing, 2, 1, 2])
-    cm = MLJBase._confmat(ŷ, y, warn=false)
-    TN = sum(skipmissing(ŷ .== y .== 1)) # pred and true = - (1)
-    TP = sum(skipmissing(ŷ .== y .== 2)) # pred and true = + (2)
-    FP = sum(skipmissing(ŷ .!= y .== 1)) # pred + (2) and true - (1)
-    FN = sum(skipmissing(ŷ .!= y .== 2)) # pred - (1) and true + (2)
-    @test cm[1,1] == TN
-    @test cm[2,2] == TP
-    @test cm[1,2] == FN
-    @test cm[2,1] == FP
-
-    cm2 = MLJBase._confmat(ŷ, y; rev=true)
-    @test cm2[1,1] == cm[2,2]
-    @test cm2[1,2] == cm[2,1]
-    @test cm2[2,2] == cm[1,1]
-    @test cm2[2,1] == cm[1,2]
-
-    @test accuracy(ŷ, y) == accuracy(cm) == sum(y .== ŷ) / length(y)
-
-    @test @test_logs((:warn, r"The classes are un-ordered"),
-                     recall(ŷ, y) == TP / (TP + FN))
-
-    ŷ = coerce(ŷ, Union{Missing,OrderedFactor})
-    y = coerce(y, Union{Missing,OrderedFactor})
-
-    @test precision(ŷ, y) == TP / (TP + FP)
-    @test specificity(ŷ, y) == TN / (TN + FP)
-    @test f1score(ŷ, y) ≈
-        2.0 / (1.0 / recall(ŷ, y) + 1.0 / precision(ŷ, y))
-
-    recall_rev = Recall(rev=true)
-    @test recall_rev(ŷ, y) ==
-        TN / (TN + FP) # no warning because rev is specified
-    precision_rev = Precision(rev=true)
-    @test precision_rev(ŷ, y) == TN / (TN + FN)
-    specificity_rev = Specificity(rev=true)
-    @test specificity_rev(ŷ, y) == TP / (TP + FN)
-    f1score_rev = FScore(rev=true)
-    @test f1score_rev(ŷ, y) ≈
-        2.0 / (1.0 / recall_rev(ŷ, y) + 1.0 / precision_rev(ŷ, y))
-end
-
-@testset "confusion matrix {n}" begin
-    y = coerce([1, 2, 0, 2, 1, 0, 0, 1, 2, 2, 2, 1, 2,
-                2, 1, 0, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1,
-                2, 2, 2], Multiclass)
-    ŷ = coerce([2, 0, 2, 2, 2, 0, 1, 2, 1, 2, 0, 1, 2,
-                1, 1, 1, 2, 0, 1, 2, 1, 2, 2, 2, 1, 2,
-                1, 2, 2], Multiclass)
-    class_w = Dict(0=>0,2=>2,1=>1)
-    cm = MLJBase._confmat(ŷ, y, warn=false)
-
-    #                 ┌─────────────────────────────────────────┐
-    #                 │               Ground Truth              │
-    #   ┌─────────────┼─────────────┬─────────────┬─────────────┤
-    #   │  Predicted  │      0      │      1      │      2      │
-    #   ├─────────────┼─────────────┼─────────────┼─────────────┤
-    #   │      0      │      1      │      1      │      2      │
-    #   ├─────────────┼─────────────┼─────────────┼─────────────┤
-    #   │      1      │      2      │      4      │      4      │
-    #   ├─────────────┼─────────────┼─────────────┼─────────────┤
-    #   │      2      │      1      │      6      │      8      │
-    #   └─────────────┴─────────────┴─────────────┴─────────────┘
-
-    cm_tp = [1; 4; 8]
-    cm_tn = [22; 12; 8]
-    cm_fp = [1+2; 2+4; 1+6]
-    cm_fn = [2+1; 1+6; 2+4]
-    cm_prec = cm_tp ./ ( cm_tp + cm_fp )
-    cm_rec = cm_tp ./ ( cm_tp + cm_fn )
-
-    # Check if positive
-    m = MulticlassTruePositive(;return_type=Vector)
-    @test [0; 0; 0] <= m(ŷ, y) == cm_tp
-    m = MulticlassTrueNegative(;return_type=Vector)
-    @test [0; 0; 0] <= m(ŷ, y) == cm_tn
-    m = MulticlassFalsePositive(;return_type=Vector)
-    @test [0; 0; 0] <= m(ŷ, y) == cm_fp
-    m = MulticlassFalseNegative(;return_type=Vector)
-    @test [0; 0; 0] <= m(ŷ, y) == cm_fn
-
-    # Check if in [0,1]
-    m = MulticlassTruePositiveRate(average=no_avg;return_type=Vector)
-    @test [0; 0; 0] <= m(ŷ, y) == cm_tp ./ (cm_fn.+cm_tp) <= [1; 1; 1]
-    m = MulticlassTrueNegativeRate(average=no_avg;return_type=Vector)
-    @test [0; 0; 0] <= m(ŷ, y) == cm_tn ./ (cm_tn.+cm_fp) <= [1; 1; 1]
-    m = MulticlassFalsePositiveRate(average=no_avg;return_type=Vector)
-    @test [0; 0; 0] <= m(ŷ, y) == 1 .- cm_tn ./ (cm_tn.+cm_fp) <= [1; 1; 1]
-    m = MulticlassFalseNegativeRate(average=no_avg;return_type=Vector)
-    @test [0; 0; 0] <= m(ŷ, y) == 1 .- cm_tp ./ (cm_fn.+cm_tp) <= [1; 1; 1]
-
-    #`no_avg` and `LittleDict`
-    @test collect(values(MulticlassPrecision(average=no_avg)(cm))) ≈
-        collect(values(MulticlassPrecision(average=no_avg)(ŷ, y))) ≈
-        cm_prec
-    @test MulticlassPrecision(average=macro_avg)(cm) ≈
-        MulticlassPrecision(average=macro_avg)(ŷ, y) ≈ mean(cm_prec)
-    @test collect(keys(MulticlassPrecision(average=no_avg)(cm))) ==
-        collect(keys(MulticlassPrecision(average=no_avg)(ŷ, y))) ==
-        ["0"; "1"; "2"]
-    @test collect(values(MulticlassRecall(average=no_avg)(cm))) ≈
-        collect(values(MulticlassRecall(average=no_avg)(ŷ, y))) ≈
-        cm_rec
-    @test collect(values(MulticlassFScore(average=no_avg)(cm))) ≈
-        collect(values(MulticlassFScore(average=no_avg)(ŷ, y))) ≈
-        2 ./ ( 1 ./ cm_prec + 1 ./ cm_rec )
-
-    #`no_avg` and `LittleDict` with class weights
-    @test collect(values(MulticlassPrecision(average=no_avg)(cm, class_w))) ≈
-        collect(values(MulticlassPrecision(average=no_avg)(ŷ, y, class_w))) ≈
-        cm_prec .* [0; 1; 2]
-    @test collect(values(MulticlassRecall(average=no_avg)(cm, class_w))) ≈
-        collect(values(MulticlassRecall(average=no_avg)(ŷ, y, class_w))) ≈
-        cm_rec .* [0; 1; 2]
-    @test collect(values(MulticlassFScore(average=no_avg)(cm, class_w))) ≈
-        collect(values(MulticlassFScore(average=no_avg)(ŷ, y, class_w))) ≈
-        2 ./ ( 1 ./ cm_prec + 1 ./ cm_rec ) .* [0; 1; 2]
-
-    #`macro_avg` and `LittleDict`
-    macro_prec = MulticlassPrecision(average=macro_avg)
-    macro_rec = MulticlassRecall(average=macro_avg)
-
-    @test macro_prec(cm) ≈ macro_prec(ŷ, y) ≈ mean(cm_prec)
-    @test macro_rec(cm) ≈ macro_rec(ŷ, y) ≈ mean(cm_rec)
-    @test macro_f1score(cm) ≈ macro_f1score(ŷ, y) ≈ mean(2 ./ ( 1 ./ cm_prec + 1 ./ cm_rec ))
-
-    #`micro_avg` and `LittleDict`
-    micro_prec = MulticlassPrecision(average=micro_avg)
-    micro_rec = MulticlassRecall(average=micro_avg)
-
-    @test micro_prec(cm) == micro_prec(ŷ, y) == sum(cm_tp) ./ sum(cm_fp.+cm_tp)
-    @test micro_rec(cm) == micro_rec(ŷ, y) == sum(cm_tp) ./ sum(cm_fn.+cm_tp)
-    @test micro_f1score(cm) == micro_f1score(ŷ, y) ==
-        2 ./ ( 1 ./ ( sum(cm_tp) ./ sum(cm_fp.+cm_tp) ) + 1 ./ ( sum(cm_tp) ./ sum(cm_fn.+cm_tp) ) )
-
-    #`no_avg` and `Vector` with class weights
-    vec_precision = MulticlassPrecision(return_type=Vector)
-    vec_recall = MulticlassRecall(return_type=Vector)
-    vec_f1score = MulticlassFScore(return_type=Vector)
-
-    @test vec_precision(cm, class_w) ≈ vec_precision(ŷ, y, class_w) ≈
-        mean(cm_prec .* [0; 1; 2])
-    @test vec_recall(cm, class_w) ≈ vec_recall(ŷ, y, class_w) ≈
-        mean(cm_rec .* [0; 1; 2])
-    @test vec_f1score(cm, class_w) ≈ vec_f1score(ŷ, y, class_w) ≈
-        mean(2 ./ ( 1 ./ cm_prec + 1 ./ cm_rec ) .* [0; 1; 2])
-
-    #`macro_avg` and `Vector`
-    v_ma_prec = MulticlassPrecision(average=macro_avg,
-                                    return_type=Vector)
-    v_ma_rec = MulticlassRecall(average=macro_avg, return_type=Vector)
-    v_ma_f1 = MulticlassFScore(average=macro_avg, return_type=Vector)
-
-    @test v_ma_prec(cm) ≈ v_ma_prec(ŷ, y) ≈ mean(cm_prec)
-    @test v_ma_rec(cm) ≈ v_ma_rec(ŷ, y) ≈ mean(cm_rec)
-    @test v_ma_f1(cm) ≈ v_ma_f1(ŷ, y) ≈ mean(2 ./ ( 1 ./ cm_prec + 1 ./ cm_rec ))
-
-    #`macro_avg` and `Vector` with class weights
-    @test v_ma_prec(cm, class_w) ≈ v_ma_prec(ŷ, y, class_w) ≈
-        mean(cm_prec .* [0, 1, 2])
-    @test v_ma_rec(cm, class_w) ≈ v_ma_rec(ŷ, y, class_w) ≈
-        mean(cm_rec .* [0, 1, 2])
-    @test v_ma_f1(cm, class_w) ≈ v_ma_f1(ŷ, y, class_w) ≈
-        mean(2 ./ ( 1 ./ cm_prec + 1 ./ cm_rec ) .* [0, 1, 2])
-
-    #`micro_avg` and `Vector`
-    v_mi_prec = MulticlassPrecision(average=micro_avg, return_type=Vector)
-    v_mi_rec = MulticlassRecall(average=micro_avg, return_type=Vector)
-    v_mi_f1 = MulticlassFScore(average=micro_avg, return_type=Vector)
-
-    @test v_mi_prec(cm) == v_mi_prec(ŷ, y) == sum(cm_tp) ./ sum(cm_fp.+cm_tp)
-    @test v_mi_rec(cm) == v_mi_rec(ŷ, y) == sum(cm_tp) ./ sum(cm_fn.+cm_tp)
-    @test v_mi_f1(cm) == v_mi_f1(ŷ, y) ==
-        2 ./ ( 1 ./ ( sum(cm_tp) ./ sum(cm_fp.+cm_tp) ) + 1 ./ ( sum(cm_tp) ./ sum(cm_fn.+cm_tp) ) )
-end
-
-@testset "issue #630" begin
-    # multiclass fscore corner case of absent class
-
-    y = coerce([1, 2, 2, 2, 3], OrderedFactor)[1:4]
-    # [1, 2, 2, 2] # but 3 is in the pool
-    yhat = reverse(y)
-    # [2, 2, 2, 1]
-
-    # In this case, assigning "3" as "positive" gives all true negative,
-    # and so NaN for that class's contribution to the average F1Score,
-    # which should accordingly be skipped.
-
-    # positive class | TP | FP | FN | score for that class
-    # ---------------|----|----|----|---------------------
-    #        1       | 0  | 1  | 2  | 0
-    #        2       | 2  | 1  | 1  | 2/3
-    #        3       | 0  | 0  | 0  | NaN
-
-    # mean score with NaN skipped is 1/3
-    @test MulticlassFScore()(yhat, y) ≈ 1/3
-end
-
-@testset "Metadata binary" begin
-    for m in (accuracy, recall, Precision(), f1score, specificity)
-        e = info(m)
-        m == accuracy && (@test e.name == "Accuracy")
-        m == recall && (@test e.name == "TruePositiveRate")
-        m isa Precision && (@test e.name == "Precision")
-        m == f1score && (@test e.name == "FScore")
-        m == specificity && (@test e.name == "TrueNegativeRate")
-        @test e.target_scitype <: AbstractArray{<:Union{Missing,Finite}}
-        @test e.prediction_type == :deterministic
-        @test e.orientation == :score
-        @test e.reports_each_observation == false
-        @test e.is_feature_dependent == false
-        if m == accuracy
-            @test e.supports_weights
-        else
-            @test !e.supports_weights
-        end
-    end
-    e = info(auc)
-    @test e.name == "AreaUnderCurve"
-    @test e.target_scitype ==
-        Union{AbstractArray{<:Union{Missing,Multiclass{2}}},
-              AbstractArray{<:Union{Missing,OrderedFactor{2}}}}
-    @test e.prediction_type == :probabilistic
-    @test e.reports_each_observation == false
-    @test e.is_feature_dependent == false
-    @test e.supports_weights == false
-end
-
-@testset "Metadata multiclass" begin
-    for m in (MulticlassRecall(), MulticlassPrecision(),
-              MulticlassFScore(), MulticlassTrueNegativeRate())
-        e = info(m)
-        m isa MulticlassRecall &&
-            (@test e.name == "MulticlassTruePositiveRate")
-        m isa MulticlassPrecision &&
-            (@test e.name == "MulticlassPrecision")
-        m isa MulticlassFScore &&
-            (@test e.name == "MulticlassFScore")
-        m isa MulticlassTrueNegativeRate &&
-            (@test e.name == "MulticlassTrueNegativeRate")
-        @test e.target_scitype <: AbstractArray{<:Union{Missing,Finite}}
-        @test e.prediction_type == :deterministic
-        @test e.orientation == :score
-        @test e.reports_each_observation == false
-        @test e.is_feature_dependent == false
-        @test e.supports_weights == false
-        @test e.supports_class_weights == true
-    end
-end
-
-@testset "More binary metrics" begin
-    y = coerce([missing, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2,
-                2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1,
-                2, 2, 2, 1], Union{Missing,OrderedFactor})
-    ŷ = coerce([1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2,
-                1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2,
-                1, 2, 2, missing], Union{Missing,OrderedFactor})
-
-    # check all constructors
-    m = TruePositive()
-    @test m(ŷ, y) == truepositive(ŷ, y)
-    m = TruePositive(rev=true)
-    @test m(ŷ, y) == truenegative(ŷ, y)
-    m = TrueNegative()
-    @test m(ŷ, y) == truenegative(ŷ, y)
-    m = FalsePositive()
-    @test m(ŷ, y) == falsepositive(ŷ, y)
-    m = FalseNegative()
-    @test m(ŷ, y) == falsenegative(ŷ, y)
-    m = TruePositiveRate()
-    @test m(ŷ, y) == tpr(ŷ, y) == truepositive_rate(ŷ, y)
-    m = TrueNegativeRate()
-    @test m(ŷ, y) == tnr(ŷ, y) == truenegative_rate(ŷ, y)
-    m = FalsePositiveRate()
-    @test m(ŷ, y) == fpr(ŷ, y) == falsepositive_rate(ŷ, y)
-    m = FalseNegativeRate()
-    @test m(ŷ, y) == fnr(ŷ, y) == falsenegative_rate(ŷ, y)
-    m = FalseDiscoveryRate()
-    @test m(ŷ, y) == fdr(ŷ, y) == falsediscovery_rate(ŷ, y)
-    m = Precision()
-    @test m(ŷ, y) == precision(ŷ, y)
-    m = NPV()
-    @test m(ŷ, y) == npv(ŷ, y)
-    m = FScore()
-    @test m(ŷ, y) == f1score(ŷ, y)
-    # check synonyms
-    m = TPR()
-    @test m(ŷ, y) == tpr(ŷ, y)
-    m = TNR()
-    @test m(ŷ, y) == tnr(ŷ, y)
-    m = FPR()
-    @test m(ŷ, y) == fpr(ŷ, y) == fallout(ŷ, y)
-    m = FNR()
-    @test m(ŷ, y) == fnr(ŷ, y) == miss_rate(ŷ, y)
-    m = FDR()
-    @test m(ŷ, y) == fdr(ŷ, y)
-    m = PPV()
-    @test m(ŷ, y) == precision(ŷ, y) == ppv(ŷ, y)
-    m = Recall()
-    @test m(ŷ, y) == tpr(ŷ, y) == recall(ŷ, y) ==
-        sensitivity(ŷ, y) == hit_rate(ŷ, y)
-    m = Specificity()
-    @test m(ŷ, y) == tnr(ŷ, y) == specificity(ŷ, y) == selectivity(ŷ, y)
-    # 'higher order'
-    m = BACC()
-    @test m(ŷ, y) == bacc(ŷ, y) == (tpr(ŷ, y) + tnr(ŷ, y))/2
-
-    ### External comparisons
-    sk_prec = 0.6111111111111112 # m.precision_score(y, yhat, pos_label=2)
-    @test precision(ŷ, y) ≈ sk_prec
-    sk_rec = 0.6875
-    @test recall(ŷ, y) == sk_rec # m.recall_score(y, yhat, pos_label=2)
-    sk_f05 = 0.625
-    f05 = FScore(β=0.5)
-    @test f05(ŷ, y) ≈ sk_f05 # m.fbeta_score(y, yhat, 0.5, pos_label=2)
-
-    # reversion mechanism
-    sk_prec_rev = 0.5454545454545454
-    prec_rev = Precision(rev=true)
-    @test prec_rev(ŷ, y) ≈ sk_prec_rev
-    sk_rec_rev = 0.46153846153846156
-    rec_rev = Recall(rev=true)
-    @test rec_rev(ŷ, y) ≈ sk_rec_rev
-end
-
-@testset "More multiclass metrics" begin
-    y = coerce(categorical([missing, 1, 2, 0, 2, 1, 0, 0, 1, 2, 2, 2, 1, 2,
-                            2, 1, 0, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1,
-                            2, 2, 2, 0]), Union{Missing,Multiclass})
-    ŷ = coerce(categorical([0, 2, 0, 2, 2, 2, 0, 1, 2, 1, 2, 0, 1, 2,
-                            1, 1, 1, 2, 0, 1, 2, 1, 2, 2, 2, 1, 2,
-                            1, 2, 2, missing]), Union{Missing,Multiclass})
-    w = Dict(0=>1, 1=>2, 2=>3) #class_w
-    # check all constructors
-    m = MulticlassTruePositive()
-    @test m(ŷ, y) == multiclass_truepositive(ŷ, y)
-    m = MulticlassTrueNegative()
-    @test m(ŷ, y) == multiclass_truenegative(ŷ, y)
-    m = MulticlassFalsePositive()
-    @test m(ŷ, y) == multiclass_falsepositive(ŷ, y)
-    m = MulticlassFalseNegative()
-    @test m(ŷ, y) == multiclass_falsenegative(ŷ, y)
-    m = MulticlassTruePositiveRate()
-    @test m(ŷ, y) == multiclass_tpr(ŷ, y) ==
-        multiclass_truepositive_rate(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_tpr(ŷ, y, w) ==
-        multiclass_truepositive_rate(ŷ, y, w)
-    m = MulticlassTrueNegativeRate()
-    @test m(ŷ, y) == multiclass_tnr(ŷ, y) ==
-        multiclass_truenegative_rate(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_tnr(ŷ, y, w) ==
-        multiclass_truenegative_rate(ŷ, y, w)
-    m = MulticlassFalsePositiveRate()
-    @test m(ŷ, y) == multiclass_fpr(ŷ, y) ==
-        multiclass_falsepositive_rate(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_fpr(ŷ, y, w) ==
-        multiclass_falsepositive_rate(ŷ, y, w)
-    m = MulticlassFalseNegativeRate()
-    @test m(ŷ, y) == multiclass_fnr(ŷ, y) ==
-        multiclass_falsenegative_rate(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_fnr(ŷ, y, w) ==
-        multiclass_falsenegative_rate(ŷ, y, w)
-    m = MulticlassFalseDiscoveryRate()
-    @test m(ŷ, y) == multiclass_fdr(ŷ, y) ==
-        multiclass_falsediscovery_rate(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_fdr(ŷ, y, w) ==
-        multiclass_falsediscovery_rate(ŷ, y, w)
-    m = MulticlassPrecision()
-    @test m(ŷ, y) == multiclass_precision(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_precision(ŷ, y, w)
-    m = MulticlassNegativePredictiveValue()
-    @test m(ŷ, y) == multiclass_npv(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_npv(ŷ, y, w)
-    m = MulticlassFScore()
-    @test m(ŷ, y) == macro_f1score(ŷ, y)
-    @test m(ŷ, y, w) == macro_f1score(ŷ, y, w)
-    # check synonyms
-    m = MTPR(return_type=Vector)
-    @test m(ŷ, y) == multiclass_tpr(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_tpr(ŷ, y, w)
-    m = MTNR(return_type=Vector)
-    @test m(ŷ, y) == multiclass_tnr(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_tnr(ŷ, y, w)
-    m = MFPR()
-    @test m(ŷ, y) == multiclass_fpr(ŷ, y) == multiclass_fallout(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_fpr(ŷ, y, w) ==
-        multiclass_fallout(ŷ, y, w)
-    m = MFNR()
-    @test m(ŷ, y) == multiclass_fnr(ŷ, y) ==
-        multiclass_miss_rate(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_fnr(ŷ, y, w) ==
-        multiclass_miss_rate(ŷ, y, w)
-    m = MFDR()
-    @test m(ŷ, y) == multiclass_fdr(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_fdr(ŷ, y, w)
-    m = MPPV()
-    @test m(ŷ, y) == MulticlassPrecision()(ŷ, y) ==
-        multiclass_ppv(ŷ, y)
-    @test m(ŷ, y, w) == MulticlassPrecision()(ŷ, y, w) ==
-        multiclass_ppv(ŷ, y, w)
-    m = MulticlassRecall()
-    @test m(ŷ, y) == multiclass_tpr(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_tpr(ŷ, y, w)
-    @test m(ŷ, y) == multiclass_sensitivity(ŷ, y) ==
-        multiclass_hit_rate(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_sensitivity(ŷ, y, w) ==
-        multiclass_hit_rate(ŷ, y, w)
-    m = MulticlassSpecificity()
-    @test m(ŷ, y) == multiclass_tnr(ŷ, y) == multiclass_specificity(ŷ, y) ==
-        multiclass_selectivity(ŷ, y)
-    @test m(ŷ, y, w) == multiclass_tnr(ŷ, y, w) ==
-        multiclass_specificity(ŷ, y, w) == multiclass_selectivity(ŷ, y, w)
-end
-
-
-@testset "Additional multiclass tests" begin
-    table = reshape(collect("aabbbccccddbabccbacccd"), 11, 2)
-    table = coerce(table, Multiclass);
-    yhat = table[:,1] # ['a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd']
-    y = table[:,2] # ['b', 'a', 'b', 'c', 'c', 'b', 'a', 'c', 'c', 'c', 'd']
-    class_w = Dict('a'=>7, 'b'=>5, 'c'=>2, 'd'=> 0)
-
-    # class | TP | FP | TP + FP | precision | FN | TP + FN | recall
-    # ------|----|----|---------|-----------|----|---------|-------
-    #   a   | 1  | 1  |    2    |    1/2    | 1  |    2    |  1/2
-    #   b   | 1  | 2  |    3    |    1/3    | 2  |    3    |  1/3
-    #   c   | 2  | 2  |    4    |    1/2    | 3  |    5    |  2/5
-    #   d   | 1  | 1  |    2    |    1/2    | 0  |    1    |   1
-
-    # helper:
-    inverse(x) = 1/x
-    harmonic_mean(x, y; beta=1.0) =
-        (1 + inverse(beta^2))*inverse(mean(inverse(beta^2*x)+ inverse(y)))
-
-    # precision:
-    p_macro = mean([1/2, 1/3, 1/2, 1/2])
-    @test MulticlassPrecision()(yhat, y) ≈ p_macro
-    p_macro_w = mean([7/2, 5/3, 2/2, 0/2])
-    @test MulticlassPrecision()(yhat, y, class_w) ≈ p_macro_w
-    @test p_macro_w ≈
-        @test_logs((:warn, r"Using macro"),
-                   MulticlassPrecision(average=micro_avg)(yhat, y, class_w))
-    p_micro = (1 + 1 + 2 + 1)/(2 + 3 + 4 + 2)
-    @test MulticlassPrecision(average=micro_avg)(yhat, y) ≈ p_micro
-
-    # recall:
-    r_macro = mean([1/2, 1/3, 2/5, 1])
-    @test MulticlassRecall(average=macro_avg)(yhat, y) ≈ r_macro
-    r_macro_w = mean([7/2, 5/3, 4/5, 0/1])
-    @test MulticlassRecall(average=macro_avg)(yhat, y, class_w) ≈ r_macro_w
-    @test r_macro_w ≈
-        @test_logs((:warn, r"Using macro"),
-                   MulticlassRecall(average=micro_avg)(yhat, y, class_w))
-    r_micro = (1 + 1 + 2 + 1)/(2 + 3 + 5 + 1)
-    @test MulticlassPrecision(average=micro_avg)(yhat, y) ≈ r_micro
-
-    # fscore:
-    harm_means = [harmonic_mean(1/2, 1/2),
-                  harmonic_mean(1/3, 1/3),
-                  harmonic_mean(1/2, 2/5),
-                  harmonic_mean(1/2, 1)]
-    f1_macro = mean(harm_means)
-    @test MulticlassFScore(average=macro_avg)(yhat, y) ≈ f1_macro
-    @test MulticlassFScore(average=no_avg,
-                           return_type=Vector)(yhat, y, class_w) ≈
-        [7, 5, 2, 0] .* harm_means
-    f1_macro_w = mean([7, 5, 2, 0] .* harm_means)
-    @test MulticlassFScore(average=macro_avg)(yhat, y, class_w) ≈ f1_macro_w
-    @test f1_macro_w ≈
-        @test_logs((:warn, r"Using macro"),
-                   MulticlassFScore(average=micro_avg)(yhat, y, class_w))
-    f1_micro = harmonic_mean(p_micro, r_micro)
-    @test MulticlassFScore(average=micro_avg)(yhat, y) ≈ f1_micro
-
-    # fscore, β=1/3:
-    harm_means = [harmonic_mean(1/2, 1/2, beta=1/3),
-                  harmonic_mean(1/3, 1/3, beta=1/3),
-                  harmonic_mean(1/2, 2/5, beta=1/3),
-                  harmonic_mean(1/2, 1, beta=1/3)]
-    f1_macro = mean(harm_means)
-    @test MulticlassFScore(β=1/3, average=macro_avg)(yhat, y) ≈ f1_macro
-    @test MulticlassFScore(β=1/3,
-                           average=no_avg,
-                           return_type=Vector)(yhat, y, class_w) ≈
-        [7, 5, 2, 0] .* harm_means
-    f1_macro_w = mean([7, 5, 2, 0] .* harm_means)
-    @test MulticlassFScore(β=1/3,
-                           average=macro_avg)(yhat, y, class_w) ≈ f1_macro_w
-    @test f1_macro_w ≈
-        @test_logs((:warn, r"Using macro"),
-                   MulticlassFScore(β=1/3,
-                                    average=micro_avg)(yhat, y, class_w))
-    f1_micro = harmonic_mean(p_micro, r_micro, beta=1/3)
-    @test MulticlassFScore(β=1/3, average=micro_avg)(yhat, y) ≈ f1_micro
-end
-
-@testset "docstrings coverage" begin
-    @test startswith(info(BrierScore()).docstring, "`BrierScore`")
-end
diff --git a/test/measures/loss_functions_interface.jl b/test/measures/loss_functions_interface.jl
deleted file mode 100644
index 8c59945b..00000000
--- a/test/measures/loss_functions_interface.jl
+++ /dev/null
@@ -1,68 +0,0 @@
-rng = StableRNG(614)
-
-# convert a Binary vector into vector of +1 or -1 values
-# (for testing only):
-pm1(y) = Int8(2) .* (Int8.(MLJBase.int(y))) .- Int8(3)
-
-const MARGIN_LOSSES = MLJBase.MARGIN_LOSSES
-const DISTANCE_LOSSES = MLJBase.DISTANCE_LOSSES
-
-# using `WeightedSum` instead of `WeightedMean`; see
-# https://github.com/JuliaML/LossFunctions.jl/issues/149
-WeightedSum(w) = LossFunctions.AggMode.WeightedMean(w, normalize=false)
-
-@testset "naked" begin
-    @test MLJBase.naked(MLJBase.LossFunctions.PeriodicLoss{Float64}) ==
-        :PeriodicLoss
-    @test MLJBase.naked(MLJBase.LossFunctions.PeriodicLoss) ==
-        :PeriodicLoss
-end
-
-@testset "LossFunctions.jl - binary" begin
-    y = categorical(["yes", "yes", "no", "yes"])
-    yes, no = y[1], y[3]
-    dyes = MLJBase.UnivariateFinite([yes, no], [0.6, 0.4])
-    dno = MLJBase.UnivariateFinite([yes, no], [0.3, 0.7])
-    yhat = [dno, dno, dyes, dyes]
-    w = [1, 2, 3, 4]
-
-    @test MLJBase.ZeroOneLoss()(yhat, y) ≈ [1, 1, 1, 0]
-    @test MLJBase.zero_one_loss(yhat,y, w) ≈ [1, 2, 3, 0]
-
-    N = 10
-    y = categorical(rand(rng, ["yes", "no"], N), ordered=true)
-    levels!(y, ["no", "yes"])
-    no, yes = MLJBase.classes(y[1])
-    @test pm1([yes, no]) in [[+1, -1], [-1, +1]]
-    ym = pm1(y) # observations for raw LossFunctions measure
-    p_vec = rand(N)
-    yhat = MLJBase.UnivariateFinite([no, yes], p_vec, augment=true)
-    yhatm = MLJBase._scale.(p_vec) # predictions for raw LossFunctions measure
-    w = rand(rng, N)
-
-    for M_ex in MARGIN_LOSSES
-        m = eval(:(MLJBase.$M_ex()))
-        @test m(yhat, y) ≈ (getfield(m, :loss)).(yhatm, ym)
-        @test m(yhat, y, w) ≈
-            w .* (getfield(m, :loss)).(yhatm, ym)
-    end
-end
-
-@testset "LossFunctions.jl - continuous" begin
-    # losses for continuous targets:
-    N = 10
-    y = randn(rng, N)
-    yhat = randn(rng, N)
-    X = nothing
-    w = rand(rng, N)
-
-    for M_ex in DISTANCE_LOSSES
-        m = eval(:(MLJBase.$M_ex()))
-        m_ex = MLJBase.snakecase(M_ex)
-        @test m == eval(:(MLJBase.$m_ex))
-        @test m(yhat, y) ≈
-            (getfield(m, :loss)).(yhat, y)
-        @test m(yhat ,y, w) ≈
-            w .* (getfield(m, :loss)).(yhat, y)
-    end
-end
diff --git a/test/measures/measure_search.jl b/test/measures/measure_search.jl
deleted file mode 100644
index f8aa5e4d..00000000
--- a/test/measures/measure_search.jl
+++ /dev/null
@@ -1,42 +0,0 @@
-ms = map(measures()) do m
-    m.name
-end
-@test "LogLoss" in ms
-@test "RootMeanSquaredError" in ms
-
-# test `M()` makes sense for all measure types `M` extracted from `name`,
-@test all(Symbol.(ms)) do ex
-    try
-        eval(:($ex()))
-        true
-    catch
-        false
-    end
-end
-
-S = AbstractVector{Union{Missing,Multiclass{3}}}
-task(m) = S <: m.target_scitype
-
-ms = map(measures(task)) do m
-    m.name
-end
-
-@test "LogLoss" in ms
-@test !("RootMeanSquaredError" in ms)
-
-task(m) = AbstractVector{Continuous} <: m.target_scitype
-
-ms = map(measures(task)) do m
-    m.name
-end
-
-@test !("Accuracy" in ms)
-@test "RootMeanSquaredError" in ms
-
-ms = map(measures("Brier")) do m
-    m.name
-end
-
-@test Set(ms) == Set(["BrierLoss", "BrierScore"])
-
-true
diff --git a/test/measures/measures.jl b/test/measures/measures.jl
deleted file mode 100644
index 602c3e78..00000000
--- a/test/measures/measures.jl
+++ /dev/null
@@ -1,134 +0,0 @@
-module TestMeasures
-
-using MLJBase, Test
-import Distributions
-using CategoricalArrays
-using Statistics
-import LossFunctions
-using StableRNGs
-using OrderedCollections: LittleDict
-
-rng = StableRNGs.StableRNG(123)
-
-@testset "aggregation" begin
-    v = rand(5)
-    @test aggregate(v, mae) ≈ mean(v)
-    @test aggregate(v, TruePositive()) ≈ sum(v)
-    @test aggregate(v, rms) ≈ sqrt(mean(v.^2))
-    λ = rand()
-    @test aggregate(λ, rms) === λ
-    @test aggregate(aggregate(v, l2), l2) == aggregate(v, l2)
-    m = LittleDict([0, 1, 2, 3, 4], v)
-    @test aggregate(m, MTPR()) == mean(v)
-end
-
-@testset "metadata" begin
-    measures()
-    measures(m -> m.target_scitype <: AbstractVector{<:Finite} &&
-             m.supports_weights)
-    info(rms)
-    @test true
-end
-
-@testset "coverage" begin
-    # just checking that the traits work not that they're correct
-    @test orientation(BrierScore()) == :score
-    @test orientation(auc) == :score
-    @test orientation(rms) == :loss
-
-    @test reports_each_observation(auc) == false
-    @test is_feature_dependent(auc) == false
-
-    @test MLJBase.distribution_type(auc) == MLJBase.UnivariateFinite
-end
-
-@testset "MLJBase.value" begin
-    yhat = randn(rng,5)
-    X = (weight=randn(rng,5), x1 = randn(rng,5))
-    y = randn(rng,5)
-    w = randn(rng,5)
-
-    @test MLJBase.value(mae, yhat, nothing, y, nothing) ≈ mae(yhat, y)
-    @test MLJBase.value(mae, yhat, nothing, y, w) ≈ mae(yhat, y, w)
-
-    spooky(yhat, y) = abs.(yhat - y) |> mean
-    @test MLJBase.value(spooky, yhat, nothing, y, nothing) ≈ mae(yhat, y)
-
-    cool(yhat, y, w) = abs.(yhat - y) .* w |> mean
-    MLJBase.supports_weights(::Type{typeof(cool)}) = true
-    @test MLJBase.value(cool, yhat, nothing, y, w) ≈ mae(yhat, y, w)
-
-    funky(yhat, X, y) = X.weight .* abs.(yhat - y) |> mean
-    MLJBase.is_feature_dependent(::Type{typeof(funky)}) = true
-    @test MLJBase.value(funky, yhat, X, y, nothing) ≈ mae(yhat, y, X.weight)
-
-    weird(yhat, X, y, w) = w .* X.weight .* abs.(yhat - y) |> mean
-    MLJBase.is_feature_dependent(::Type{typeof(weird)}) = true
-    MLJBase.supports_weights(::Type{typeof(weird)}) = true
-    @test MLJBase.value(weird, yhat, X, y, w) ≈ mae(yhat, y, X.weight .* w)
-end
-
-mutable struct DRegressor <: Deterministic end
-MLJBase.target_scitype(::Type{<:DRegressor}) =
-    AbstractVector{<:Continuous}
-
-mutable struct D2Regressor <: Deterministic end
-MLJBase.target_scitype(::Type{<:D2Regressor}) =
-    AbstractVector{Continuous}
-
-mutable struct DClassifier <: Deterministic end
-MLJBase.target_scitype(::Type{<:DClassifier}) =
-    AbstractVector{<:Finite}
-
-mutable struct PClassifier <: Probabilistic end
-MLJBase.target_scitype(::Type{<:PClassifier}) =
-    AbstractVector{<:Finite}
-
-mutable struct PRegressor <: Probabilistic end
-MLJBase.target_scitype(::Type{<:PRegressor}) =
-    AbstractVector{<:Continuous}
-
-mutable struct PCountRegressor <: Probabilistic end
-MLJBase.target_scitype(::Type{<:PCountRegressor}) =
-    AbstractVector{<:Count}
-
-@testset "default_measure" begin
-    @test MLJBase.default_measure(DRegressor()) == rms
-    @test MLJBase.default_measure(D2Regressor()) == rms
-    @test MLJBase.default_measure(DClassifier()) == misclassification_rate
-    @test MLJBase.default_measure(PClassifier()) == log_loss
-
-    @test MLJBase.default_measure(DRegressor) == rms
-    @test MLJBase.default_measure(D2Regressor) == rms
-    @test MLJBase.default_measure(DClassifier) == misclassification_rate
-    @test MLJBase.default_measure(PClassifier) == log_loss
-
-    @test MLJBase.default_measure(PRegressor) == log_loss
-    @test MLJBase.default_measure(PCountRegressor) == log_loss
-end
-
-include("confusion_matrix.jl")
-include("roc.jl")
-include("continuous.jl")
-include("finite.jl")
-include("probabilistic.jl")
-include("loss_functions_interface.jl")
-
-@testset "show method for measures" begin
-    io = IOBuffer()
-    for meta in measures()
-        m = eval(Meta.parse("$(meta.name)()"))
-        show(io, MIME("text/plain"), m)
-        show(io, m)
-    end
-end
-
-@testset "missing and NaN values in aggregation" begin
-    v =[1, 2, missing, 5, NaN]
-    @test MLJBase.Sum()(v) == 8
-    @test MLJBase.RootMeanSquare()(v) ≈ sqrt((1 + 4 + 25)/3)
-    @test MLJBase.Mean()(Union{Missing,Float32}[]) |> isnan
-end
-
-end
-true
diff --git a/test/measures/probabilistic.jl b/test/measures/probabilistic.jl
deleted file mode 100644
index 733c0d20..00000000
--- a/test/measures/probabilistic.jl
+++ /dev/null
@@ -1,174 +0,0 @@
-rng = StableRNG(51803)
-using LinearAlgebra
-
-const Vec = AbstractVector
-
-@testset "AUC" begin
-    # this is random binary and random scores generated with numpy
-    # then using roc_auc_score from sklearn to get the AUC
-    # we check that we recover a comparable AUC and that it's invariant
-    # to ordering.
-    c = ["neg", "pos"]
-    y = categorical(c[[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
-                       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
-                       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
-                       1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
-                       1, 0] .+ 1])
-    probs = [
-        0.90237535, 0.41276349, 0.94511611, 0.08390761, 0.55847392,
-        0.26043136, 0.78565351, 0.20133953, 0.7404382 , 0.15307601,
-        0.59596716, 0.8169512 , 0.88200483, 0.23321489, 0.94050483,
-        0.27593662, 0.60702176, 0.36427036, 0.35481784, 0.06416543,
-        0.45576954, 0.12354048, 0.79830435, 0.15799818, 0.20981099,
-        0.43451663, 0.24020098, 0.11401055, 0.25785748, 0.86490263,
-        0.75715379, 0.06550534, 0.12628999, 0.18878245, 0.1283757 ,
-        0.76542903, 0.8780248 , 0.86891113, 0.24835709, 0.06528076,
-        0.72061354, 0.89451634, 0.95634394, 0.07555979, 0.16345437,
-        0.43498831, 0.37774708, 0.31608861, 0.41369339, 0.95691113]
-
-    ŷ = UnivariateFinite(y[1:2], probs, augment=true)
-    # ŷ = [UnivariateFinite(y[1:2], [1.0 - p, p]) for p in [
-    #     0.90237535, 0.41276349, 0.94511611, 0.08390761, 0.55847392,
-    #     0.26043136, 0.78565351, 0.20133953, 0.7404382 , 0.15307601,
-    #     0.59596716, 0.8169512 , 0.88200483, 0.23321489, 0.94050483,
-    #     0.27593662, 0.60702176, 0.36427036, 0.35481784, 0.06416543,
-    #     0.45576954, 0.12354048, 0.79830435, 0.15799818, 0.20981099,
-    #     0.43451663, 0.24020098, 0.11401055, 0.25785748, 0.86490263,
-    #     0.75715379, 0.06550534, 0.12628999, 0.18878245, 0.1283757 ,
-    #     0.76542903, 0.8780248 , 0.86891113, 0.24835709, 0.06528076,
-    #     0.72061354, 0.89451634, 0.95634394, 0.07555979, 0.16345437,
-    #     0.43498831, 0.37774708, 0.31608861, 0.41369339, 0.95691113]]
-    @test isapprox(auc(ŷ, y), 0.455716, rtol=1e-4)
-    ŷ_unwrapped = [ŷ...]
-    @test isapprox(auc(ŷ_unwrapped, y), 0.455716, rtol=1e-4)
-
-    # reversing the roles of positive and negative should return very
-    # similar score
-    y2 = deepcopy(y);
-    levels!(y2, reverse(levels(y2)));
-    @test y == y2
-    @test levels(y) != levels(y2)
-    ŷ2 = UnivariateFinite(y2[1:2], probs, augment=true) # same probs
-    @test isapprox(auc(ŷ2, y2), auc(ŷ, y), rtol=1e-4)
-
-    # The auc algorithm should be able to handle the case where two or more
-    # samples in the prediction vector has the same UnivariateFinite distribution
-    # We check this by comparing our auc with that gotten from roc_auc_score from sklearn.
-    y = categorical(["class_1","class_1","class_0","class_0","class_1","class_1","class_0"])
-    ŷ = UnivariateFinite(levels(y), [0.8,0.7,0.5,0.5,0.5,0.5,0.3], augment=true, pool=y)
-    # We can see that ŷ[3] ≈ ŷ[4] ≈ ŷ[5] ≈ ŷ[6]
-    @test isapprox(auc(ŷ, y), 0.8333333333333334, rtol=1e-16)
-end
-
-@testset "Log, Brier, Spherical - finite case" begin
-    y = categorical(collect("abb"))
-    L = [y[1], y[2]]
-    d1 = UnivariateFinite(L, [0.1, 0.9]) # a
-    d2 = UnivariateFinite(L, Float32[0.4, 0.6]) # b
-    d3 = UnivariateFinite(L, [0.2, 0.8]) # b
-    yhat = [d1, d2, d3]
-    ym = vcat(y, [missing,])
-    yhatm = vcat(yhat, [d3, ])
-
-    @test mean(log_loss(yhat, y)) ≈
-        Float32(-(log(0.1) + log(0.6) + log(0.8))/3)
-    @test mean(skipmissing(log_loss(yhatm, ym))) ≈
-        Float32(-(log(0.1) + log(0.6) + log(0.8))/3)
-    yhat = UnivariateFinite(L, [0.1 0.9;
-                                0.4 0.6;
-                                0.2 0.8])
-    @test isapprox(mean(log_loss(yhat, y)),
-                   -(log(0.1) + log(0.6) + log(0.8))/3, atol=eps(Float32))
-
-    @test log_score(yhat, y) ≈ -log_loss(yhat, y)
-
-    # sklearn test
-    # >>> from sklearn.metrics import log_loss
-    # >>> log_loss(["spam", "ham", "ham", "spam","ham","ham"],
-    #              [[.1, .9], [.9, .1], [.8, .2], [.35, .65], [0.2, 0.8], [0.3,0.7]])
-    # 0.6130097025803921
-    y2 = categorical(["spam", "ham", "ham", "spam", "ham", "ham"])
-    L2 = classes(y2[1])
-    probs = vcat([.1 .9], [.9 .1], [.8 .2], [.35 .65], [0.2 0.8], [0.3 0.7])
-    yhat2 = UnivariateFinite(L2, probs)
-    y2m = vcat(y2, [missing,])
-    yhat2m = UnivariateFinite(L2, vcat(probs, [0.1 0.9]))
-    @test mean(log_loss(yhat2, y2)) ≈ 0.6130097025803921
-    @test mean(skipmissing(log_loss(yhat2, y2))) ≈ 0.6130097025803921
-
-    ## Brier
-    scores = BrierScore()(yhat, y)
-    @test size(scores) == size(y)
-    @test Float32.(scores) ≈ [-1.62, -0.32, -0.08]
-    scoresm = BrierScore()(yhatm, ym)
-    @test Float32.((scoresm)[1:3]) ≈ [-1.62, -0.32, -0.08]
-    @test ismissing(scoresm[end])
-    # test specialized broadcasting on brierloss
-    @test BrierLoss()(yhat, y) == -BrierScore()(yhat, y)
-    # sklearn test
-    # >>> from sklearn.metrics import brier_score_loss
-    # >>> brier_score_loss([1, 0, 0, 1, 0, 0], [.9, .1, .2, .65, 0.8, 0.7])
-    # 0.21875 NOTE: opposite orientation
-    @test -mean(BrierScore()(yhat2, y2)) / 2 ≈ 0.21875
-    probs2 = [[.1, .9], [Float32(0.9), Float32(1) - Float32(0.9)], [.8, .2],
-              [.35, .65], [0.2, 0.8], [0.3, 0.7]]
-    yhat3 = [UnivariateFinite(L2, prob) for prob in probs2]
-    @test -mean(BrierScore()(yhat3, y2) / 2) ≈ 0.21875
-    @test mean(BrierLoss()(yhat3, y2) / 2) ≈ -mean(BrierScore()(yhat3, y2) / 2)
-
-    # Spherical
-    s = SphericalScore() # SphericalScore(2)
-    norms = [norm(probs[i,:]) for i in 1:size(probs, 1)]
-    @test (pdf.(yhat2, y2) ./ norms) ≈ s(yhat2, y2)
-    # non-performant version:
-    yhat4 = [yhat2...]
-    @test (pdf.(yhat2, y2) ./ norms) ≈ s(yhat4, y2)
-end
-
-@testset "LogScore, BrierScore, SphericalScore - infinite case" begin
-    uniform = Distributions.Uniform(2, 5)
-    betaprime = Distributions.BetaPrime()
-    discrete_uniform = Distributions.DiscreteUniform(2, 5)
-    w = [2, 3]
-
-    # brier
-    yhat = [missing, uniform]
-    @test isapprox(brier_score(yhat, [1.0, 1.0]) |> last, -1/3)
-    @test isapprox(brier_score(yhat, [NaN, 4.0]) |> last, 1/3)
-    @test isapprox(brier_score(yhat, [1.0, 1.0], w) |> last, -1)
-    yhat = [missing, uniform]
-    # issue https://github.com/JuliaStats/Distributions.jl/issues/1392
-    @test_broken isapprox(brier_score(yhat, [missing, 4.0], w), [1,])
-    yhat = [discrete_uniform, discrete_uniform]
-    @test isapprox(brier_score(yhat, [NaN, 1.0]), [-1/4, -1/4,])
-    @test isapprox(brier_score(yhat, [4.0, 4.0]), [1/4, 1/4,])
-
-    # spherical
-    yhat = [uniform, uniform]
-    @test isapprox(spherical_score(yhat, [1.0, 1.0]), [0, 0])
-    @test isapprox(spherical_score(yhat, [NaN, 4.0]), [0, 1/sqrt(3),])
-    # issue https://github.com/JuliaStats/Distributions.jl/issues/1392
-    @test_broken isapprox(spherical_score(yhat, [missing, 4.0], w), [sqrt(3),])
-    @test isapprox(spherical_score(yhat, [4.0, 4.0], w), [2/sqrt(3), sqrt(3),])
-    yhat = [discrete_uniform, discrete_uniform]
-    @test isapprox(spherical_score(yhat, [NaN, 1.0]), [0, 0])
-    @test isapprox(spherical_score(yhat, [4.0, 4.0]), [1/2, 1/2])
-
-    # log
-    yhat = [uniform, uniform]
-    @test isapprox(log_score(yhat, [4.0, 4.0]), [-log(3), -log(3),])
-    @test isapprox(log_score(yhat, [4.0, 4.0], w), [-2*log(27)/3, -log(27)])
-    yhat = [discrete_uniform, discrete_uniform]
-    # issue https://github.com/JuliaStats/Distributions.jl/issues/1392
-    @test_broken isapprox(log_score(yhat, [missing, 4.0]), [-log(4),])
-
-    log_score([missing, uniform], [4.0, 4.0])
-
-    # errors
-    @test_throws(MLJBase.err_l2_norm(brier_score),
-                 brier_score([betaprime, betaprime], [1.0, 1.0]))
-    s = SphericalScore(alpha=1)
-    @test_throws MLJBase.ERR_UNSUPPORTED_ALPHA s(yhat, [1.0, 1.0])
-end
-
-true
diff --git a/test/measures/roc.jl b/test/measures/roc.jl
deleted file mode 100644
index aaaed8b7..00000000
--- a/test/measures/roc.jl
+++ /dev/null
@@ -1,13 +0,0 @@
-@testset "ROC" begin
-    y = [  0   0   0   1   0   1   1   0] |> vec |> categorical
-    s = [0.0 0.1 0.1 0.1 0.2 0.2 0.5 0.5] |> vec
-    ŷ = UnivariateFinite([0, 1], s, augment=true, pool=y)
-
-    fprs, tprs, ts = roc(ŷ, y)
-
-    sk_fprs = [0. , 0.2, 0.4, 0.8, 1. ]
-    sk_tprs = [0. , 0.33333333, 0.66666667, 1., 1.]
-
-    @test fprs ≈ sk_fprs
-    @test tprs ≈ sk_tprs
-end
diff --git a/test/preliminaries.jl b/test/preliminaries.jl
index b806a840..bffc1f4e 100644
--- a/test/preliminaries.jl
+++ b/test/preliminaries.jl
@@ -12,12 +12,8 @@ using Distributed
 addprocs(; exeflags="--project=$(Base.active_project())")
 @info "nprocs() = $(nprocs())"
-@static if VERSION >= v"1.3.0-DEV.573"
-    import .Threads
-    @info "nthreads() = $(Threads.nthreads())"
-else
-    @info "Running julia $(VERSION). Multithreading tests excluded."
-end
+import .Threads
+@info "nthreads() = $(Threads.nthreads())"
 @everywhere begin
     using MLJModelInterface
@@ -27,6 +23,7 @@ end
     using Logging
     using ComputationalResources
    using StableRNGs
+    using StatisticalMeasures
 end
 import TypedTables
diff --git a/test/resampling.jl b/test/resampling.jl
index c170039a..27850375 100644
--- a/test/resampling.jl
+++ b/test/resampling.jl
@@ -5,6 +5,9 @@ import ComputationalResources: CPU1, CPUProcesses, CPUThreads
 using .TestUtilities
 using ProgressMeter
 import Tables
+@everywhere import StatisticalMeasures.StatisticalMeasuresBase as API
+using StatisticalMeasures
+import LearnAPI
 @everywhere begin
     using .Models
@@ -25,13 +28,18 @@ struct DummyInterval <: Interval end
 dummy_interval=DummyInterval()
 dummy_measure_det(yhat, y) = 42
-MLJBase.target_scitype(::typeof(dummy_measure_det)) = Table(MLJBase.Textual)
-MLJBase.prediction_type(::typeof(dummy_measure_det)) = :deterministic
-
-dummy_measure_interval(yhat, y) = [123, 456]
-MLJBase.target_scitype(::typeof(dummy_measure_interval)) =
-    Table(MLJBase.Textual)
-MLJBase.prediction_type(::typeof(dummy_measure_interval)) = :interval
+API.@trait(
+    typeof(dummy_measure_det),
+    observation_scitype = MLJBase.Textual,
+    kind_of_proxy = LearnAPI.LiteralTarget(),
+)
+
+dummy_measure_interval(yhat, y) = 42
+API.@trait(
+    typeof(dummy_measure_interval),
+    observation_scitype = MLJBase.Textual,
+    kind_of_proxy = LearnAPI.ConfidenceInterval(),
+)
 @testset "_actual_operations" begin
     clf = ConstantClassifier()
@@ -49,7 +57,7 @@ MLJBase.prediction_type(::typeof(dummy_measure_interval)) = :interval
                  1) == [predict_mean, predict_mean]
-    # handling of a measure with `:unknown` `prediction_type` (eg,
+    # handling of a measure with `nothing` `kind_of_proxy` (eg,
     # custom measure):
     my_mae(yhat, y) = abs.(yhat - y)
    @test(
@@ -71,21 +79,29 @@ MLJBase.prediction_type(::typeof(dummy_measure_interval)) = :interval
        [predict_mode])
     @test MLJBase._actual_operations(nothing, [l2,], rgs, 1) ==
         [predict_mean, ]
-    @test_throws(MLJBase.err_incompatible_prediction_types(clf_det, LogLoss()),
-                 MLJBase._actual_operations(nothing, [LogLoss(),], clf_det, 1))
+    @test_throws(
+        MLJBase.err_incompatible_prediction_types(clf_det, LogLoss()),
+        MLJBase._actual_operations(nothing, [LogLoss(),], clf_det, 1),
+    )
     @test MLJBase._actual_operations(nothing, measures_det, clf_det, 1) ==
         [predict, predict]
-    # measure/model differ in prediction type but weird target_scitype:
+    # measure/model differ in prediction type:
     @test_throws(
         MLJBase.err_ambiguous_operation(clf, dummy_measure_det),
-        MLJBase._actual_operations(nothing, [dummy_measure_det, ], clf, 1))
+        MLJBase._actual_operations(nothing, [dummy_measure_det, ], clf, 1),
+    )
     # measure has :interval prediction type but model does not (2 cases):
     @test_throws(
         MLJBase.err_ambiguous_operation(clf, dummy_measure_interval),
-        MLJBase._actual_operations(nothing,
-                                   [dummy_measure_interval, ], clf, 1))
+        MLJBase._actual_operations(
+            nothing,
+            [dummy_measure_interval, ],
+            clf,
+            1,
+        ),
+    )
     @test_throws(
         MLJBase.err_ambiguous_operation(clf_det, dummy_measure_interval),
         MLJBase._actual_operations(nothing,
                                    [dummy_measure_interval, ], clf_det, 1))
@@ -103,16 +119,6 @@ MLJBase.prediction_type(::typeof(dummy_measure_interval)) = :interval
                  [LogLoss(), ], dummy_interval, 1))
 end
-@testset "_feature_dependencies_exist" begin
-    measures = Any[rms, rsq, log_loss, brier_score]
-    @test !MLJBase._feature_dependencies_exist(measures)
-    my_feature_dependent_loss(ŷ, X, y) =
-        sum(abs.(ŷ - y) .* X.penalty)/sum(X.penalty);
-MLJBase.is_feature_dependent(::typeof(my_feature_dependent_loss)) = true
-    push!(measures, my_feature_dependent_loss)
-    @test MLJBase._feature_dependencies_exist(measures)
-end
 @testset_accelerated "dispatch of resources and progress meter" accel begin
     @info "Checking progress bars:"
@@ -175,34 +181,50 @@ end
     y = rand(rng,4)
     # model prediction type is Probabilistic but measure is Deterministic:
-    @test_throws(ArgumentError,
-                 MLJBase._check_measure(rms, predict, model, y))
+    @test_throws(
+        MLJBase.ERR_MEASURES_PROBABILISTIC(rms, MLJBase.LOG_SUGGESTION2),
+        MLJBase._check_measure(rms, predict, model, y),
+    )
     @test MLJBase._check_measure(rms, predict_mean, model, y)
     @test MLJBase._check_measure(rms, predict_median, model, y)
-    # has `y` `Finite` elscityp but measure `rms` is for `Continuous`:
+    # has `y` `Finite` elscitype but measure `rms` is for `Continuous`:
     y=categorical(collect("abc"))
-    @test_throws(ArgumentError,
-                 MLJBase._check_measure(rms, predict_median, model, y))
+    @test_throws(
+        MLJBase.ERR_MEASURES_OBSERVATION_SCITYPE(
+            rms,
+            Union{Missing,Infinite},
+            Multiclass{3},
+        ),
+        MLJBase._check_measure(rms, predict_median, model, y),
+    )
     model = ConstantClassifier()
     # model prediction type is Probabilistic but measure is Deterministic:
-    @test_throws(ArgumentError,
-                 MLJBase._check_measure(mcr, predict, model, y))
+    @test_throws(
+        MLJBase.ERR_MEASURES_PROBABILISTIC(mcr, MLJBase.LOG_SUGGESTION1),
+        MLJBase._check_measure(mcr, predict, model, y),
+    )
     @test MLJBase._check_measure(mcr, predict_mode, model, y)
     # `Deterministic` model but `Probabilistic` measure:
     model = DeterministicConstantClassifier()
-    @test_throws(ArgumentError,
-                 MLJBase._check_measure(cross_entropy, predict, model, y))
+    @test_throws(
+        MLJBase.ERR_MEASURES_DETERMINISTIC(cross_entropy),
+        MLJBase._check_measure(cross_entropy, predict, model, y),
+    )
     # measure with wrong target_scitype:
-    @test_throws(ArgumentError,
-                 MLJBase._check_measures([brier_score, rms],
-                                         [predict_mode, predict_mean],
-                                         model, y))
+    @test_throws(
+        MLJBase.ERR_MEASURES_DETERMINISTIC(brier_score),
+        MLJBase._check_measures(
+            [brier_score, rms],
+            [predict_mode, predict_mean],
+            model, y,
+        ),
+    )
     model = ConstantClassifier()
     @test MLJBase._check_measures([brier_score, cross_entropy, accuracy],
                                   [predict_mode,
@@ -211,8 +233,6 @@ end
 @testset "check weights" begin
-    @test_throws(MLJBase.ERR_WEIGHTS_REAL,
-                 MLJBase._check_weights([:junk, :junk], 2))
     @test_throws(MLJBase.ERR_WEIGHTS_LENGTH,
                  MLJBase._check_weights([0.5, 0.5], 3))
     @test MLJBase._check_weights([0.5, 0.5], 2)
@@ -227,18 +247,18 @@ end
     @test MLJBase._check_class_weights(w, ['b', 'a'])
 end
+@everywhere begin
+    user_rms(yhat, y) = mean((yhat - y).^2) |> sqrt
+    # deliberately omitting `consumes_multiple_observations` trait:
+    API.@trait typeof(user_rms) kind_of_proxy=LearnAPI.LiteralTarget()
+end
+
 @testset_accelerated "folds specified" accel begin
     x1 = ones(10)
     x2 = ones(10)
     X = (x1=x1, x2=x2)
     y = [1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0]
-    my_rms(yhat, y) = sqrt(mean((yhat -y).^2))
-    my_mae(yhat, y) = abs.(yhat - y)
-    MLJBase.reports_each_observation(::typeof(my_mae)) = true
-    MLJBase.prediction_type(::typeof(my_rms)) = :deterministic
-    MLJBase.prediction_type(::typeof(my_mae)) = :deterministic
-
     resampling = [(3:10, 1:2),
                   ([1, 2, 5, 6, 7, 8, 9, 10], 3:4),
                   ([1, 2, 3, 4, 7, 8, 9, 10], 5:6),
@@ -251,19 +271,27 @@ end
     mach = machine(model, X, y, cache=cache)
     # check detection of incompatible measure (cross_entropy):
-    @test_throws ArgumentError evaluate!(mach, resampling=resampling,
-                                         measure=[cross_entropy, rmslp1],
-                                         verbosity=verb,
-                                         acceleration=accel)
+    @test_throws(
+        MLJBase.err_incompatible_prediction_types(model, cross_entropy),
+        evaluate!(
+            mach,
+            resampling=resampling,
+            measure=[cross_entropy, rmslp1],
+            verbosity=verb,
+            acceleration=accel,
+        ),
+    )
     result = evaluate!(mach, resampling=resampling, verbosity=verb,
-                       measure=[my_rms, my_mae, rmslp1], acceleration=accel)
+                       measure=[user_rms, mae, rmslp1], acceleration=accel)
     v = [1/2, 3/4, 1/2, 3/4, 1/2]
     @test result.per_fold[1] ≈ v
     @test result.per_fold[2] ≈ v
     @test result.per_fold[3][1] ≈ abs(log(2) - log(2.5))
-    @test ismissing(result.per_observation[1])
+    @test result.per_observation[1] ≈ map(result.per_fold[1]) do μ
+        fill(μ, 2)
+    end
     @test result.per_observation[2][1] ≈ [1/2, 1/2]
     @test result.per_observation[2][2] ≈ [3/4, 3/4]
     @test result.measurement[1] ≈ mean(v)
     @test result.measurement[2] ≈ mean(v)
@@ -276,6 +304,42 @@ end
     end
 end
+@testset "folds specified - per_observation=false" begin
+    accel = CPU1()
+    cache = true
+    x1 = ones(10)
+    x2 = ones(10)
+    X = (x1=x1, x2=x2)
+    y = [1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0]
+
+    resampling = [(3:10, 1:2),
+                  ([1, 2, 5, 6, 7, 8, 9, 10], 3:4),
+                  ([1, 2, 3, 4, 7, 8, 9, 10], 5:6),
+                  ([1, 2, 3, 4, 5, 6, 9, 10], 7:8),
+                  (1:8, 9:10)]
+
+    model = DeterministicConstantRegressor()
+    mach = machine(model, X, y, cache=cache)
+
+    result = evaluate!(mach, resampling=resampling, verbosity=verb,
+                       measure=[user_rms, mae, rmslp1], acceleration=accel,
+                       per_observation=false)
+
+    v = [1/2, 3/4, 1/2, 3/4, 1/2]
+
+    @test result.per_fold[1] ≈ v
+    @test result.per_fold[2] ≈ v
+    @test result.per_fold[3][1] ≈ abs(log(2) - log(2.5))
+    @test result.per_observation isa Vector{Missing}
+    @test result.measurement[1] ≈ mean(v)
+    @test result.measurement[2] ≈ mean(v)
+
+    # fitted_params and report per fold:
+    @test map(fp->fp.fitresult, result.fitted_params_per_fold) ≈
+        [1.5, 1.25, 1.5, 1.25, 1.5]
+    @test all(isnothing, result.report_per_fold)
+end
+
 @testset "repeated resampling" begin
     x1 = ones(20)
     x2 = ones(20)
@@ -313,10 +377,11 @@ end
     model = Models.DeterministicConstantRegressor()
     for cache in [true, false]
         mach = machine(model, X, y, cache=cache)
+        # to see if a default measure is found:
+        evaluate!(mach, resampling=holdout, verbosity=verb,
+                  acceleration=accel)
         result = evaluate!(mach, resampling=holdout, verbosity=verb,
                            measure=[rms, rmslp1], acceleration=accel)
-        result = evaluate!(mach, resampling=holdout, verbosity=verb,
-                           acceleration=accel)
         @test result.measurement[1] ≈ 2/3
         # test direct evaluation of a model + data:
@@ -454,7 +519,7 @@ end
                     d for fold in folds])
 end
-@testset_accelerated "sample weights in evaluation" accel begin
+@testset_accelerated "weights in evaluation" accel begin
     # cv:
     x1 = ones(4)
     x2 = ones(4)
     X = (x1=x1, x2=x2)
     y = [1.0, 2.0, 3.0, 1.0]
@@ -483,7 +548,7 @@ end
     X, y = make_blobs(rng=rng)
     cv=CV(nfolds = 2)
     fold1, fold2 = partition(eachindex(y), 0.5)
-    m = MLJBase.MulticlassFScore()
+    m = MulticlassFScore()
     class_w = Dict(1=>1, 2=>2, 3=>3)
     model = Models.DeterministicConstantClassifier()
@@ -637,13 +702,6 @@ end
                           measure=misclassification_rate,
                           weights = fill(1, 100),
                           acceleration=accel, verbosity=verb))
-
-    @test_throws(ArgumentError,
-                 evaluate!(mach, resampling=Holdout(fraction_train=0.6),
-                           operation=predict_mode,
-                           measure=misclassification_rate,
-                           weights = fill('a', 5), acceleration=accel,
-                           verbosity=verb))
 end
 # resampling on a subset of all rows:
@@ -813,7 +871,7 @@ end
         operation=predict_mode,
         measure=ConfusionMatrix(),
         resampling=CV(),
-    )
+    );
     printed_evaluations = sprint(show, "text/plain", evaluations)
     @test contains(printed_evaluations, "N/A")
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 8b07929e..f6076565 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -25,10 +25,8 @@ end
     @test include("interface/data_utils.jl")
 end
-@conditional_testset "measures" begin
-    @test include("measures/measures.jl")
-    @test include("measures/measure_search.jl")
-    @test include("measures/doc_strings.jl")
+@conditional_testset "default_measures" begin
+    @test include("default_measures.jl")
 end
 @conditional_testset "resampling" begin
diff --git a/test/utilities.jl b/test/utilities.jl
index f9e40580..03be2877 100644
--- a/test/utilities.jl
+++ b/test/utilities.jl
@@ -171,5 +171,40 @@ end
     "sin, cos, tan, ..."
 end
+@testset "observation" begin
+    @test MLJBase.observation(AbstractVector{Count}) ==
+        Count
+    @test MLJBase.observation(AbstractVector{<:Count}) ==
+        Count
+    @test MLJBase.observation(AbstractVector{<:Union{Missing,Count}}) ==
+        Union{Missing,Count}
+    @test MLJBase.observation(AbstractMatrix{<:Count}) ==
+        AbstractVector{<:Count}
+    @test MLJBase.observation(AbstractMatrix{Union{Missing,Count}}) ==
+        AbstractVector{Union{Missing,Count}}
+    @test MLJBase.observation(AbstractMatrix{<:Union{Missing,Count}}) ==
+        AbstractVector{<:Union{Missing,Count}}
+    @test MLJBase.observation(Table(Count)) == AbstractVector{<:Count}
+end
+
+@testset "guess_observation_scitype" begin
+    @test MLJBase.guess_observation_scitype([missing, 1, 2, 3]) ==
+        Union{Missing, Count}
+    @test MLJBase.guess_observation_scitype(rand(3, 2)) ==
+        AbstractVector{Continuous}
+    @test MLJBase.guess_observation_scitype((x=rand(3), y=rand(Bool, 3))) ==
+        AbstractVector{Union{Continuous, Count}}
+    @test MLJBase.guess_observation_scitype((x=[missing, 1, 2], y=[1, 2, 3])) ==
+        Unknown
+    @test MLJBase.guess_observation_scitype(5) == Unknown
+end
+
+mutable struct DRegressor2 <: Deterministic end
+MLJBase.target_scitype(::Type{<:DRegressor2}) =
+    AbstractVector{<:Continuous}
+
+@test MLJBase.guess_model_target_observation_scitype(DRegressor2()) == Continuous
+
 end # module
 true
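Reviewer note (not part of the patch): the trait declarations in test/resampling.jl above replace the old `MLJBase.target_scitype`/`MLJBase.prediction_type` overloads for custom measures. What follows is a minimal sketch of the new pattern, modelled on the `user_rms` and `dummy_measure_det` declarations in those tests. The measure `user_mae` and the particular trait values chosen here are illustrative assumptions, not API mandated by this PR:

# Hypothetical sketch (assumes MLJBase with this PR, plus the
# StatisticalMeasures, StatisticalMeasuresBase and LearnAPI packages):
using MLJBase, Statistics
import StatisticalMeasures.StatisticalMeasuresBase as API
import LearnAPI

# a user-defined deterministic measure consuming vectors of predictions:
user_mae(yhat, y) = mean(abs.(yhat .- y))

# `kind_of_proxy = LearnAPI.LiteralTarget()` plays the role previously
# played by `prediction_type = :deterministic`, and `observation_scitype`
# that of the old `target_scitype` trait:
API.@trait(
    typeof(user_mae),
    observation_scitype = Union{Missing,Infinite},
    kind_of_proxy = LearnAPI.LiteralTarget(),
)

With these traits declared, `evaluate!` can infer an appropriate operation (`predict`, `predict_mean`, etc.) for the measure; a bare function without them has `kind_of_proxy == nothing` and is handled as in the `my_mae` case of the `_actual_operations` testset.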
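A companion sketch of the new `per_observation` keyword for `evaluate!`, exercised by the "folds specified - per_observation=false" testset. `MeanRegressor` is a hypothetical stand-in for the test suite's `DeterministicConstantRegressor`, and the data synthesizer call assumes MLJBase's built-in `make_regression`:

using MLJBase, StatisticalMeasures, Statistics
import MLJModelInterface as MMI

# trivial deterministic regressor that always predicts the training mean:
mutable struct MeanRegressor <: MMI.Deterministic end
MMI.fit(::MeanRegressor, verbosity, X, y) = (mean(y), nothing, nothing)
MMI.predict(::MeanRegressor, fitresult, Xnew) = fill(fitresult, MMI.nrows(Xnew))
MMI.target_scitype(::Type{<:MeanRegressor}) = AbstractVector{<:Continuous}

X, y = make_regression(100, 2)          # synthetic regression data
mach = machine(MeanRegressor(), X, y)

e = evaluate!(
    mach;
    resampling=CV(nfolds=5),
    measure=[rms, mae],
    per_observation=false,              # suppress per-observation measurements
    verbosity=0,
)

e.per_observation   # Vector{Missing}, as asserted in the new testset
e.measurement       # one aggregated value per measure
e.per_fold          # per-fold values are still computed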