diff --git a/src/composition/learning_networks/nodes.jl b/src/composition/learning_networks/nodes.jl
index 0733b211..07bd3ae0 100644
--- a/src/composition/learning_networks/nodes.jl
+++ b/src/composition/learning_networks/nodes.jl
@@ -409,14 +409,14 @@ of nodes, sources and other arguments.

### Examples

-```
-X = source(π)
-W = @node sin(X)
+```julia-repl
+julia> X = source(π)
+julia> W = @node sin(X)
julia> W()
0

-X = source(1:10)
-Y = @node selectrows(X, 3:4)
+julia> X = source(1:10)
+julia> Y = @node selectrows(X, 3:4)
julia> Y()
3:4

@@ -425,10 +425,10 @@ julia> Y(["one", "two", "three", "four"])
 "three"
 "four"

-X1 = source(4)
-X2 = source(5)
-add(a, b, c) = a + b + c
-N = @node add(X1, 1, X2)
+julia> X1 = source(4)
+julia> X2 = source(5)
+julia> add(a, b, c) = a + b + c
+julia> N = @node add(X1, 1, X2)
julia> N()
10

diff --git a/src/composition/learning_networks/signatures.jl b/src/composition/learning_networks/signatures.jl
index d49aace9..7ffadb7c 100644
--- a/src/composition/learning_networks/signatures.jl
+++ b/src/composition/learning_networks/signatures.jl
@@ -8,10 +8,10 @@

**Private method.**

-Return a dictionary of machines, keyed on model, for the all machines in the completed
-learning network for which `node` is the greatest lower bound. Only machines bound to
-symbolic models are included. Values are always vectors, even if they contain only a
-single machine.
+Return a dictionary of machines, keyed on model, for all machines in the
+completed learning network for which `node` is the greatest lower bound. Only
+machines bound to symbolic models are included. Values are always vectors,
+even if they contain only a single machine.

"""
function machines_given_model(node::AbstractNode)
@@ -35,14 +35,14 @@ attempt_scalarize(v) = length(v) == 1 ? v[1] : v

**Private method.**

-Given a dictionary of machine vectors, keyed on model names (symbols), broadcast `f` over
-each vector, and make the result, in the returned named tuple, the value associated with
-the corresponding model name as key.
+Given a dictionary of machine vectors, keyed on model names (symbols), broadcast
+`f` over each vector, and make the result, in the returned named tuple, the
+value associated with the corresponding model name as key.

Singleton vector values are scalarized, unless `scalarize = false`.

-If a value in the computed named tuple is `nothing`, or a vector of `nothing`s, then the
-entry is dropped from the tuple, unless `drop_nothings=false`.
+If a value in the computed named tuple is `nothing`, or a vector of `nothing`s,
+then the entry is dropped from the tuple, unless `drop_nothings=false`.

"""
function tuple_keyed_on_model(f, machines_given_model; scalarize=true, drop_nothings=true)
diff --git a/src/composition/models/stacking.jl b/src/composition/models/stacking.jl
index ec872c16..9f6b4121 100644
--- a/src/composition/models/stacking.jl
+++ b/src/composition/models/stacking.jl
@@ -337,12 +337,12 @@ internal_stack_report(
) = NamedTuple{}()

"""
-internal_stack_report(
-    m::Stack,
-    verbosity::Int,
-    y::AbstractNode,
-    folds_evaluations::Vararg{AbstractNode},
-)
+    internal_stack_report(
+        m::Stack,
+        verbosity::Int,
+        y::AbstractNode,
+        folds_evaluations::Vararg{AbstractNode},
+    )

When measure/measures is provided, the folds_evaluation will have been filled by `store_for_evaluation`. 
This function is not doing any heavy work (not constructing nodes
@@ -518,7 +518,7 @@ function oos_set(m::Stack{modelnames}, Xs::Source, ys::Source, tt_pairs) where m
end

#######################################
-################# Prefit #################
+################# Prefit ##############
#######################################

function prefit(m::Stack{modelnames}, verbosity::Int, X, y) where modelnames
@@ -564,8 +564,7 @@ const DOC_STACK =
    Stack(; metalearner=nothing, name1=model1, name2=model2, ..., keyword_options...)

Implements the two-layer generalized stack algorithm introduced by
-[Wolpert
-(1992)](https://www.sciencedirect.com/science/article/abs/pii/S0893608005800231)
+[Wolpert (1992)](https://www.sciencedirect.com/science/article/abs/pii/S0893608005800231)
and generalized by
[Van der Laan et al
(2007)](https://biostats.bepress.com/ucbbiostat/paper222/).
Returns an instance of type `ProbabilisticStack` or `DeterministicStack`,
diff --git a/src/composition/models/transformed_target_model.jl b/src/composition/models/transformed_target_model.jl
index 259cff97..9304b63c 100644
--- a/src/composition/models/transformed_target_model.jl
+++ b/src/composition/models/transformed_target_model.jl
@@ -61,7 +61,7 @@ const ERR_MODEL_UNSPECIFIED = ArgumentError(
    "Expecting atomic model as argument. None specified. "
)
const ERR_TRANSFORMER_UNSPECIFIED = ArgumentError(
-"You must specify `transformer=...`. ."
+    "You must specify `transformer=...`. "
)
const ERR_TOO_MANY_ARGUMENTS = ArgumentError(
    "At most one non-keyword argument, a model, allowed. "
@@ -123,7 +123,7 @@ y -> mode.(y))`.

A model that normalizes the target before applying ridge regression, with predictions
returned on the original scale:

-```
+```julia
@load RidgeRegressor pkg=MLJLinearModels
model = RidgeRegressor()
tmodel = TransformedTargetModel(model, transformer=Standardizer())
```

@@ -132,7 +132,7 @@ tmodel = TransformedTargetModel(model, transformer=Standardizer())
A model that applies a static `log` transformation to the data, again returning
predictions to the original scale:

-```
+```julia
tmodel2 = TransformedTargetModel(model, transformer=y->log.(y), inverse=z->exp.(z))
```

diff --git a/src/data/data.jl b/src/data/data.jl
index 76c40fc4..d3242807 100644
--- a/src/data/data.jl
+++ b/src/data/data.jl
@@ -104,23 +104,28 @@ corresponding `fractions` of `length(nrows(X))`, where valid fractions
are floats between 0 and 1 whose sum is less than one. The last
fraction is not provided, as it is inferred from the preceding ones.

-For "synchronized" partitioning of multiple objects, use the
-`multi=true` option described below.
+For synchronized partitioning of multiple objects, use the
+`multi=true` option.
-    julia> partition(1:1000, 0.8)
-    ([1,...,800], [801,...,1000])
+```julia-repl
+julia> partition(1:1000, 0.8)
+([1,...,800], [801,...,1000])

-    julia> partition(1:1000, 0.2, 0.7)
-    ([1,...,200], [201,...,900], [901,...,1000])
+julia> partition(1:1000, 0.2, 0.7)
+([1,...,200], [201,...,900], [901,...,1000])

-    julia> partition(reshape(1:10, 5, 2), 0.2, 0.4)
-    ([1 6], [2 7; 3 8], [4 9; 5 10])
+julia> partition(reshape(1:10, 5, 2), 0.2, 0.4)
+([1 6], [2 7; 3 8], [4 9; 5 10])

-    X, y = make_blobs() # a table and vector
-    Xtrain, Xtest = partition(X, 0.8, stratify=y)
+julia> X, y = make_blobs() # a table and vector
+julia> Xtrain, Xtest = partition(X, 0.8, stratify=y)
+```

-    (Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.8, rng=123, multi=true)
+Here's an example of synchronized partitioning of multiple objects:
+```julia-repl
+julia> (Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.8, rng=123, multi=true)
+```

## Keywords

@@ -209,7 +214,7 @@ Returns a tuple of tables/vectors with length one greater than the
number of supplied predicates, with the last component including all
previously unselected columns.

-```
+```julia-repl
julia> table = DataFrame(x=[1,2], y=['a', 'b'], z=[10.0, 20.0], w=["A", "B"])
2×4 DataFrame
 Row │ x      y     z        w
@@ -300,9 +305,11 @@ The method is curried, so that `restrict(folds, i)` is the operator
on data defined by `restrict(folds, i)(X) = restrict(X, folds, i)`.

### Example
-
-    folds = ([1, 2], [3, 4, 5], [6,])
-    restrict([:x1, :x2, :x3, :x4, :x5, :x6], folds, 2) # [:x3, :x4, :x5]
+
+```julia
+folds = ([1, 2], [3, 4, 5], [6,])
+restrict([:x1, :x2, :x3, :x4, :x5, :x6], folds, 2) # [:x3, :x4, :x5]
+```

See also [`corestrict`](@ref)

@@ -322,7 +329,9 @@ all elements of `folds`. Here `folds` is a vector or tuple of integer
vectors, typically representing row indices or a vector, matrix or
table.

-    complement(([1,2], [3,], [4, 5]), 2) # [1 ,2, 4, 5]
+```julia
+complement(([1,2], [3,], [4, 5]), 2) # [1, 2, 4, 5]
+```

"""
complement(f, i) = reduce(vcat, collect(f)[Not(i)])

@@ -345,8 +354,10 @@ on data defined by `corestrict(folds, i)(X) = corestrict(X, folds, i)`.

### Example

-    folds = ([1, 2], [3, 4, 5], [6,])
-    corestrict([:x1, :x2, :x3, :x4, :x5, :x6], folds, 2) # [:x1, :x2, :x6]
+```julia
+folds = ([1, 2], [3, 4, 5], [6,])
+corestrict([:x1, :x2, :x3, :x4, :x5, :x6], folds, 2) # [:x1, :x2, :x6]
+```

"""
corestrict(f::NTuple{N}, i) where N = FoldComplementRestrictor{i,N}(f)
diff --git a/src/data/datasets.jl b/src/data/datasets.jl
index 9e84b75c..ba4d88db 100644
--- a/src/data/datasets.jl
+++ b/src/data/datasets.jl
@@ -158,7 +158,7 @@ const COERCE_SUNSPOTS = (
    (:sunspot_number=>Continuous),)

"""
-load_dataset(fpath, coercions)
+    load_dataset(fpath, coercions)

Load one of the standard datasets, like Boston, assuming the file is a comma separated file with
a header.
diff --git a/src/data/datasets_synthetic.jl b/src/data/datasets_synthetic.jl
index 58a984c4..d1a8830e 100644
--- a/src/data/datasets_synthetic.jl
+++ b/src/data/datasets_synthetic.jl
@@ -18,9 +18,12 @@ const EXTRA_CLASSIFICATION =

Internal function to finalize the `make_*` functions.
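+
+For example, `make_circles` (below) finalizes its output with the call
+
+```julia
+finalize_Xy(X, y, shuffle, as_table, eltype, rng)
+```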
""" -x = [1 2 3 ; 4 5 6] -x -length(size(collect(1:3))) # ( function finalize_Xy(X, y, shuffle, as_table, eltype, rng; clf::Bool=true) # Shuffle the rows if required if shuffle @@ -78,7 +75,7 @@ By default, a table `X` with `p` columns (features) and `n` rows ### Example -``` +```julia X, y = make_blobs(100, 3; centers=2, cluster_std=[1.0, 3.0]) ``` @@ -95,8 +92,7 @@ function make_blobs(n::Integer=100, # check arguments make sense if n < 1 || p < 1 - throw(ArgumentError( - "Expected `n` and `p` to be at least 1.")) + throw(ArgumentError("Expected `n` and `p` to be at least 1.")) end if center_box.first >= center_box.second throw(ArgumentError( @@ -181,7 +177,7 @@ $(EXTRA_KW_MAKE*EXTRA_CLASSIFICATION) ### Example -``` +```julia X, y = make_circles(100; noise=0.5, factor=0.3) ``` @@ -196,12 +192,10 @@ function make_circles(n::Integer=100; # check arguments make sense if n < 1 - throw(ArgumentError( - "Expected `n` to be at least 1.")) + throw(ArgumentError("Expected `n` to be at least 1.")) end if noise < 0 - throw(ArgumentError( - "Noise argument cannot be negative.")) + throw(ArgumentError("Noise argument cannot be negative.")) end if !(0 < factor < 1) throw(ArgumentError( @@ -224,12 +218,12 @@ function make_circles(n::Integer=100; X .+= noise .* randn(rng, n, 2) end - return finalize_Xy(X, y, shuffle, as_table, eltype, rng) + return finalize_Xy(X, y, shuffle, as_table, eltype, rng) end """ - make_moons(n::Int=100; kwargs...) + make_moons(n::Int=100; kwargs...) Generates labeled two-dimensional points lying close to two interleaved semi-circles, for use with classification and clustering @@ -257,7 +251,7 @@ membership to the left or right semi-circle. ### Example -``` +```julia X, y = make_moons(100; noise=0.5) ``` @@ -273,12 +267,10 @@ function make_moons(n::Int=150; # check arguments make sense if n < 1 - throw(ArgumentError( - "Expected `n` to be at least 1.")) + throw(ArgumentError("Expected `n` to be at least 1.")) end if noise < 0 - throw(ArgumentError( - "Noise argument cannot be negative.")) + throw(ArgumentError("Noise argument cannot be negative.")) end rng = init_rng(rng) @@ -324,8 +316,7 @@ end Make portion `s` of vector `θ` exactly 0. 
""" -sparsify!(rng, θ, s) = - (θ .*= (rand(rng, length(θ)) .< s)) +sparsify!(rng, θ, s) = (θ .*= (rand(rng, length(θ)) .< s)) """Add outliers to portion s of vector.""" outlify!(rng, y, s) = @@ -338,19 +329,18 @@ const SIGMOID_32 = log(Float32(1)/eps(Float32) - Float32(1)) sigmoid(x) Return the sigmoid computed in a numerically stable way: - ``σ(x) = 1/(1+exp(-x))`` """ function sigmoid(x::Float64) - x > SIGMOID_64 && return one(x) - x < -SIGMOID_64 && return zero(x) - return one(x) / (one(x) + exp(-x)) + x > SIGMOID_64 && return one(x) + x < -SIGMOID_64 && return zero(x) + return one(x) / (one(x) + exp(-x)) end function sigmoid(x::Float32) - x > SIGMOID_32 && return one(x) - x < -SIGMOID_32 && return zero(x) - return one(x) / (one(x) + exp(-x)) + x > SIGMOID_32 && return one(x) + x < -SIGMOID_32 && return zero(x) + return one(x) / (one(x) + exp(-x)) end sigmoid(x) = sigmoid(float(x)) @@ -392,7 +382,7 @@ $EXTRA_KW_MAKE ### Example -``` +```julia X, y = make_regression(100, 5; noise=0.5, sparse=0.2, outliers=0.1) ``` @@ -411,24 +401,19 @@ function make_regression(n::Int=100, # check arguments make sense if n < 1 || p < 1 - throw(ArgumentError( - "Expected `n` and `p` to be at least 1.")) + throw(ArgumentError("Expected `n` and `p` to be at least 1.")) end if n_targets < 1 - throw(ArgumentError( - "Expected `n_targets` to be at least 1.")) + throw(ArgumentError("Expected `n_targets` to be at least 1.")) end if !(0 <= sparse < 1) - throw(ArgumentError( - "Sparsity argument must be in [0, 1).")) + throw(ArgumentError("Sparsity argument must be in [0, 1).")) end if noise < 0 - throw(ArgumentError( - "Noise argument cannot be negative.")) + throw(ArgumentError("Noise argument cannot be negative.")) end if !(0 <= outliers <= 1) - throw(ArgumentError( - "Outliers argument must be in [0, 1].")) + throw(ArgumentError("Outliers argument must be in [0, 1].")) end rng = init_rng(rng) diff --git a/src/hyperparam/one_dimensional_range_methods.jl b/src/hyperparam/one_dimensional_range_methods.jl index 5afc670c..dc82d03a 100644 --- a/src/hyperparam/one_dimensional_range_methods.jl +++ b/src/hyperparam/one_dimensional_range_methods.jl @@ -296,9 +296,11 @@ Construct an object `s` which can be used to generate random samples from a `ParamRange` object `r` (a one-dimensional range) using one of the following calls: - rand(s) # for one sample - rand(s, n) # for n samples - rand(rng, s [, n]) # to specify an RNG +```julia +rand(s) # for one sample +rand(s, n) # for n samples +rand(rng, s [, n]) # to specify an RNG +``` The argument `probs` can be any probability vector with the same length as `r.values`. The second `sampler` method above calls the @@ -329,30 +331,32 @@ in the special case `r.scale` is a callable object `f`. 
In that case, ### Examples - r = range(Char, :letter, values=collect("abc")) - s = sampler(r, [0.1, 0.2, 0.7]) - samples = rand(s, 1000); - StatsBase.countmap(samples) - Dict{Char,Int64} with 3 entries: - 'a' => 107 - 'b' => 205 - 'c' => 688 - - r = range(Int, :k, lower=2, upper=6) # numeric but discrete - s = sampler(r, Normal) - samples = rand(s, 1000); - UnicodePlots.histogram(samples) - ┌ ┐ - [2.0, 2.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 119 - [2.5, 3.0) ┤ 0 - [3.0, 3.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 296 - [3.5, 4.0) ┤ 0 - [4.0, 4.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 275 - [4.5, 5.0) ┤ 0 - [5.0, 5.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 221 - [5.5, 6.0) ┤ 0 - [6.0, 6.5) ┤▇▇▇▇▇▇▇▇▇▇▇ 89 - └ ┘ +```julia-repl +julia> r = range(Char, :letter, values=collect("abc")) +julia> s = sampler(r, [0.1, 0.2, 0.7]) +julia> samples = rand(s, 1000); +julia> StatsBase.countmap(samples) +Dict{Char,Int64} with 3 entries: + 'a' => 107 + 'b' => 205 + 'c' => 688 + +julia> r = range(Int, :k, lower=2, upper=6) # numeric but discrete +julia> s = sampler(r, Normal) +julia> samples = rand(s, 1000); +julia> UnicodePlots.histogram(samples) + ┌ ┐ +[2.0, 2.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 119 +[2.5, 3.0) ┤ 0 +[3.0, 3.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 296 +[3.5, 4.0) ┤ 0 +[4.0, 4.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 275 +[4.5, 5.0) ┤ 0 +[5.0, 5.5) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 221 +[5.5, 6.0) ┤ 0 +[6.0, 6.5) ┤▇▇▇▇▇▇▇▇▇▇▇ 89 + └ ┘ +``` """ Distributions.sampler(r::NumericRange{T}, diff --git a/src/machines.jl b/src/machines.jl index f70cd77a..1a3f5388 100644 --- a/src/machines.jl +++ b/src/machines.jl @@ -14,7 +14,7 @@ The effect of the `scitype_check_level` option in calls of the form `machine(model, data, scitype_check_level=...)` is summarized below: `scitype_check_level` | Inspect scitypes? | If `Unknown` in scitypes | If other scitype mismatch | -|:-------------------:|:-----------------:|:------------------------:|:-------------------------:| +|:--------------------|:-----------------:|:------------------------:|:-------------------------:| 0 | × | | | 1 (value at startup) | ✓ | | warning | 2 | ✓ | warning | warning | @@ -120,7 +120,7 @@ any upstream dependencies in a learning network): ```julia replace(mach, :args => (), :data => (), :data_resampled_data => (), :cache => nothing) - +``` """ function Base.replace(mach::Machine{<:Any,<:Any,C}, field_value_pairs::Pair...) where C # determined new `model` and `args` and build replacement dictionary: @@ -206,8 +206,7 @@ const WARN_UNKNOWN_SCITYPE = "Some data contains `Unknown` scitypes, which might lead to model-data mismatches. " err_length_mismatch(model) = DimensionMismatch( - "Differing number of observations "* - "in input and target. ") + "Differing number of observations in input and target. ") function check(model::Model, scitype_check_level, args...) @@ -682,7 +681,7 @@ function fit_only!( force == true || # condition (ii) upstream_has_changed || # condition (iii) condition_iv || # condition (iv) - modeltype_changed # conditions (vi) or (vii) + modeltype_changed # conditions (vi) or (vii) isdefined(mach, :report) || (mach.report = LittleDict{Symbol,Any}()) @@ -807,12 +806,12 @@ type's field names as keys. The corresponding value is the fitted parameters for machine in the underlying learning network bound to that model. (If multiple machines share the same model, then the value is a vector.) -```julia -using MLJ -@load LogisticClassifier pkg=MLJLinearModels -X, y = @load_crabs; -pipe = Standardizer() |> LogisticClassifier() -mach = machine(pipe, X, y) |> fit! 
+```julia-repl
+julia> using MLJ
+julia> @load LogisticClassifier pkg=MLJLinearModels
+julia> X, y = @load_crabs;
+julia> pipe = Standardizer() |> LogisticClassifier();
+julia> mach = machine(pipe, X, y) |> fit!;

julia> fitted_params(mach).logistic_classifier
(classes = CategoricalArrays.CategoricalValue{String,UInt32}["B", "O"],
@@ -845,12 +844,12 @@ type's field names as keys. The corresponding value is the report for the machin
underlying learning network bound to that model. (If multiple machines share the same
model, then the value is a vector.)

-```julia
-using MLJ
-@load LinearBinaryClassifier pkg=GLM
-X, y = @load_crabs;
-pipe = Standardizer() |> LinearBinaryClassifier()
-mach = machine(pipe, X, y) |> fit!
+```julia-repl
+julia> using MLJ
+julia> @load LinearBinaryClassifier pkg=GLM
+julia> X, y = @load_crabs;
+julia> pipe = Standardizer() |> LinearBinaryClassifier();
+julia> mach = machine(pipe, X, y) |> fit!;

julia> report(mach).linear_binary_classifier
(deviance = 3.8893386087844543e-7,
@@ -957,25 +956,25 @@ A machine returned by `serializable` is characterized by the property
`mach.state == -1`.

### Example using [JLSO](https://invenia.github.io/JLSO.jl/stable/)
-
-    using MLJ
-    using JLSO
-    Tree = @load DecisionTreeClassifier
-    tree = Tree()
-    X, y = @load_iris
-    mach = fit!(machine(tree, X, y))
-
-    # This machine can now be serialized
-    smach = serializable(mach)
-    JLSO.save("machine.jlso", :machine => smach)
-
-    # Deserialize and restore learned parameters to useable form:
-    loaded_mach = JLSO.load("machine.jlso")[:machine]
-    restore!(loaded_mach)
-
-    predict(loaded_mach, X)
-    predict(mach, X)
-
+```julia
+using MLJ
+using JLSO
+Tree = @load DecisionTreeClassifier
+tree = Tree()
+X, y = @load_iris
+mach = fit!(machine(tree, X, y))
+
+# This machine can now be serialized
+smach = serializable(mach)
+JLSO.save("machine.jlso", :machine => smach)
+
+# Deserialize and restore learned parameters to usable form:
+loaded_mach = JLSO.load("machine.jlso")[:machine]
+restore!(loaded_mach)
+
+predict(loaded_mach, X)
+predict(mach, X)
+```

See also [`restore!`](@ref), [`MLJBase.save`](@ref).
"""
@@ -1051,21 +1050,23 @@ the example below.

### Example

-    using MLJ
-    Tree = @load DecisionTreeClassifier
-    X, y = @load_iris
-    mach = fit!(machine(Tree(), X, y))
-
-    MLJ.save("tree.jls", mach)
-    mach_predict_only = machine("tree.jls")
-    predict(mach_predict_only, X)
-
-    # using a buffer:
-    io = IOBuffer()
-    MLJ.save(io, mach)
-    seekstart(io)
-    predict_only_mach = machine(io)
-    predict(predict_only_mach, X)
+```julia
+using MLJ
+Tree = @load DecisionTreeClassifier
+X, y = @load_iris
+mach = fit!(machine(Tree(), X, y))
+
+MLJ.save("tree.jls", mach)
+mach_predict_only = machine("tree.jls")
+predict(mach_predict_only, X)
+
+# using a buffer:
+io = IOBuffer()
+MLJ.save(io, mach)
+seekstart(io)
+predict_only_mach = machine(io)
+predict(predict_only_mach, X)
+```

!!! warning "Only load files from trusted sources"
    Maliciously constructed JLS files, like pickles, and most other
@@ -1078,8 +1079,7 @@ the example below.

See also [`serializable`](@ref), [`machine`](@ref).
"""
-function save(file::Union{String,IO},
-              mach::Machine)
+function save(file::Union{String,IO}, mach::Machine)
    isdefined(mach, :fitresult) ||
        error("Cannot save an untrained machine. 
") diff --git a/src/resampling.jl b/src/resampling.jl index 8fc2c948..250e3ca0 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -15,8 +15,7 @@ const PREDICT_OPERATIONS_STRING = begin end const PROG_METER_DT = 0.1 const ERR_WEIGHTS_LENGTH = - DimensionMismatch("`weights` and target "* - "have different lengths. ") + DimensionMismatch("`weights` and target have different lengths. ") const ERR_WEIGHTS_DICT = ArgumentError("`class_weights` must be a "* "dictionary with `Real` values. ") @@ -158,14 +157,14 @@ train_test_pairs(::InSample, rows) = [(rows, rows),] # Holdout """ - holdout = Holdout(; fraction_train=0.7, - shuffle=nothing, - rng=nothing) + holdout = Holdout(; fraction_train=0.7, shuffle=nothing, rng=nothing) Instantiate a `Holdout` resampling strategy, for use in `evaluate!`, `evaluate` and in tuning. - train_test_pairs(holdout, rows) +```julia +train_test_pairs(holdout, rows) +``` Returns the pair `[(train, test)]`, where `train` and `test` are vectors such that `rows=vcat(train, test)` and @@ -200,7 +199,7 @@ Holdout(; fraction_train::Float64=0.7, shuffle=nothing, rng=nothing) = function train_test_pairs(holdout::Holdout, rows) train, test = partition(rows, holdout.fraction_train, - shuffle=holdout.shuffle, rng=holdout.rng) + shuffle=holdout.shuffle, rng=holdout.rng) return [(train, test),] end @@ -214,7 +213,9 @@ end Cross-validation resampling strategy, for use in `evaluate!`, `evaluate` and tuning. - train_test_pairs(cv, rows) +```julia +train_test_pairs(cv, rows) +``` Returns an `nfolds`-length iterator of `(train, test)` pairs of vectors (row indices), where each `train` and `test` is a sub-vector @@ -297,7 +298,9 @@ Cross-validation resampling strategy, for use in `evaluate!`, `evaluate` and tuning, when observations are chronological and not expected to be independent. - train_test_pairs(tscv, rows) +```julia +train_test_pairs(tscv, rows) +``` Returns an `nfolds`-length iterator of `(train, test)` pairs of vectors (row indices), where each `train` and `test` is a sub-vector @@ -392,7 +395,9 @@ Stratified cross-validation resampling strategy, for use in `evaluate!`, `evaluate` and in tuning. Applies only to classification problems (`OrderedFactor` or `Multiclass` targets). - train_test_pairs(stratified_cv, rows, y) +```julia +train_test_pairs(stratified_cv, rows, y) +``` Returns an `nfolds`-length iterator of `(train, test)` pairs of vectors (row indices) where each `train` and `test` is a sub-vector of `rows`. The `test` vectors are mutually exclusive and exhaust @@ -1009,8 +1014,10 @@ Available resampling strategies are $RESAMPLING_STRATEGIES_LIST. If `resampling` instance of one of these, then a vector of tuples of the form `(train_rows, test_rows)` is expected. For example, setting - resampling = [((1:100), (101:200)), - ((101:200), (1:100))] +```julia +resampling = [((1:100), (101:200)), + ((101:200), (1:100))] +``` gives two-fold cross-validation using the first 200 rows of data. @@ -1161,9 +1168,8 @@ end """ evaluate(model, data...; cache=true, options...) -Equivalent to `evaluate!(machine(model, data..., cache=cache); -options...)`. See the machine version `evaluate!` for the complete -list of options. +Equivalent to `evaluate!(machine(model, data..., cache=cache); options...)`. +See the machine version `evaluate!` for the complete list of options. Returns a [`PerformanceEvaluation`](@ref) object. diff --git a/src/show.jl b/src/show.jl index a1ebb945..9a9616af 100644 --- a/src/show.jl +++ b/src/show.jl @@ -27,7 +27,9 @@ Private method (used in testing). 
Equivalent to `const x = value` but registers the binding thus:

-    MLJBase.HANDLE_GIVEN_ID[objectid(value)] = :x
+```julia
+MLJBase.HANDLE_GIVEN_ID[objectid(value)] = :x
+```

Registered objects get displayed using the variable name to which it
was bound in calls to `show(x)`, etc.
@@ -320,19 +322,21 @@ _show(stream::IO, ::Nothing) = println(stream, "nothing")

"""
    _recursive_show(stream, object, current_depth, depth)

+**Private method.**
+
Generate a table of the properties of the `MLJType` object, displaying
each property value by calling the method `_show` on it. The behaviour
of `_show(stream, f)` is as follows:

1. If `f` is itself a `MLJType` object, then its short form is shown
-and `_recursive_show` generates as separate table for each of its
-properties (and so on, up to a depth of argument `depth`).
+   and `_recursive_show` generates a separate table for each of its
+   properties (and so on, up to a depth of argument `depth`).

2. Otherwise `f` is displayed as "(omitted T)" where `T = typeof(f)`,
-unless `istoobig(f)` is false (the `istoobig` fall-back for arbitrary
-types being `true`). In the latter case, the long (ie,
-MIME"plain/text") form of `f` is shown. To override this behaviour,
-overload the `_show` method for the type in question.
+   unless `istoobig(f)` is false (the `istoobig` fall-back for arbitrary
+   types being `true`). In the latter case, the long (ie,
+   MIME"text/plain") form of `f` is shown. To override this behaviour,
+   overload the `_show` method for the type in question.

"""
function _recursive_show(stream::IO, object::MLJType, current_depth, depth)
diff --git a/src/sources.jl b/src/sources.jl
index d2fd7524..083e269e 100644
--- a/src/sources.jl
+++ b/src/sources.jl
@@ -41,9 +41,11 @@ expected.

The calling behaviour of a `Source` object is this:

-    Xs() = X
-    Xs(rows=r) = selectrows(X, r) # eg, X[r,:] for a DataFrame
-    Xs(Xnew) = Xnew
+```julia
+Xs() = X
+Xs(rows=r) = selectrows(X, r) # eg, X[r,:] for a DataFrame
+Xs(Xnew) = Xnew
+```

See also: [`MLJBase.prefit`](@ref), [`sources`](@ref),
[`origins`](@ref), [`node`](@ref).
diff --git a/src/utilities.jl b/src/utilities.jl
index 969fce4c..3dcf31a6 100644
--- a/src/utilities.jl
+++ b/src/utilities.jl
@@ -14,7 +14,7 @@ View a nested named tuple `t` as a tree and return, as a tuple, the values
at the leaves, in the order they appear in the original tuple.

```julia-repl
-julia> t = (X = (x = 1, y = 2), Y = 3)
+julia> t = (X = (x = 1, y = 2), Y = 3);

julia> flat_values(t)
(1, 2, 3)
```
@@ -51,6 +51,7 @@ end

For prepending symbols in expressions like `:(y.w)` and `:(x1.x2.x3)`.

+```julia-repl
julia> prepend(:x, :y)
:(x.y)

@@ -59,6 +60,7 @@ julia> prepend(:x, :(y.z))

julia> prepend(:w, ans)
:(w.x.y.z)
+```

If the second argument is `nothing`, then `nothing` is returned.

@@ -74,10 +76,11 @@ prepend(s::Symbol, ex::Expr) = Expr(:(.), prepend(s, ex.args[1]), ex.args[2])

Call getproperty recursively on `object` to extract the value of some
nested property, as in the following example:

-    julia> object = (X = (x = 1, y = 2), Y = 3)
-    julia> recursive_getproperty(object, :(X.y))
-    2
-
+```julia-repl
+julia> object = (X = (x = 1, y = 2), Y = 3);
+julia> recursive_getproperty(object, :(X.y))
+2
+```
"""
recursive_getproperty(obj, property::Symbol) = getproperty(obj, property)
function recursive_getproperty(obj, ex::Expr)
@@ -105,7 +108,7 @@ end

Set a nested property of an `object` to `value`, as in the following example:

-```
+```julia-repl
julia> mutable struct Foo
           X
           Y
@@ -150,7 +153,7 @@ have the same number of rows. 
end

"""
-_permute_rows(obj, perm)
+    _permute_rows(obj, perm)

Internal function to return a vector or matrix with permuted rows given
the permutation `perm`.
@@ -182,7 +185,7 @@ function shuffle_rows(
end

"""
-init_rng(rng)
+    init_rng(rng)

Create an `AbstractRNG` from `rng`. If `rng` is a non-negative `Integer`, it returns a
`MersenneTwister` random number generator seeded with `rng`; If `rng` is
@@ -249,8 +252,10 @@ end

Return a "sequence" string from the first `n` elements generated by `itr`.

-    julia> MLJBase.sequence_string(1:10, 4)
-    "1, 2, 3, 4, ..."
+```julia-repl
+julia> MLJBase.sequence_string(1:10, 4)
+"1, 2, 3, 4, ..."
+```

**Private method.**

@@ -293,7 +298,7 @@ column cycle fastest, those in the last column slowest.

### Example

-```julia
+```julia-repl
julia> iterators = ([1, 2], ["a","b"], ["x", "y", "z"]);
julia> MLJTuning.unwind(iterators...)
12×3 Array{Any,2}:
@@ -340,15 +345,15 @@ end

Split an `AbstractRange` into `n` subranges of approximately equal length.

### Example
-```julia
+```julia-repl
julia> collect(chunks(1:5, 2))
2-element Array{UnitRange{Int64},1}:
 1:3
 4:5
+```

**Private method**

-```
"""
function chunks(c::AbstractRange, n::Integer)
    n < 1 && throw(ArgumentError("cannot split range into $n subranges"))
@@ -410,8 +415,8 @@ If `only` is specified, then the operation is restricted to those `M` for which
`M isa only`. In all other cases the symbolic name is generated using
`substitute` as the base symbol.

-```
-existing_names = []
+```julia-repl
+julia> existing_names = [];

julia> generate_name!(Vector{Int}, existing_names)
:vector

@@ -470,14 +475,14 @@ generate_name!(model, existing_names; kwargs...) =

*Private method.*

-Tries to infer the per-observation scitype from the scitype of `S`, when `S` is known to
-be the scitype of some container with multiple observations; here we view the scitype for
-one row of a table to be the scitype of the row converted to a vector. Return `Unknown` if
-unable to draw reliable inferrence.
+Tries to infer the per-observation scitype from the scitype of `S`, when `S` is
+known to be the scitype of some container with multiple observations; here we
+view the scitype for one row of a table to be the scitype of the row converted
+to a vector. Return `Unknown` if unable to draw reliable inference.

-The observation scitype for a table is here understood as the scitype of a row converted
-to a vector.
+The observation scitype for a table is here understood as the scitype of a row
+converted to a vector.

"""
observation(::Type) = Unknown
@@ -501,13 +506,13 @@ end

*Private method.*

-If `y` is an `AbstractArray`, return the scitype of `y[:, :, ..., :, 1]`. If `y` is a
-table, return the scitype of the first row, converted to a vector, unless this row has
-`missing` elements, in which case return `Unknown`.
+If `y` is an `AbstractArray`, return the scitype of `y[:, :, ..., :, 1]`. If `y`
+is a table, return the scitype of the first row, converted to a vector, unless
+this row has `missing` elements, in which case return `Unknown`.

In all other cases, `Unknown`.

-```
+```julia-repl
julia> guess_observation_scitype([missing, 1, 2, 3])
Union{Missing, Count}

@@ -536,12 +541,12 @@ end

*Private method*

-Try to infer a lowest upper bound on the scitype of target observations acceptable to
-`model`, by inspecting `target_scitype(model)`. Return `Unknown` if unable to draw reliable
-inferrence.
+Try to infer a lowest upper bound on the scitype of target observations
+acceptable to `model`, by inspecting `target_scitype(model)`. 
Return `Unknown`
+if unable to draw reliable inference.

-The observation scitype for a table is here understood as the scitype of a row converted
-to a vector.
+The observation scitype for a table is here understood as the scitype of a row
+converted to a vector.

"""
guess_model_target_observation_scitype(model) = observation(target_scitype(model))