diff --git a/.github/workflows/Documenter.yml b/.github/workflows/Documenter.yml index 8bbf8bc7..218dc550 100644 --- a/.github/workflows/Documenter.yml +++ b/.github/workflows/Documenter.yml @@ -26,6 +26,11 @@ jobs: version: '1' - run: julia --project -e 'using Pkg; Pkg.develop([PackageSpec(path=joinpath(pwd(), "SnoopCompileCore"))])' - uses: julia-actions/julia-buildpkg@latest + # To access the developer tools from within a package's environment, they should be in the default environment + - run: julia -e 'using Pkg; Pkg.develop([PackageSpec(path=joinpath(pwd(), "SnoopCompileCore")), PackageSpec(path=joinpath(pwd()))]); Pkg.instantiate()' + # Additional packages we'll need + - run: julia -e 'using Pkg; Pkg.add(["AbstractTrees", "Cthulhu"])' # pyplot would be nice but it often errors + # Documenter wants them to be in the local environment - run: julia --project=docs/ -e 'using Pkg; Pkg.develop([PackageSpec(path=joinpath(pwd(), "SnoopCompileCore")), PackageSpec(path=joinpath(pwd()))]); Pkg.instantiate()' - uses: julia-actions/julia-docdeploy@releases/v1 env: diff --git a/SnoopCompileCore/src/snoop_inference.jl b/SnoopCompileCore/src/snoop_inference.jl index 76c9f0e8..08398f4c 100644 --- a/SnoopCompileCore/src/snoop_inference.jl +++ b/SnoopCompileCore/src/snoop_inference.jl @@ -98,36 +98,40 @@ function _snoop_inference(cmd::Expr) end """ - tinf = @snoop_inference commands + tinf = @snoop_inference commands; -Produce a profile of julia's type inference, recording the amount of time spent inferring -every `MethodInstance` processed while executing `commands`. Each fresh entrance to -type inference (whether executed directly in `commands` or because a call was made -by runtime-dispatch) also collects a backtrace so the caller can be identified. +Produce a profile of julia's type inference, recording the amount of time spent +inferring every `MethodInstance` processed while executing `commands`. Each +fresh entrance to type inference (whether executed directly in `commands` or +because a call was made by runtime-dispatch) also collects a backtrace so the +caller can be identified. -`tinf` is a tree, each node containing data on a particular inference "frame" (the method, -argument-type specializations, parameters, and even any constant-propagated values). -Each reports the [`exclusive`](@ref)/[`inclusive`](@ref) times, where the exclusive -time corresponds to the time spent inferring this frame in and of itself, whereas -the inclusive time includes the time needed to infer all the callees of this frame. +`tinf` is a tree, each node containing data on a particular inference "frame" +(the method, argument-type specializations, parameters, and even any +constant-propagated values). Each reports the +[`exclusive`](@ref)/[`inclusive`](@ref) times, where the exclusive time +corresponds to the time spent inferring this frame in and of itself, whereas the +inclusive time includes the time needed to infer all the callees of this frame. The top-level node in this profile tree is `ROOT`. Uniquely, its exclusive time -corresponds to the time spent _not_ in julia's type inference (codegen, llvm_opt, runtime, etc). +corresponds to the time spent _not_ in julia's type inference (codegen, +llvm_opt, runtime, etc). -There are many different ways of inspecting and using the data stored in `tinf`. -The simplest is to load the `AbstracTrees` package and display the tree with -`AbstractTrees.print_tree(tinf)`. -See also: `flamegraph`, `flatten`, `inference_triggers`, `SnoopCompile.parcel`, -`runtime_inferencetime`. 
+Working with `tinf` effectively requires loading `SnoopCompile`. + +!!! warning + Note the semicolon `;` at the end of the `@snoop_inference` macro call. + Because `SnoopCompileCore` is not permitted to invalidate any code, it cannot define + the `Base.show` methods that pretty-print `tinf`. Defer inspection of `tinf` + until `SnoopCompile` has been loaded. # Example -```jldoctest; setup=:(using SnoopCompile), filter=r"([0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?/[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?|\\d direct)" + +```jldoctest; setup=:(using SnoopCompileCore), filter=r"([0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?/[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?|\\d direct)" julia> tinf = @snoop_inference begin sort(rand(100)) # Evaluate some code and profile julia's type inference - end -InferenceTimingNode: 0.110018224/0.131464476 on Core.Compiler.Timings.ROOT() with 2 direct children + end; ``` - """ macro snoop_inference(cmd) return _snoop_inference(cmd) diff --git a/SnoopCompileCore/src/snoop_invalidations.jl b/SnoopCompileCore/src/snoop_invalidations.jl index 5119e678..25fe5899 100644 --- a/SnoopCompileCore/src/snoop_invalidations.jl +++ b/SnoopCompileCore/src/snoop_invalidations.jl @@ -1,18 +1,18 @@ export @snoop_invalidations """ - list = @snoop_invalidations expr + invs = @snoop_invalidations expr Capture method cache invalidations triggered by evaluating `expr`. -`list` is a sequence of invalidated `Core.MethodInstance`s together with "explanations," consisting +`invs` is a sequence of invalidated `Core.MethodInstance`s together with "explanations," consisting of integers (encoding depth) and strings (documenting the source of an invalidation). -Unless you are working at a low level, you essentially always want to pass `list` +Unless you are working at a low level, you essentially always want to pass `invs` directly to [`SnoopCompile.invalidation_trees`](@ref). # Extended help -`list` is in a format where the "reason" comes after the items. +`invs` is in a format where the "reason" comes after the items. Method deletion results in the sequence [zero or more (mi, "invalidate_mt_cache") pairs..., zero or more (depth1 tree, loctag) pairs..., method, loctag] with loctag = "jl_method_table_disable" @@ -22,14 +22,16 @@ where `mi` means a `MethodInstance`. `depth1` means a sequence starting at `dept Method insertion results in the sequence [zero or more (depth0 tree, sig) pairs..., same info as with delete_method except loctag = "jl_method_table_insert"] + +The authoritative reference is Julia's own `src/gf.c` file. """ macro snoop_invalidations(expr) quote - local list = ccall(:jl_debug_method_invalidation, Any, (Cint,), 1) + local invs = ccall(:jl_debug_method_invalidation, Any, (Cint,), 1) Expr(:tryfinally, $(esc(expr)), ccall(:jl_debug_method_invalidation, Any, (Cint,), 0) ) - list + invs end end diff --git a/SnoopCompileCore/src/snoop_llvm.jl b/SnoopCompileCore/src/snoop_llvm.jl index 8e2bc0fa..602cf3de 100644 --- a/SnoopCompileCore/src/snoop_llvm.jl +++ b/SnoopCompileCore/src/snoop_llvm.jl @@ -3,11 +3,10 @@ export @snoop_llvm using Serialization """ -``` -@snoop_llvm "func_names.csv" "llvm_timings.yaml" begin - # Commands to execute, in a new process -end -``` + @snoop_llvm "func_names.csv" "llvm_timings.yaml" begin + # Commands to execute, in a new process + end + causes the julia compiler to log timing information for LLVM optimization during the provided commands to the files "func_names.csv" and "llvm_timings.yaml". 
These files can be used for the input to `SnoopCompile.read_snoop_llvm("func_names.csv", "llvm_timings.yaml")`. diff --git a/docs/Project.toml b/docs/Project.toml index c3c736ff..f82615e3 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,15 +1,19 @@ [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" MethodAnalysis = "85b6ec6f-f7df-4429-9514-a64bcd9ee824" PyPlot = "d330b81b-6aea-500a-939a-2ce795aea3ee" SnoopCompile = "aa65fe97-06da-5843-b5b1-d5d13cad87d2" +SnoopCompileCore = "e2b509da-e806-4183-be48-004708413034" [compat] AbstractTrees = "0.4" +Cthulhu = "2" Documenter = "1" JET = "0.9" MethodAnalysis = "0.4" PyPlot = "2" SnoopCompile = "3" +SnoopCompileCore = "3" diff --git a/docs/make.jl b/docs/make.jl index 1b2b338b..7fc7f401 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,19 +1,23 @@ using Documenter +using SnoopCompileCore using SnoopCompile import PyPlot # so that the visualizations.jl file is loaded makedocs( sitename = "SnoopCompile", format = Documenter.HTML( - prettyurls = get(ENV, "CI", nothing) == "true" + prettyurls = true, ), - modules = [SnoopCompile.SnoopCompileCore, SnoopCompile], - linkcheck = true, + modules = [SnoopCompileCore, SnoopCompile], + linkcheck = true, # the link check is slow, set to false if you're building frequently # doctest = :fix, + warnonly=true, # delete when https://github.com/JuliaDocs/Documenter.jl/issues/2541 is fixed pages = ["index.md", - "tutorial.md", - "Modern tools" => ["snoop_invalidations.md", "snoop_inference.md", "pgdsgui.md", "snoop_inference_analysis.md", "snoop_inference_parcel.md", "jet.md"], - "reference.md"], + "Basic tutorials" => ["tutorials/invalidations.md", "tutorials/snoop_inference.md", "tutorials/snoop_llvm.md", "tutorials/pgdsgui.md", "tutorials/jet.md"], + "Advanced tutorials" => ["tutorials/snoop_inference_analysis.md", "tutorials/snoop_inference_parcel.md"], + "Explanations" => ["explanations/tools.md", "explanations/gotchas.md", "explanations/fixing_inference.md"], + "reference.md", + ] ) deploydocs( diff --git a/docs/src/assets/ascend_optimizeme1.png b/docs/src/assets/ascend_optimizeme1.png new file mode 100644 index 00000000..aa595dd9 Binary files /dev/null and b/docs/src/assets/ascend_optimizeme1.png differ diff --git a/docs/src/explanations/basic.md b/docs/src/explanations/basic.md new file mode 100644 index 00000000..04f205e7 --- /dev/null +++ b/docs/src/explanations/basic.md @@ -0,0 +1,40 @@ +# Understanding SnoopCompile and Julia's compilation pipeline + +Julia uses +[Just-in-time (JIT) compilation](https://en.wikipedia.org/wiki/Just-in-time_compilation) to +generate the code that runs on your CPU. +Broadly speaking, there are two major compilation steps: *inference* and *code generation*. +Inference is the process of determining the type of each object, which in turn +determines which specific methods get called; once type inference is complete, +code generation performs optimizations and ultimately generates the assembly +language (native code) used on CPUs. +Some aspects of this process are documented [here](https://docs.julialang.org/en/v1/devdocs/eval/). + +Using code that has never been compiled requires that it first be JIT-compiled, and this contributes to the latency of using the package. +In some circumstances, you can cache (store) the results of compilation to files to +reduce the latency when your package is used. 
These files are the `*.ji` and
+`*.so` files that live in the `compiled` directory of your Julia depot, usually
+located at `~/.julia/compiled`. However, if these files become large, loading
+them can be another source of latency. Julia needs time both to load and
+validate the cached compiled code. Minimizing the latency of using a package
+involves focusing on caching the compilation of code that is both commonly used
+and takes time to compile.
+
+Caching code for later use is called *precompilation*. Julia has had some forms of precompilation almost since the very first packages. However, it was [Julia
+1.9](https://julialang.org/blog/2023/04/julia-1.9-highlights/#caching_of_native_code) that first supported "complete" precompilation, including the ability to store native code in shared-library cache files.
+
+SnoopCompile is designed to help you analyze the costs of JIT-compilation, identify
+key bottlenecks that contribute to latency, and set up `precompile` directives to see whether
+they produce measurable benefits.
+
+## Package precompilation
+
+When a package is precompiled, here's what happens under the hood:
+
+- Julia loads all of the package's dependencies (the ones in the `[deps]` section of the `Project.toml` file), typically from precompile cache files
+- Julia evaluates the source code (text files) that define the package module(s). Evaluating `function foo(args...) ... end` creates a new method of the function `foo`. Note that:
+  + the source code might also contain statements that create "data" (e.g., `const`s). In some cases this can lead to some subtle precompilation ["gotchas"](@ref running-during-pc)
+  + the source code might also contain a precompile workload, which forces compilation and tracking of package methods.
+- Julia iterates over the module contents and writes the *result* to disk. Note that the module contents might include compiled code, and if so it is written along with everything else to the cache file.
+
+When Julia loads your package, it just loads the "snapshot" stored in the cache file: it does not re-evaluate the source-text files that defined your package! It is appropriate to think of the source files of your package as "build scripts" that create your module; once the "build scripts" are executed, it's the module itself that gets cached, and the job of the build scripts is done.
diff --git a/docs/src/explanations/fixing_inference.md b/docs/src/explanations/fixing_inference.md
new file mode 100644
index 00000000..de12db66
--- /dev/null
+++ b/docs/src/explanations/fixing_inference.md
@@ -0,0 +1,165 @@
+# Techniques for fixing inference problems
+
+Here we assume you've dug into your code with a tool like Cthulhu, and want to know how to fix some of the problems that you discover. Below is a collection of specific cases and some tricks for handling them.
+
+Note that there is also a [tutorial on fixing inference](@ref inferrability) that delves into advanced topics.
+
+## Adding type annotations
+
+### Using concrete types
+
+Defining variables like `list = []` can be convenient, but it creates a `list` of type `Vector{Any}`. This prevents inference from knowing the type of items extracted from `list`. Using `list = String[]` for a container of strings, etc., is an excellent fix.
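+For example, here is a minimal sketch of the difference (the variable and the values pushed are purely illustrative):
+
+```julia
+julia> list = []; push!(list, "hello"); eltype(list)   # element type unknown to inference
+Any
+
+julia> list = String[]; push!(list, "hello"); eltype(list)   # element type is concrete
+String
+```
+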
+When in doubt, check the type with `isconcretetype`: a common mistake is to think that `list_of_lists = Array{Int}[]` gives you a vector-of-vectors, but
+
+```jldoctest
+julia> isconcretetype(Array{Int})
+false
+```
+
+reminds you that `Array` requires a second parameter indicating the dimensionality of the array. (Or use `list_of_lists = Vector{Int}[]` instead, as `Vector{Int} === Array{Int, 1}`.)
+
+Many valuable tips can be found among [Julia's performance tips](https://docs.julialang.org/en/v1/manual/performance-tips/), and readers are encouraged to consult that page.
+
+### Working with non-concrete types
+
+In cases where invalidations occur, but you can't use concrete types (there are indeed many valid uses of `Vector{Any}`),
+you can often prevent the invalidation using some additional knowledge.
+One common example is extracting information from an [`IOContext`](https://docs.julialang.org/en/v1/manual/networking-and-streams/#IO-Output-Contextual-Properties-1) structure, which is roughly defined as
+
+```julia
+struct IOContext{IO_t <: IO} <: AbstractPipe
+    io::IO_t
+    dict::ImmutableDict{Symbol, Any}
+end
+```
+
+There are good reasons that `dict` uses a value-type of `Any`, but that makes it impossible for the compiler to infer the type of any object looked up in an `IOContext`.
+Fortunately, you can help!
+For example, the documentation specifies that the `:color` setting should be a `Bool`, and since it appears in documentation it's something we can safely enforce.
+Changing
+
+```
+iscolor = get(io, :color, false)
+```
+
+to
+
+```
+iscolor = get(io, :color, false)::Bool   # assert that the rhs is Bool-valued
+```
+
+will throw an error if it isn't a `Bool`, and this allows the compiler to take advantage of the type being known in subsequent operations.
+
+If the return type is one of a small number of possibilities (generally three or fewer), you can annotate the return type with `Union{...}`. This is generally advantageous only when the intersection of what inference already knows about the types of a variable and the types in the `Union` results in a concrete type.
+
+As a more detailed example, suppose you're writing code that parses Julia's `Expr` type:
+
+```julia
+julia> ex = :(Array{Float32,3})
+:(Array{Float32, 3})
+
+julia> dump(ex)
+Expr
+  head: Symbol curly
+  args: Array{Any}((3,))
+    1: Symbol Array
+    2: Symbol Float32
+    3: Int64 3
+```
+
+`ex.args` is a `Vector{Any}`.
+However, for a `:curly` expression only certain types will be found among the arguments; you could write key portions of your code as
+
+```julia
+a = ex.args[2]
+if a isa Symbol
+    # inside this block, Julia knows `a` is a Symbol, and so methods called on `a` will be resistant to invalidation
+    foo(a)
+elseif a isa Expr && length((a::Expr).args) > 2
+    a::Expr   # sometimes you have to help inference by adding a type-assert
+    x = bar(a)   # `bar` is now resistant to invalidation
+elseif a isa Integer
+    # even though you've not made this fully-inferrable, you've at least reduced the scope for invalidations
+    # by limiting the subset of `foobar` methods that might be called
+    y = foobar(a)
+end
+```
+
+Other tricks include replacing broadcasting on `v::Vector{Any}` with `Base.mapany(f, v)`--`mapany` avoids trying to narrow the type of `f(v[i])` and just assumes it will be `Any`, thereby avoiding invalidations of many `convert` methods (see the sketch below).
+
+Adding type-assertions and fixing inference problems are the most common approaches for fixing invalidations.
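+
+To make the `mapany` trick concrete, here is a short sketch; note that `Base.mapany` is internal and unexported, so confirm it exists on your Julia version before relying on it:
+
+```julia
+julia> v = Any[1, 2.0, "three"];
+
+julia> map(string, v)            # `map` narrows the element type from the results
+3-element Vector{String}:
+ "1"
+ "2.0"
+ "three"
+
+julia> Base.mapany(string, v)    # `mapany` skips the narrowing and returns a Vector{Any}
+3-element Vector{Any}:
+ "1"
+ "2.0"
+ "three"
+```
+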
+You can discover these manually, but using Cthulhu is highly recommended. + +## Inferrable field access for abstract types + +When invalidations happen for methods that manipulate fields of abstract types, often there is a simple solution: create an "interface" for the abstract type specifying that certain fields must have certain types. +Here's an example: + +``` +abstract type AbstractDisplay end + +struct Monitor <: AbstractDisplay + height::Int + width::Int + maker::String +end + +struct Phone <: AbstractDisplay + height::Int + width::Int + maker::Symbol +end + +function Base.show(@nospecialize(d::AbstractDisplay), x) + str = string(x) + w = d.width + if length(str) > w # do we have to truncate to fit the display width? + ... +``` + +In this `show` method, we've deliberately chosen to prevent specialization on the specific type of `AbstractDisplay` (to reduce the total number of times we have to compile this method). +As a consequence, Julia's inference may not realize that `d.width` returns an `Int`. + +Fortunately, you can help by defining an interface for generic `AbstractDisplay` objects: + +``` +function Base.getproperty(d::AbstractDisplay, name::Symbol) + if name === :height + return getfield(d, :height)::Int + elseif name === :width + return getfield(d, :width)::Int + elseif name === :maker + return getfield(d, :maker)::Union{String,Symbol} + end + return getfield(d, name) +end +``` + +Julia's [constant propagation](https://en.wikipedia.org/wiki/Constant_folding) will ensure that most accesses of those fields will be determined at compile-time, so this simple change robustly fixes many inference problems. + +## Fixing `Core.Box` + +[Julia issue 15276](https://github.com/JuliaLang/julia/issues/15276) is one of the more surprising forms of inference failure; it is the most common cause of a `Core.Box` annotation. +If other variables depend on the `Box`ed variable, then a single `Core.Box` can lead to widespread inference problems. +For this reason, these are also among the first inference problems you should tackle. + +Read [this explanation of why this happens and what you can do to fix it](https://docs.julialang.org/en/v1/manual/performance-tips/#man-performance-captured). +If you are directed to find `Core.Box` inference triggers via [`suggest`](@ref), you may need to explore around the call site a bit-- +the inference trigger may be in the closure itself, but the fix needs to go in the method that creates the closure. + +Use of `ascend` is highly recommended for fixing `Core.Box` inference failures. + +## Handling edge cases + +You can sometimes get invalidations from failing to handle "formal" possibilities. +For example, operations with regular expressions might return a `Union{Nothing, RegexMatch}`. +You can sometimes get poor type inference by writing code that fails to take account of the possibility that `nothing` might be returned. +For example, a comprehension + +```julia +ms = [m.match for m in match.((rex,), my_strings)] +``` +might be replaced with +```julia +ms = [m.match for m in match.((rex,), my_strings) if m !== nothing] +``` +and return a better-typed result. 
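+
+Here is a runnable sketch of that pattern (the regex and strings are hypothetical stand-ins):
+
+```julia
+julia> rex = r"\d+";
+
+julia> my_strings = ["width=80", "no digits here"];
+
+julia> ms = [m.match for m in match.((rex,), my_strings) if m !== nothing]
+1-element Vector{SubString{String}}:
+ "80"
+```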
diff --git a/docs/src/explanations/gotchas.md b/docs/src/explanations/gotchas.md
new file mode 100644
index 00000000..d49a0b02
--- /dev/null
+++ b/docs/src/explanations/gotchas.md
@@ -0,0 +1,26 @@
+# Precompilation "gotcha"s
+
+## [Running code during module definition](@id running-during-pc)
+
+Suppose you're working on an astronomy package and your source code has a line
+
+```
+const planets = map(makeplanet, ["Mercury", ...])
+```
+
+Julia will dutifully create `planets` and store it in the package's precompile cache file. This also runs `makeplanet`, and if this is the first time it gets run, it will compile `makeplanet`. Assuming that `makeplanet` is a method defined in the package, the compiled code for `makeplanet` will be stored in the cache file.
+
+However, two circumstances can lead to puzzling omissions from the cache files:
+- if `makeplanet` is a method defined in a dependency of your package, it will *not* be cached in your package. You'd want to add precompilation of `makeplanet` to the package that creates that method.
+- if `makeplanet` is poorly-inferred and uses runtime dispatch, any such callees that are not owned by your package will not be cached. For example, suppose `makeplanet` ends up calling methods in Base Julia or its standard libraries that are not precompiled into Julia itself: the compiled code for those methods will not be added to the cache file.
+
+One option to ensure this dependent code gets cached is to create `planets` inside `PrecompileTools.@compile_workload`:
+
+```
+@compile_workload begin
+    global planets
+    planets = map(makeplanet, ["Mercury", ...])
+end
+```
+
+Note that your package definition can have multiple `@compile_workload` blocks.
diff --git a/docs/src/explanations/tools.md b/docs/src/explanations/tools.md
new file mode 100644
index 00000000..cdb1dcaa
--- /dev/null
+++ b/docs/src/explanations/tools.md
@@ -0,0 +1,29 @@
+# Package roles and alternatives
+
+## SnoopCompileCore
+
+SnoopCompileCore is a tiny package with no dependencies; it's used for collecting data, and it has been designed in such a way that it cannot cause any invalidations of its own. Collecting data on invalidations and inference with SnoopCompileCore is the only way you can be sure you are observing the "native state" of your code.
+
+## SnoopCompile
+
+SnoopCompile is a much larger package that performs analysis on the data collected by SnoopCompileCore; loading SnoopCompile can (and does) trigger invalidations.
+Consequently, you're urged to always collect data with just SnoopCompileCore loaded,
+and wait to load SnoopCompile until after you've finished collecting the data.
+
+## Cthulhu
+
+[Cthulhu](https://github.com/JuliaDebug/Cthulhu.jl) is a companion package that gives deep insights into the origin of invalidations or inference failures.
+
+## AbstractTrees
+
+[AbstractTrees](https://github.com/JuliaCollections/AbstractTrees.jl) is the one package in this list that can be both a "workhorse" and a developer tool. SnoopCompile uses it mostly for pretty-printing.
+
+## JET
+
+[JET](https://github.com/aviatesk/JET.jl) is a powerful developer tool that in some ways is an alternative to SnoopCompile. While the two have different goals, the packages have some overlap in what they can tell you about your code. However, their mechanisms of action are fundamentally different:
+
+- JET is a "static analyzer," which means that it analyzes the code itself.
JET can tell you about inference failures (runtime dispatch) much like SnoopCompile, with a major advantage: SnoopCompileCore omits information about any callees that are already compiled, but JET's `@report_opt` provides *exhaustive* information about the entire *inferrable* callgraph (i.e., the part of the callgraph that inference can predict from the initial call) regardless of whether it has been previously compiled. With JET, you don't have to remember to run each analysis in a fresh session.
+
+- SnoopCompileCore collects data by watching normal inference at work. On code that hasn't been compiled previously, this can yield results similar to JET's, with a different major advantage: JET can't "see through" runtime dispatch, but SnoopCompileCore can. With SnoopCompile, you can immediately get a holistic view of your entire callgraph.
+
+Combining JET and SnoopCompile can provide insights that are difficult to obtain with either package in isolation. See the [Tutorial on JET integration](@ref).
diff --git a/docs/src/index.md b/docs/src/index.md
index baf2e9e8..30971ce1 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,101 +1,36 @@
# SnoopCompile.jl
-SnoopCompile "snoops" on the Julia compiler, causing it to record the
-functions and argument types it's compiling. From these lists of methods,
-you can generate lists of `precompile` directives that may reduce the latency between
-loading packages and using them to do "real work."
+Julia is fast, but its execution speed depends on optimizing code through *compilation*. Code must be compiled before you can use it, and unfortunately compilation is slow. This can cause *latency* the first time you use code: this latency is often called *time-to-first-plot* (TTFP) or more generally *time-to-first-execution* (TTFX). If something feels slow the first time you use it, and fast thereafter, you're probably experiencing the latency of compilation. Note that TTFX is distinct from time-to-load (TTL, which refers to the time you spend waiting for `using MyPkg` to finish), even though both contribute to latency.
-SnoopCompile can also detect and analyze *method cache invalidations*,
-which occur when new method definitions alter dispatch in a way that forces Julia to discard previously-compiled code.
-Any later usage of invalidated methods requires recompilation.
-Invalidation can trigger a domino effect, in which all users of invalidated code also become invalidated, propagating all the way back to the top-level call.
-When a source of invalidation can be identified and either eliminated or mitigated,
-you can reduce the amount of work that the compiler needs to repeat and take better advantage of precompilation.
+Modern versions of Julia can store compiled code to disk (*precompilation*) to reduce or eliminate latency. Users and developers who are interested in reducing TTFX should first head to [PrecompileTools](https://github.com/JuliaLang/PrecompileTools.jl), read its documentation thoroughly, and try using it to solve latency problems.
-Finally, SnoopCompile interacts with other important diagnostics and debugging tools in the Julia ecosystem.
-For example, the combination of SnoopCompile and [JET](https://github.com/aviatesk/JET.jl) allows you to analyze an entire call-chain for
-potential errors; see the page on [JET integration](@ref JET) for more information.
+This package, **SnoopCompile**, should be considered when:
-## Background
+- precompilation doesn't reduce TTFX as much as you wish
+- precompilation "works," but only in isolation: as soon as you load (certain) additional packages, TTFX is bad again
+- you're wondering if you can reduce the amount of time needed to precompile your package and/or the size of the precompilation cache files
-Julia uses
-[Just-in-time (JIT) compilation](https://en.wikipedia.org/wiki/Just-in-time_compilation) to
-generate the code that runs on your CPU.
-Broadly speaking, there are two major steps: *inference* and *code generation*.
-Inference is the process of determining the type of each object, which in turn
-determines which specific methods get called; once type inference is complete,
-code generation performs optimizations and ultimately generates the assembly
-language (native code) used on CPUs.
-Some aspects of this process are documented [here](https://docs.julialang.org/en/v1/devdocs/eval/).
+In other words, SnoopCompile is a diagnostic package that helps reveal the causes of latency. Historically, it preceded PrecompileTools, and indeed PrecompileTools was split out from SnoopCompile. Today, SnoopCompile is generally needed only when PrecompileTools fails to deliver the desired benefits.
-Every time you load a package in a fresh Julia session, the methods you use need
-to be JIT-compiled, and this contributes to the latency of using the package.
-In some circumstances, you can cache the results of compilation to files to
-reduce the latency when your package is used. These files are the the `*.ji` and
-`*.so` files that live in the `compiled` directory of the Julia depot, usually
-located at `~/.julia/compiled`. However, if these files become large, loading
-them can be another source for latency. Julia needs time both to load and
-validate the cached compiled code. Minimizing the latency of using a package
-involves focusing on caching the compilation of code that is both commonly used
-and takes time to compile.
+## SnoopCompile analysis modes
-This is called *precompilation*. Julia is able to save inference results in the
-`*.ji` files and ([since Julia
-1.9](https://julialang.org/blog/2023/04/julia-1.9-highlights/#caching_of_native_code))
-native code in the `*.so` files, and thus precompilation can eliminate the time
-needed for type inference and native code compilation (though what does get
-saved can sometimes be invalidated by loading other packages).
+SnoopCompile "snoops" on the Julia compiler, collecting information that may be useful to developers. Here are some of the things you can do with SnoopCompile:
-SnoopCompile is designed to try to allow you to analyze the costs of JIT-compilation, identify
-key bottlenecks that contribute to latency, and set up `precompile` directives to see whether
-it produces measurable benefits.
+- diagnose *invalidations*, cases where Julia must throw away previously-compiled code (see [Tutorial on `@snoop_invalidations`](@ref))
+- trace *inference*, to learn what code is being newly (or freshly) analyzed in an early stage of the compilation pipeline ([Tutorial on `@snoop_inference`](@ref))
+- trace *code generation by LLVM*, a late stage in the compilation pipeline ([Tutorial on `@snoop_llvm`](@ref))
+- reveal methods with excessive numbers of compiler-generated specializations, a.k.a. *profile-guided despecialization* ([Tutorial on PGDS](@ref pgds))
+- integrate with tools like [JET](https://github.com/aviatesk/JET.jl) to further reduce the risk that your lovingly-precompiled code will be invalidated by loading other packages ([Tutorial on JET integration](@ref))
-## Who should use this package
-
-SnoopCompile is intended primarily for package *developers* who want to improve the
-experience for their users.
-Because the results of SnoopCompile are typically stored in the `*.ji` precompile files,
-users automatically get the benefit of any latency reductions achieved by adding
-`precompile` directives to the source code of your package.
-
-[PackageCompiler](https://github.com/JuliaLang/PackageCompiler.jl) is an alternative
-that *non-developer users* may want to consider for their own workflow.
-It builds an entire system image (Julia + a set of selected packages) and caches both the
-results of type inference and the native code.
-Typically, PackageCompiler reduces latency more than just "plain" `precompile` directives.
-However, PackageCompiler does have significant downsides, of which the largest is that
-it is incompatible with package updates--any packages built into your system image
-cannot be updated without rebuilding the entire system.
-Particularly for people who develop or frequently update their packages, the downsides of
-PackageCompiler may outweigh its benefits.
-
-Finally, another alternative for reducing latency without any modifications
-to package files is [Revise](https://github.com/timholy/Revise.jl).
-It can be used in conjunction with SnoopCompile.
+## Background information
-## [A note on Julia versions and the recommended workflow](@id workflow)
+If nothing else, you should know this:
+- invalidations occur when you *load* code (e.g., `using MyPkg`) or otherwise define new methods
+- inference and other stages of compilation occur the first time you *run* code for a particular combination of input types
-SnoopCompile is closely intertwined with Julia's own internals.
-Some "data collection" and analysis features are available only on newer versions of Julia.
-In particular, some of the most powerful tools were made possible through several additions made in Julia 1.6;
-SnoopCompile just exposes these tools in convenient form.
+The individual tutorials briefly explain core concepts. More detail can be found in [Understanding SnoopCompile and Julia's compilation pipeline](@ref).
-If you're a developer looking to reduce the latency of your packages, you are *strongly*
-encouraged to use SnoopCompile on Julia 1.6 or later. The fruits of your labors will often
-reduce latency even for users of earlier Julia versions, but your ability to understand
-what changes need to be made will be considerably enhanced by using the latest tools.
-
-For developers who can use Julia 1.6+, the recommended sequence is:
-
-1. Check for [invalidations](@ref), and if egregious make fixes before proceeding further
-2. Record inference data with [`@snoop_inference`](@ref).
Analyze the data to: - + adjust method specialization in your package or its dependencies (see [pgds](@ref)) - + fix problems in [inferrability](@ref) - + add [precompilation](@ref) - -Under 2, the first two sub-points can often be done at the same time; the last item is best done as a final step, because the specific -precompile directives needed depend on the state of your code, and a few fixes in specialization -and/or type inference can alter or even decrease the number of necessary precompile directives. +## Who should use this package -Although there are other tools within SnoopCompile available, most developers can probably stop after the steps above. -The documentation will describe the tools in this order, followed by descriptions of additional and older tools. +SnoopCompile is intended primarily for package *developers* who want to improve the +experience for their users. It is also recommended for users who are willing to "dig deep" and understand why packages they depend on have high latency. **Your experience with latency may be personal, as it can depend on the specific combination of packages you load.** If latency troubles you, don't make the assumption that it must be unfixable: you might be the first person affected by that specific cause of latency. diff --git a/docs/src/reference.md b/docs/src/reference.md index ea058dbb..860f4e2a 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -3,9 +3,9 @@ ## Data collection ```@docs -@snoop_invalidations -@snoop_inference -@snoop_llvm +SnoopCompileCore.@snoop_invalidations +SnoopCompileCore.@snoop_inference +SnoopCompileCore.@snoop_llvm ``` ## GUIs @@ -23,6 +23,7 @@ invalidation_trees precompile_blockers filtermod findcaller +report_invalidations ``` ## Analysis of `@snoop_inference` @@ -50,12 +51,10 @@ report_callees report_caller ``` -## Other utilities +## Analysis of LLVM ```@docs -SnoopCompile.read SnoopCompile.read_snoop_llvm -SnoopCompile.format_userimg ``` ## Demos diff --git a/docs/src/snoop_inference_analysis.md b/docs/src/snoop_inference_analysis.md deleted file mode 100644 index 36cda6c8..00000000 --- a/docs/src/snoop_inference_analysis.md +++ /dev/null @@ -1,804 +0,0 @@ -# [Using `@snoop_inference` results to improve inferrability](@id inferrability) - -As indicated in the [workflow](@ref), the recommended steps to reduce latency are: - -- check for invalidations -- adjust method specialization in your package or its dependencies -- fix problems in type inference -- add `precompile` directives - -The importance of fixing "problems" in type-inference was indicated in the [tutorial](@ref): successful precompilation requires a chain of ownership, but runtime dispatch (when inference cannot predict the callee) results in breaks in this chain. By improving inferrability, you can convert short, unconnected call-trees into a smaller number of large call-trees that all link back to your package(s). - -In practice, it also turns out that opportunities to adjust specialization are often revealed by analyzing inference failures, so this page is complementary to the previous one. - -Finally, improving inference may also yield improvements in runtime performance, itself an excellent outcome. - -!!! note - [JET also detects inference failures](https://aviatesk.github.io/JET.jl/dev/optanalysis/), but JET and SnoopCompile use different mechanisms: JET performs *static* analysis of a particular call, while SnoopCompile performs *dynamic* analysis of new inference. 
As a consequence, JET's detection of inference failures is reproducible (you can run the same analysis repeatedly and get the same result) but terminates at any non-inferrable node of the call graph: you will miss runtime dispatch in any non-inferrable callees. Conversely, SnoopCompile's detection of inference failures can explore the entire callgraph, but only for those portions that have not been previously inferred, and the analysis cannot be repeated in the same session. - -Throughout this page, we'll use the `OptimizeMe` demo, which ships with `SnoopCompile`. - -!!! note - To understand what follows, it's essential to refer to [`OptimizeMe` source code](https://github.com/timholy/SnoopCompile.jl/blob/master/examples/OptimizeMe.jl) as you follow along. - -```julia -julia> using SnoopCompile - -julia> cd(joinpath(pkgdir(SnoopCompile), "examples")) - -julia> include("OptimizeMe.jl") -Main.OptimizeMe - -julia> tinf = @snoop_inference OptimizeMe.main() -lotsa containers: -7-element Vector{Main.OptimizeMe.Container}: - Main.OptimizeMe.Container{Int64}(1) - Main.OptimizeMe.Container{UInt8}(0x01) - Main.OptimizeMe.Container{UInt16}(0xffff) - Main.OptimizeMe.Container{Float32}(2.0f0) - Main.OptimizeMe.Container{Char}('a') - Main.OptimizeMe.Container{Vector{Int64}}([0]) - Main.OptimizeMe.Container{Tuple{String, Int64}}(("key", 42)) -3.14 is great -2.718 is jealous -6-element Vector{Main.OptimizeMe.Object}: - Main.OptimizeMe.Object(1) - Main.OptimizeMe.Object(2) - Main.OptimizeMe.Object(3) - Main.OptimizeMe.Object(4) - Main.OptimizeMe.Object(5) - Main.OptimizeMe.Object(7) -InferenceTimingNode: 1.423913/2.713560 on InferenceFrameInfo for Core.Compiler.Timings.ROOT() with 77 direct children - -julia> fg = flamegraph(tinf) -Node(FlameGraphs.NodeData(ROOT() at typeinfer.jl:75, 0x00, 0:2713559552)) -``` - -If you visualize `fg` with ProfileView, you'll see something like this: - -![flamegraph-OptimizeMe](assets/flamegraph-OptimizeMe.png) - -From the standpoint of precompilation, this has some obvious problems: - -- even though we called a single method, `OptimizeMe.main()`, there are many distinct flames separated by blank spaces. This indicates that many calls are being made by runtime dispatch: each separate flame is a fresh entrance into inference. -- several of the flames are marked in red, indicating that they are not precompilable. While SnoopCompile does have the capability to automatically emit `precompile` directives for the non-red bars that sit on top of the red ones, in some cases the red extends to the highest part of the flame. In such cases there is no available precompile directive, and therefore no way to avoid the cost of type-inference. - -Our goal will be to improve the design of `OptimizeMe` to make it more precompilable. - -## Analyzing inference triggers - -We'll first extract the "triggers" of inference, which is just a repackaging of part of the information contained within `tinf`. -Specifically an [`InferenceTrigger`](@ref) captures callee/caller relationships that straddle a fresh entrance to type-inference, allowing you to identify which calls were made by runtime dispatch and what `MethodInstance` they called. 
- -```julia -julia> itrigs = inference_triggers(tinf) -76-element Vector{InferenceTrigger}: - Inference triggered to call MethodInstance for vect(::Int64, ::Vararg{Any, N} where N) from lotsa_containers (/pathto/SnoopCompile/examples/OptimizeMe.jl:13) with specialization MethodInstance for lotsa_containers() - Inference triggered to call MethodInstance for promote_typeof(::Int64, ::UInt8, ::Vararg{Any, N} where N) from vect (./array.jl:126) with specialization MethodInstance for vect(::Int64, ::Vararg{Any, N} where N) - Inference triggered to call MethodInstance for promote_typeof(::UInt8, ::UInt16, ::Vararg{Any, N} where N) from promote_typeof (./promotion.jl:272) with specialization MethodInstance for promote_typeof(::Int64, ::UInt8, ::Vararg{Any, N} where N) - ⋮ -``` - -This indicates that a whopping 76 calls were (1) made by runtime dispatch and (2) the callee had not previously been inferred. -(There was a 77th call that had to be inferred, the original call to `main()`, but by default [`inference_triggers`](@ref) excludes calls made directly from top-level. You can change that through keyword arguments.) - -!!! tip - In the REPL, `SnoopCompile` displays `InferenceTrigger`s with yellow coloration for the callee, red for the caller method, and blue for the caller specialization. This makes it easier to quickly identify the most important information. - -In some cases, this might indicate that you'll need to fix 76 separate callers; fortunately, in many cases fixing the origin of inference problems can fix a number of later callees. - -### [Method triggers](@id methtrigs) - -Most often, it's most convenient to organize them by the method triggering the need for inference: - -```julia -julia> mtrigs = accumulate_by_source(Method, itrigs) -18-element Vector{SnoopCompile.TaggedTriggers{Method}}: - print_matrix_row(io::IO, X::AbstractVecOrMat{T} where T, A::Vector{T} where T, i::Integer, cols::AbstractVector{T} where T, sep::AbstractString) in Base at arrayshow.jl:96 (1 callees from 1 callers) - show(io::IO, x::T, forceuntyped::Bool, fromprint::Bool) where T<:Union{Float16, Float32, Float64} in Base.Ryu at ryu/Ryu.jl:111 (1 callees from 1 callers) - Pair(a, b) in Base at pair.jl:15 (1 callees from 1 callers) - vect(X...) in Base at array.jl:125 (1 callees from 1 callers) - makeobjects() in Main.OptimizeMe at /pathto/SnoopCompile/examples/OptimizeMe.jl:36 (1 callees from 1 callers) - show_delim_array(io::IO, itr, op, delim, cl, delim_one, i1, n) in Base at show.jl:1058 (1 callees from 1 callers) - typeinfo_prefix(io::IO, X) in Base at arrayshow.jl:515 (2 callees from 1 callers) - (::REPL.var"#38#39")(io) in REPL at /home/tim/src/julia-master/usr/share/julia/stdlib/v1.6/REPL/src/REPL.jl:214 (2 callees from 1 callers) - _cat_t(dims, ::Type{T}, X...) where T in Base at abstractarray.jl:1633 (2 callees from 1 callers) - contain_list(list) in Main.OptimizeMe at /pathto/SnoopCompile/examples/OptimizeMe.jl:27 (4 callees from 1 callers) - promote_typeof(x, xs...) in Base at promotion.jl:272 (4 callees from 4 callers) - combine_eltypes(f, args::Tuple) in Base.Broadcast at broadcast.jl:740 (5 callees from 1 callers) - lotsa_containers() in Main.OptimizeMe at /pathto/SnoopCompile/examples/OptimizeMe.jl:12 (7 callees from 1 callers) - alignment(io::IO, x) in Base at show.jl:2528 (7 callees from 7 callers) - var"#sprint#386"(context, sizehint::Integer, ::typeof(sprint), f::Function, args...) 
in Base at strings/io.jl:100 (8 callees from 2 callers) - alignment(io::IO, X::AbstractVecOrMat{T} where T, rows::AbstractVector{T} where T, cols::AbstractVector{T} where T, cols_if_complete::Integer, cols_otherwise::Integer, sep::Integer) in Base at arrayshow.jl:60 (8 callees from 2 callers) - copyto_nonleaf!(dest, bc::Base.Broadcast.Broadcasted, iter, state, count) in Base.Broadcast at broadcast.jl:1070 (9 callees from 3 callers) - _show_default(io::IO, x) in Base at show.jl:397 (12 callees from 1 callers) -``` - -The methods triggering the largest number of inference runs are shown at the bottom. -You can select methods from a particular module: - -```julia -julia> modtrigs = filtermod(OptimizeMe, mtrigs) -3-element Vector{SnoopCompile.TaggedTriggers{Method}}: - makeobjects() in Main.OptimizeMe at /home/tim/.julia/dev/SnoopCompile/examples/OptimizeMe.jl:36 (1 callees from 1 callers) - contain_list(list) in Main.OptimizeMe at /home/tim/.julia/dev/SnoopCompile/examples/OptimizeMe.jl:27 (4 callees from 1 callers) - lotsa_containers() in Main.OptimizeMe at /home/tim/.julia/dev/SnoopCompile/examples/OptimizeMe.jl:12 (7 callees from 1 callers) -``` - -Rather than filter by a single module, you can alternatively call `SnoopCompile.parcel(mtrigs)` to split them out by module. -In this case, most of the triggers came from `Base`, not `OptimizeMe`. -However, many of the failures in `Base` were nevertheless indirectly due to `OptimizeMe`: our methods in `OptimizeMe` call `Base` methods with arguments that trigger internal inference failures. -Fortunately, we'll see that using more careful design in `OptimizeMe` can avoid many of those problems. - -!!! tip - If you have a longer list of inference triggers than you feel comfortable tackling, filtering by your package's module is probably the best way to start. - Fixing issues in the package itself can end up resolving many of the "indirect" triggers too. - Also be sure to note the ability to filter out likely "noise" from [test suites](@ref test-suites). - -If you're hoping to fix inference problems, one of the most efficient things you can do is call `summary`: - -```julia -julia> mtrig = modtrigs[1] -makeobjects() in Main.OptimizeMe at /home/tim/.julia/dev/SnoopCompile/examples/OptimizeMe.jl:36 (1 callees from 1 callers) - -julia> summary(mtrig) -makeobjects() in Main.OptimizeMe at /home/tim/.julia/dev/SnoopCompile/examples/OptimizeMe.jl:36 had 1 specializations -Triggering calls: -Inlined _cat at ./abstractarray.jl:1630: calling cat_t##kw (1 instances) -``` - -Sometimes from these hints alone you can figure out how to fix the problem. -(`Inlined _cat` means that the inference trigger did not come directly from a source line of `makeobjects` but from a call, `_cat`, that got inlined into the compiled version. -Below we'll see more concretely how to interpret this hint.) - -You can also say `edit(mtrig)` and be taken directly to the method you're analyzing in your editor. -Finally, you can recover the individual triggers: - -```julia -julia> mtrig.itrigs[1] -Inference triggered to call MethodInstance for (::Base.var"#cat_t##kw")(::NamedTuple{(:dims,), Tuple{Val{1}}}, ::typeof(Base.cat_t), ::Type{Int64}, ::UnitRange{Int64}, ::Vararg{Any, N} where N) from _cat (./abstractarray.jl:1630) inlined into MethodInstance for makeobjects() (/home/tim/.julia/dev/SnoopCompile/examples/OptimizeMe.jl:37) -``` - -This is useful if you want to analyze a method via [`Cthulhu.ascend`](@ref ascend-itrig). 
-`Method`-based triggers, which may aggregate many different individual triggers, are particularly useful mostly because tools like [Cthulhu.jl](https://github.com/JuliaDebug/Cthulhu.jl) show you the inference results for the entire `MethodInstance`, allowing you to fix many different inference problems at once. - -### Trigger trees - -While method triggers are probably the most useful way of organizing these inference triggers, for learning purposes here we'll use a more detailed scheme, which organizes inference triggers in a tree: - -```julia -julia> itree = trigger_tree(itrigs) -TriggerNode for root with 14 direct children - -julia> using AbstractTrees - -julia> print_tree(itree) -root -├─ MethodInstance for vect(::Int64, ::Vararg{Any, N} where N) -│ └─ MethodInstance for promote_typeof(::Int64, ::UInt8, ::Vararg{Any, N} where N) -│ └─ MethodInstance for promote_typeof(::UInt8, ::UInt16, ::Vararg{Any, N} where N) -│ └─ MethodInstance for promote_typeof(::UInt16, ::Float32, ::Vararg{Any, N} where N) -│ └─ MethodInstance for promote_typeof(::Float32, ::Char, ::Vararg{Any, N} where N) -│ ⋮ -│ -├─ MethodInstance for combine_eltypes(::Type, ::Tuple{Vector{Any}}) -│ ├─ MethodInstance for return_type(::Any, ::Any) -│ ├─ MethodInstance for return_type(::Any, ::Any, ::UInt64) -│ ├─ MethodInstance for return_type(::Core.Compiler.NativeInterpreter, ::Any, ::Any) -│ ├─ MethodInstance for contains_is(::Core.SimpleVector, ::Any) -│ └─ MethodInstance for promote_typejoin_union(::Type{Main.OptimizeMe.Container}) -├─ MethodInstance for Main.OptimizeMe.Container(::Int64) -⋮ -``` - -The parent-child relationships are based on the backtraces at the entrance to inference, -and the nodes are organized in the order in which inference occurred. - -We're going to march through these systematically. Let's start with the first of these. 
- -### `suggest` and a fix involving manual `eltype` specification - -Because the analysis of inference failures is somewhat complex, `SnoopCompile` attempts to [`suggest`](@ref) an interpretation and/or remedy for each trigger: - -``` -julia> suggest(itree.children[1]) -/pathto/SnoopCompile/examples/OptimizeMe.jl:13: invoked callee is varargs (ignore this one, homogenize the arguments, declare an umbrella type, or force-specialize the callee MethodInstance for vect(::Int64, ::Vararg{Any, N} where N)) -immediate caller(s): -1-element Vector{Base.StackTraces.StackFrame}: - main() at OptimizeMe.jl:42 -└─ ./array.jl:126: caller is varargs (ignore this one, specialize the caller vect(::Int64, ::Vararg{Any, N} where N) at array.jl:126, or improve inferrability of its caller) - immediate caller(s): - 1-element Vector{Base.StackTraces.StackFrame}: - lotsa_containers() at OptimizeMe.jl:13 - └─ ./promotion.jl:272: caller is varargs (ignore this one, specialize the caller promote_typeof(::Int64, ::UInt8, ::Vararg{Any, N} where N) at promotion.jl:272, or improve inferrability of its caller) - immediate caller(s): - 1-element Vector{Base.StackTraces.StackFrame}: - vect(::Int64, ::Vararg{Any, N} where N) at array.jl:126 - └─ ./promotion.jl:272: caller is varargs (ignore this one, specialize the caller promote_typeof(::UInt8, ::UInt16, ::Vararg{Any, N} where N) at promotion.jl:272, or improve inferrability of its caller) - immediate caller(s): - 1-element Vector{Base.StackTraces.StackFrame}: - promote_typeof(::Int64, ::UInt8, ::Vararg{Any, N} where N) at promotion.jl:272 - └─ ./promotion.jl:272: caller is varargs (ignore this one, specialize the caller promote_typeof(::UInt16, ::Float32, ::Vararg{Any, N} where N) at promotion.jl:272, or improve inferrability of its caller) - immediate caller(s): - 1-element Vector{Base.StackTraces.StackFrame}: - promote_typeof(::UInt8, ::UInt16, ::Vararg{Any, N} where N) at promotion.jl:272 - └─ ./promotion.jl:272: caller is varargs (ignore this one, specialize the caller promote_typeof(::Float32, ::Char, ::Vararg{Any, N} where N) at promotion.jl:272, or improve inferrability of its caller) - immediate caller(s): - 1-element Vector{Base.StackTraces.StackFrame}: - promote_typeof(::UInt16, ::Float32, ::Vararg{Any, N} where N) at promotion.jl:272 - ⋮ -``` - -!!! tip - In the REPL, interpretations are highlighted in color to help distinguish individual suggestions. - -In this case, the interpretation for the first node is "invoked callee is varargs" and suggestions are to choose one of "ignore...homogenize...umbrella type...force-specialize". -Initially, this may seem pretty opaque. -It helps if we look at the referenced line `OptimizeMe.jl:13`: - -```julia -list = [1, 0x01, 0xffff, 2.0f0, 'a', [0], ("key", 42)] -``` - -You'll notice above that the callee for the first node is `vect`; that's what handles the creation of the vector `[1, ...]`. -If you look back up at the `itree`, you can see that a lot of `promote_typeof` calls follow, and you can see that the types listed in the arguments match the elements in `list`. -The problem, here, is that `vect` has never been inferred for this particular combination of argument types, and the fact that the types are diverse means that Julia has decided not to specialize it for this combination. -(If Julia had specialized it, it would have been inferred when `lotsa_containers` was inferred; the fact that it is showing up as a trigger means it wasn't.) 
- -Let's see what kind of object this line creates: - -```julia -julia> typeof(list) -Vector{Any} (alias for Array{Any, 1}) -``` - -Since it creates a `Vector{Any}`, perhaps we should just tell Julia to create such an object directly: we modify `[1, 0x01, ...]` to `Any[1, 0x01, ...]` (note the `Any` in front of `[`), so that Julia doesn't have to deduce the container type on its own. -This follows the "declare an umbrella type" suggestion. - -!!! note - "Force-specialize" means to encourage Julia to violate its heuristics and specialize the callee. - Often this can be achieved by supplying a "spurious" type parameter. - Examples include replacing `higherorderfunction(f::Function, args...)` with `function higherorderfunction(f::F, args...) where F<:Function`, - or `function getindex(A::MyArrayType{T,N}, idxs::Vararg{Int,N}) where {T,N}` instead of just `getindex(A::MyArrayType, idxs::Int...)`. - (In the latter case, the `N` parameter is the crucial one: it forces specialization for a particular number of `Int` arguments.) - - This technique is not useful for the particular case we analyzed here, but it can be in other settings. - -Making this simple 3-character fix eliminates that entire branch of the tree (a savings of 6 inference triggers). - -### `eltype`s and reducing specialization in `broadcast` - -Let's move on to the next entry: - -``` -julia> print_tree(itree.children[2]) -MethodInstance for combine_eltypes(::Type, ::Tuple{Vector{Any}}) -├─ MethodInstance for return_type(::Any, ::Any) -├─ MethodInstance for return_type(::Any, ::Any, ::UInt64) -├─ MethodInstance for return_type(::Core.Compiler.NativeInterpreter, ::Any, ::Any) -├─ MethodInstance for contains_is(::Core.SimpleVector, ::Any) -└─ MethodInstance for promote_typejoin_union(::Type{Main.OptimizeMe.Container}) - -julia> suggest(itree.children[2]) -./broadcast.jl:905: regular invoke (perhaps precompile lotsa_containers() at OptimizeMe.jl:14) -├─ ./broadcast.jl:740: I've got nothing to say for MethodInstance for return_type(::Any, ::Any) consider `stacktrace(itrig)` or `Cthulhu.ascend(itrig)` -├─ ./broadcast.jl:740: I've got nothing to say for MethodInstance for return_type(::Any, ::Any, ::UInt64) consider `stacktrace(itrig)` or `Cthulhu.ascend(itrig)` -├─ ./broadcast.jl:740: I've got nothing to say for MethodInstance for return_type(::Core.Compiler.NativeInterpreter, ::Any, ::Any) consider `stacktrace(itrig)` or `Cthulhu.ascend(itrig)` -├─ ./broadcast.jl:740: I've got nothing to say for MethodInstance for contains_is(::Core.SimpleVector, ::Any) consider `stacktrace(itrig)` or `Cthulhu.ascend(itrig)` -└─ ./broadcast.jl:740: non-inferrable call, perhaps annotate combine_eltypes(f, args::Tuple) in Base.Broadcast at broadcast.jl:740 with type MethodInstance for promote_typejoin_union(::Type{Main.OptimizeMe.Container}) - If a noninferrable argument is a type or function, Julia's specialization heuristics may be responsible. - immediate caller(s): - 3-element Vector{Base.StackTraces.StackFrame}: - copy at broadcast.jl:905 [inlined] - materialize at broadcast.jl:883 [inlined] - lotsa_containers() at OptimizeMe.jl:14 -``` - -While this tree is attributed to `broadcast`, you can see several references here to `OptimizeMe.jl:14`, which contains: - -```julia -cs = Container.(list) -``` - -`Container.(list)` is a broadcasting operation, and once again we find that this has inferrability problems. -In this case, the initial suggestion "perhaps precompile `lotsa_containers`" is *not* helpful. 
-(The "regular invoke" just means that the initial call was one where inference knew all the argument types, and hence in principle might be precompilable, but from this tree we see that this broke down in some of its callees.) -Several children have no interpretation ("I've got nothing to say..."). -Only the last one, "non-inferrable call", is (marginally) useful, it means that a call was made with arguments whose types could not be inferred. - -!!! warning - You should always view these suggestions skeptically. - Often, they flag downstream issues that are better addressed at the source; frequently the best fix may be at a line a bit before the one identified in a trigger, or even in a dependent callee of a line prior to the flagged one. - This is a product of the fact that *returning* a non-inferrable argument is not the thing that forces a new round of inference; - it's *doing something* (making a specialization-worthy call) with the object of non-inferrable type that triggers a fresh entrance into inference. - -How might we go about fixing this? -One hint is to notice that `itree.children[3]` through `itree.children[7]` also ultimiately derive from this one line of `OptimizeMe`, -but from a later line within `broadcast.jl` which explains why they are not bundled together with `itree.children[2]`. -May of these correspond to creating different `Container` types, for example: - -``` -└─ MethodInstance for restart_copyto_nonleaf!(::Vector{Main.OptimizeMe.Container}, ::Vector{Main.OptimizeMe.Container{Int64}}, ::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}, Tuple{Base.OneTo{Int64}}, Type{Main.OptimizeMe.Container}, Tuple{Base.Broadcast.Extruded{Vector{Any}, Tuple{Bool}, Tuple{Int64}}}}, ::Main.OptimizeMe.Container{UInt8}, ::Int64, ::Base.OneTo{Int64}, ::Int64, ::Int64) - ├─ MethodInstance for Main.OptimizeMe.Container(::UInt16) - ├─ MethodInstance for Main.OptimizeMe.Container(::Float32) - ├─ MethodInstance for Main.OptimizeMe.Container(::Char) - ├─ MethodInstance for Main.OptimizeMe.Container(::Vector{Int64}) - └─ MethodInstance for Main.OptimizeMe.Container(::Tuple{String, Int64}) -``` - -We've created a `Container{T}` for each specific `T` of the objects in `list`. -In some cases, there may be good reasons for such specialization, and in such cases we just have to live with these inference failures. -However, in other cases the specialization might be detrimental to compile-time and/or runtime performance. -In such cases, we might decide to create them all as `Container{Any}`: - -```julia -cs = Container{Any}.(list) -``` - -This 5-character change ends up eliminating 45 of our original 76 triggers. -Not only did we eliminate the triggers from broadcasting, but we limited the number of different `show(::IO, ::Container{T})`-`MethodInstance`s we need from later calls in `main`. - -When the `Container` constructor does more complex operations, in some cases you may find that `Container{Any}(args...)` still gets specialized for different types of `args...`. -In such cases, you can create a special constructor that instructs Julia to avoid specialization in specific instances, e.g., - -```julia -struct Container{T} - field1::T - morefields... - - # This constructor permits specialization on `args` - Container{T}(args...) where {T} = new{T}(args...) - - # For Container{Any}, we prevent specialization - Container{Any}(@nospecialize(args...)) = new{Any}(args...) 
-end
-```
-
-If you're following along, the best option is to make these fixes and go back to the beginning, re-collecting `tinf` and processing the triggers.
-We're down to 32 inference triggers.
-
-### [Adding type-assertions](@id typeasserts)
-
-If you've made the fixes above, the first child of `itree` is one for `show(::IOContext{Base.TTY}, ::MIME{Symbol("text/plain")}, ::Vector{Main.OptimizeMe.Container{Any}})`;
-we'll skip that one for now, because it's a bit more sophisticated.
-Right below it, we see
-
-```
-├─ MethodInstance for combine_eltypes(::Type, ::Tuple{Vector{Any}})
-│  ├─ MethodInstance for return_type(::Any, ::Any)
-│  ├─ MethodInstance for return_type(::Any, ::Any, ::UInt64)
-```
-
-and related nodes for `similar`, `copyto_nonleaf!`, etc., just as we saw above, so this looks like another case of broadcasting failure.
-In this case, `suggest` quickly indicates that it's the broadcasting in
-
-```julia
-function contain_list(list)
-    cs = Container.(list)
-    return concat_string(cs...)
-end
-```
-
-Now we know the problem: `main` creates `list = [2.718, "is jealous"]`,
-a vector with different object types, and this leads to inference failures in broadcasting.
-But wait, you might notice: `contain_concrete` gets called before `contain_list`, so why doesn't it have a problem?
-The reason is that `contain_concrete` and its callee, `concat_string`, provide opportunities for inference to handle each object in a separate argument;
-the problems arise from bundling objects of different types into the same container.
-
-There are several ways we could go about fixing this example:
-
-- we could delete `contain_list` altogether and use `contain_concrete` for everything.
-- we could try creating `list` as a tuple rather than a `Vector{Any}`; (small) tuples sometimes allow inference to succeed even when each element has a different type. This is as simple as changing `list = [2.718, "is jealous"]` to `list = (2.718, "is jealous")`, but whether it works to solve all your inference problems depends on the particular case.
-- we could use external knowledge to annotate the types of the items in `list::Vector{Any}`.
-
-Here we'll illustrate the last of these, since it's the only one that's nontrivial.
-(It's also often a useful pattern in many real-world contexts, such as cases where you have a `Dict{String,Any}` but know something about the kinds of value-types associated with particular string keys.)
-We could rewrite `contain_list` so it looks like this:
-
-```julia
-function contain_list(list)
-    length(list) == 2 || throw(DimensionMismatch("list must have length 2"))
-    item1 = list[1]::Float64
-    item2 = list[2]::String
-    return contain_concrete(item1, item2)   # or we could repeat the body of contain_concrete
-end
-```
-
-The type-assertions tell inference that the corresponding items have the given types, and assist inference in cases where it has no mechanism to deduce the answer on its own.
-Julia will throw an error if the type-assertion fails.
-In some cases, a more forgiving option might be
-
-```julia
-item1 = convert(Float64, list[1])::Float64
-```
-which will attempt to convert `list[1]` to a `Float64`, and therefore handle a wider range of number types stored in the first element of `list`.
-Believe it or not, both the `convert()` and the `::Float64` type-assertion are necessary: -since `list[1]` is of type `Any`, Julia will not be able to deduce which `convert` method will be used to perform the conversion, and it's always possible that someone has written a sloppy `convert` that doesn't return a value of the requested type. -Without that final `::Float64`, inference cannot simply assume that the result is a `Float64`. -The type-assert `::Float64` enforces the fact that you're expecting that `convert` call to actually return a `Float64`--it will error if it fails to do so, and it's this error that allows inference to be certain that for the purposes of any later code it must be a `Float64`. - -Of course, this just trades one form of inference failure for another--the call to `convert` will be made by runtime dispatch--but this can nevertheless be a big win for two reasons: - -- even though the `convert` call will be made by runtime dispatch, in this particular case `convert(Float64, ::Float64)` is already compiled in Julia itself. Consequently it doesn't require a fresh run of inference. -- even in cases where the types are such that `convert` might need to be inferred & compiled, the type-assertion allows Julia to assume that `item1` is henceforth a `Float64`. This makes it possible for inference to succeed for any code that follows. When that's a large amount of code, the savings can be considerable. - -Let's make that fix and also annotate the container type from `main`, `list = Any[2.718, "is jealous"]`. -Just to see how we're progressing, we start a fresh session and discover we're down to 20 triggers with just three direct branches. - -### Vararg homogenization - -We'll again skip over the `show` branches (they are two of the remaining three), and focus on this one: - -```julia -julia> node = itree.children[2] -TriggerNode for MethodInstance for (::Base.var"#cat_t##kw")(::NamedTuple{(:dims,), Tuple{Val{1}}}, ::typeof(Base.cat_t), ::Type{Int64}, ::UnitRange{Int64}, ::Vararg{Any, N} where N) with 2 direct children - -julia> print_tree(node) -MethodInstance for (::Base.var"#cat_t##kw")(::NamedTuple{(:dims,), Tuple{Val{1}}}, ::typeof(Base.cat_t), ::Type{Int64}, ::UnitRange{Int64}, ::Vararg{Any, N} where N) -├─ MethodInstance for cat_similar(::UnitRange{Int64}, ::Type, ::Tuple{Int64}) -└─ MethodInstance for __cat(::Vector{Int64}, ::Tuple{Int64}, ::Tuple{Bool}, ::UnitRange{Int64}, ::Vararg{Any, N} where N) - -julia> suggest(node) -./abstractarray.jl:1630: invoked callee is varargs (ignore this one, force-specialize the callee MethodInstance for (::Base.var"#cat_t##kw")(::NamedTuple{(:dims,), Tuple{Val{1}}}, ::typeof(Base.cat_t), ::Type{Int64}, ::UnitRange{Int64}, ::Vararg{Any, N} where N), or declare an umbrella type) -immediate caller(s): -1-element Vector{Base.StackTraces.StackFrame}: - main() at OptimizeMe.jl:48 -├─ ./abstractarray.jl:1636: caller is varargs (ignore this one, specialize the caller _cat_t(::Val{1}, ::Type{Int64}, ::UnitRange{Int64}, ::Vararg{Any, N} where N) at abstractarray.jl:1636, or improve inferrability of its caller) -│ immediate caller(s): -│ 1-element Vector{Base.StackTraces.StackFrame}: -│ cat_t(::Type{Int64}, ::UnitRange{Int64}, ::Vararg{Any, N} where N; dims::Val{1}) at abstractarray.jl:1632 -└─ ./abstractarray.jl:1640: caller is varargs (ignore this one, specialize the caller _cat_t(::Val{1}, ::Type{Int64}, ::UnitRange{Int64}, ::Vararg{Any, N} where N) at abstractarray.jl:1640, or improve inferrability of its caller) - immediate caller(s): - 
1-element Vector{Base.StackTraces.StackFrame}: - cat_t(::Type{Int64}, ::UnitRange{Int64}, ::Vararg{Any, N} where N; dims::Val{1}) at abstractarray.jl:1632 -``` - -Due to Julia's optimization and inlining, it's sometimes a bit hard to tell from these shortened displays where a particular trigger comes from. -(It turns out that this is finally the trigger we looked at in greatest detail in [method-based triggers](@ref methtrigs).) -In this case we extract the specific trigger and show the stacktrace: - -```julia -julia> itrig = node.itrig -Inference triggered to call MethodInstance for (::Base.var"#cat_t##kw")(::NamedTuple{(:dims,), Tuple{Val{1}}}, ::typeof(Base.cat_t), ::Type{Int64}, ::UnitRange{Int64}, ::Vararg{Any, N} where N) from _cat (./abstractarray.jl:1630) inlined into MethodInstance for makeobjects() (/tmp/OptimizeMe.jl:39) - -julia> stacktrace(itrig) -24-element Vector{Base.StackTraces.StackFrame}: - exit_current_timer at typeinfer.jl:166 [inlined] - typeinf(interp::Core.Compiler.NativeInterpreter, frame::Core.Compiler.InferenceState) at typeinfer.jl:208 - typeinf_ext(interp::Core.Compiler.NativeInterpreter, mi::Core.MethodInstance) at typeinfer.jl:835 - typeinf_ext_toplevel(interp::Core.Compiler.NativeInterpreter, linfo::Core.MethodInstance) at typeinfer.jl:868 - typeinf_ext_toplevel(mi::Core.MethodInstance, world::UInt64) at typeinfer.jl:864 - _cat at abstractarray.jl:1630 [inlined] - #cat#127 at abstractarray.jl:1769 [inlined] - cat at abstractarray.jl:1769 [inlined] - vcat at abstractarray.jl:1698 [inlined] - makeobjects() at OptimizeMe.jl:39 - main() at OptimizeMe.jl:48 - top-level scope at snoop_inference.jl:53 - eval(m::Module, e::Any) at boot.jl:360 - eval_user_input(ast::Any, backend::REPL.REPLBackend) at REPL.jl:139 - repl_backend_loop(backend::REPL.REPLBackend) at REPL.jl:200 - start_repl_backend(backend::REPL.REPLBackend, consumer::Any) at REPL.jl:185 - run_repl(repl::REPL.AbstractREPL, consumer::Any; backend_on_current_task::Bool) at REPL.jl:317 - run_repl(repl::REPL.AbstractREPL, consumer::Any) at REPL.jl:305 - (::Base.var"#872#874"{Bool, Bool, Bool})(REPL::Module) at client.jl:387 - #invokelatest#2 at essentials.jl:707 [inlined] - invokelatest at essentials.jl:706 [inlined] - run_main_repl(interactive::Bool, quiet::Bool, banner::Bool, history_file::Bool, color_set::Bool) at client.jl:372 - exec_options(opts::Base.JLOptions) at client.jl:302 - _start() at client.jl:485 -``` - -(You can also call `stacktrace` directly on `node`.) -It's the lines immediately following `typeinf_ext_toplevel` that need concern us: -you can see that the "last stop" on code we wrote here was `makeobjects() at OptimizeMe.jl:39`, after which it goes fairly deep into the concatenation pipeline before suffering an inference trigger at `_cat at abstractarray.jl:1630`. - -In this case, the first hint is quite useful, if we know how to interpret it. -The `invoked callee is varargs` reassures us that the immediate caller, `_cat`, knows exactly which method it is calling (that's the meaning of the `invoked`). -The real problem is that it doesn't know how to specialize it. -The suggestion to `homogenize the arguments` is the crucial hint: -the problem comes from the fact that in - -```julia -xs = [1:5; 7] -``` - -`1:5` is a `UnitRange{Int}` whereas `7` is an `Int`, and the fact that these are two different types prevents Julia from knowing how to specialize that varargs call. 
-But this is easy to fix, because the result will be identical if we write this as - -```julia -xs = [1:5; 7:7] -``` - -in which case both arguments are `UnitRange{Int}`, and this allows Julia to specialize the varargs call. - -!!! note - It's generally a good thing that Julia doesn't specialize each and every varargs call, because the lack of specialization reduces latency. - However, when you can homogenize the argument types and make it inferrable, you make it more worthy of precompilation, which is a different and ultimately more impactful approach to latency reduction. - -### Defining `show` methods for custom types - -Finally we are left with nodes that are related to `show`. -We'll temporarily skip the first of these and examine - -```julia -julia> print_tree(node) -MethodInstance for show(::IOContext{Base.TTY}, ::MIME{Symbol("text/plain")}, ::Vector{Main.OptimizeMe.Object}) -└─ MethodInstance for var"#sprint#386"(::IOContext{Base.TTY}, ::Int64, ::typeof(sprint), ::Function, ::Main.OptimizeMe.Object) - └─ MethodInstance for sizeof(::Main.OptimizeMe.Object) -``` - -We'll use this as an excuse to point out that if you don't know how to deal with the root node of this (sub)tree, you can tackle later nodes: - -```julia -julia> itrigsnode = flatten(node) -3-element Vector{InferenceTrigger}: - Inference triggered to call MethodInstance for show(::IOContext{Base.TTY}, ::MIME{Symbol("text/plain")}, ::Vector{Main.OptimizeMe.Object}) from #38 (/home/tim/src/julia-master/usr/share/julia/stdlib/v1.6/REPL/src/REPL.jl:220) with specialization MethodInstance for (::REPL.var"#38#39"{REPL.REPLDisplay{REPL.LineEditREPL}, MIME{Symbol("text/plain")}, Base.RefValue{Any}})(::Any) - Inference triggered to call MethodInstance for var"#sprint#386"(::IOContext{Base.TTY}, ::Int64, ::typeof(sprint), ::Function, ::Main.OptimizeMe.Object) from sprint##kw (./strings/io.jl:101) inlined into MethodInstance for alignment(::IOContext{Base.TTY}, ::Vector{Main.OptimizeMe.Object}, ::UnitRange{Int64}, ::UnitRange{Int64}, ::Int64, ::Int64, ::Int64) (./arrayshow.jl:68) - Inference triggered to call MethodInstance for sizeof(::Main.OptimizeMe.Object) from _show_default (./show.jl:402) with specialization MethodInstance for _show_default(::IOContext{IOBuffer}, ::Any) - -julia> itrig = itrigsnode[end] -Inference triggered to call MethodInstance for sizeof(::Main.OptimizeMe.Object) from _show_default (./show.jl:402) with specialization MethodInstance for _show_default(::IOContext{IOBuffer}, ::Any) -``` - -The stacktrace begins - -```julia -julia> stacktrace(itrig) -35-element Vector{Base.StackTraces.StackFrame}: - exit_current_timer at typeinfer.jl:166 [inlined] - typeinf(interp::Core.Compiler.NativeInterpreter, frame::Core.Compiler.InferenceState) at typeinfer.jl:208 - typeinf_ext(interp::Core.Compiler.NativeInterpreter, mi::Core.MethodInstance) at typeinfer.jl:835 - typeinf_ext_toplevel(interp::Core.Compiler.NativeInterpreter, linfo::Core.MethodInstance) at typeinfer.jl:868 - typeinf_ext_toplevel(mi::Core.MethodInstance, world::UInt64) at typeinfer.jl:864 - _show_default(io::IOContext{IOBuffer}, x::Any) at show.jl:402 - show_default at show.jl:395 [inlined] - show(io::IOContext{IOBuffer}, x::Any) at show.jl:390 - sprint(f::Function, args::Main.OptimizeMe.Object; context::IOContext{Base.TTY}, sizehint::Int64) at io.jl:103 -⋮ -``` - -You can see that `sprint` called `show` which called `_show_default`; -`_show_default` clearly needed to call `sizeof`. 
-The hint, in this case, suggests the impossible:
-
-```
-julia> suggest(itrig)
-./show.jl:402: non-inferrable call, perhaps annotate _show_default(io::IO, x) in Base at show.jl:397 with type MethodInstance for sizeof(::Main.OptimizeMe.Object)
-If a noninferrable argument is a type or function, Julia's specialization heuristics may be responsible.
-immediate caller(s):
-2-element Vector{Base.StackTraces.StackFrame}:
- show_default at show.jl:395 [inlined]
- show(io::IOContext{IOBuffer}, x::Any) at show.jl:390
-```
-
-Because `Base` doesn't know about `OptimizeMe.Object`, you could not add such an annotation, and it wouldn't be correct in the vast majority of cases.
-
-As the name implies, `_show_default` is the fallback `show` method.
-We can fix this by adding our own `show` method
-
-```julia
-Base.show(io::IO, o::Object) = print(io, "Object x: ", o.x)
-```
-
-to the module definition.
-`Object` is so simple that this is slightly silly, but in more complex cases adding good `show` methods improves usability of your packages tremendously.
-(SnoopCompile has many `show` specializations, and without them it would be practically unusable.)
-
-When you do define a custom `show` method, you own it, so of course it will be precompilable.
-So we've circumvented this particular issue.
-
-### Creating "warmup" methods
-
-Finally, it is time to deal with those long-delayed `show(::IOContext{Base.TTY}, ::MIME{Symbol("text/plain")}, ::T)` triggers and the triggers they inspire.
-We have two of them, one for `T = Vector{Main.OptimizeMe.Container{Any}}` and one for `T = Vector{Main.OptimizeMe.Object}`.
-Let's look at just the trigger associated with the first:
-
-```
-julia> itrig
-Inference triggered to call MethodInstance for show(::IOContext{Base.TTY}, ::MIME{Symbol("text/plain")}, ::Vector{Main.OptimizeMeFixed.Container{Any}}) from #38 (/pathto/julia/usr/share/julia/stdlib/v1.6/REPL/src/REPL.jl:220) with specialization MethodInstance for (::REPL.var"#38#39"{REPL.REPLDisplay{REPL.LineEditREPL}, MIME{Symbol("text/plain")}, Base.RefValue{Any}})(::Any)
-```
-
-In this case we see that the method is `#38`. This is a `gensym`, or generated symbol, indicating that the method was generated during Julia's lowering pass, and might indicate a macro, a `do` block or other anonymous function, the generator for a `@generated` function, etc.
-
-!!! warning
-    It's particularly worthwhile to improve inferrability for gensym-methods. The number assigned to a gensymmed-method may change as you or other developers modify the package (possibly due to changes at very different source-code locations), and so any explicit `precompile` directives involving gensyms may not have a long useful life.
-
-    But not all methods with `#` in their name are problematic: methods ending in `##kw` or that look like `##funcname#39` are *keyword* and *body* methods, respectively, for methods that accept keywords. They can be obtained from the main method, and so `precompile` directives for such methods will not be outdated by incidental changes to the package.
-
-`edit(itrig)` (or equivalently, `edit(node)` where `node` is a child of `itree`) takes us to this method in `Base`:
-
-```julia
-function display(d::REPLDisplay, mime::MIME"text/plain", x)
-    x = Ref{Any}(x)
-    with_repl_linfo(d.repl) do io
-        io = IOContext(io, :limit => true, :module => Main::Module)
-        get(io, :color, false) && write(io, answer_color(d.repl))
-        if isdefined(d.repl, :options) && isdefined(d.repl.options, :iocontext)
-            # this can override the :limit property set initially
-            io = foldl(IOContext, d.repl.options.iocontext, init=io)
-        end
-        show(io, mime, x[])
-        println(io)
-    end
-    return nothing
-end
-```
-
-The generated method corresponds to the `do` block here.
-The call to `show` comes from `show(io, mime, x[])`.
-This implementation uses a clever trick, wrapping `x` in `Ref{Any}(x)`, to prevent specialization of the method defined by the `do` block on the specific type of `x`.
-This trick is designed to limit the number of `MethodInstance`s inferred for this `display` method.
-
-Unfortunately, from the standpoint of precompilation we have something of a conundrum.
-It turns out that this trigger corresponds to the first of the big red flames in the flame graph.
-`show(::IOContext{Base.TTY}, ::MIME{Symbol("text/plain")}, ::Vector{Main.OptimizeMe.Container{Any}})` is not precompilable because `Base` owns the `show` method for `Vector`;
-we might own the element type, but we're leveraging the generic machinery in `Base` and consequently it owns the method.
-If these were all packages, you might ask the developers of the package that owns the method to add a `precompile` directive, but that will work only if the package that owns the method knows about the relevant type.
-In this situation, Julia's `Base` module doesn't know about `OptimizeMe.Container{Any}`, so we're stuck.
-
-There are a couple of ways one might go about improving matters.
-One option would be to change this in Julia itself: since the caller, `display`, has gone to some lengths to reduce specialization, it would be worth contemplating whether `show(io::IO, ::MIME"text/plain", X::AbstractArray)` should have a `@nospecialize` around `X`.
-Here, we'll pursue a simple "cheat," one that allows us to directly precompile this method.
-The trick is to link it, via a chain of backedges, to a method that our package owns:
-
-```julia
-# "Stub" callers for precompilability (we don't use this function for any real work)
-function warmup()
-    mime = MIME("text/plain")
-    io = Base.stdout::Base.TTY
-    # Container{Any}
-    v = [Container{Any}(0)]
-    show(io, mime, v)
-    show(IOContext(io), mime, v)
-    # Object
-    v = [Object(0)]
-    show(io, mime, v)
-    show(IOContext(io), mime, v)
-    return nothing
-end
-
-precompile(warmup, ())
-```
-
-We handled not just `Vector{Container{Any}}` but also `Vector{Object}`, since that turns out to correspond to the other wide block of red bars.
-If you make this change, start a fresh session, and recreate the flame graph, you'll see that the wide red flames are gone:
-
-![flamegraph-OptimizeMeFixed](assets/flamegraph-OptimizeMeFixed.png)
-
-
-!!! info
-    It's worth noting that this `warmup` method needed to be carefully written to succeed in its mission. `stdout` is not inferrable (it's a global that can be replaced by `redirect_stdout`), so we needed to annotate its type. We also might have been tempted to use a loop, `for io in (stdout, IOContext(stdout)) ... end`, but inference needs a dedicated call-site where it knows all the types.
 ([Union-splitting](https://julialang.org/blog/2018/08/union-splitting/) can sometimes come to the rescue, but not if the list is long or elements non-inferrable.) The safest option is to make each call from a separate site in the code.
-
-The next trigger, a call to `sprint` from inside `Base.alignment(io::IO, x::Any)`, could also be handled using this `warmup` trick, but the flamegraph says this call (also marked in red) isn't expensive to infer. In such cases, it's fine to leave it be.
-
-### Implementing or requesting `precompile` directives in upstream packages
-
-Of the remaining triggers (now numbering 14), the flamegraph indicates that the most expensive inference run is
-
-```
-Inference triggered to call MethodInstance for show(::IOContext{IOBuffer}, ::Float32) from _show_default (./show.jl:412) with specialization MethodInstance for _show_default(::IOContext{IOBuffer}, ::Any)
-```
-
-You can check that by listing the children of `ROOT` in order of `inclusive` time:
-
-```julia
-julia> nodes = sort(tinf.children; by=inclusive)
-14-element Vector{SnoopCompileCore.InferenceTimingNode}:
- InferenceTimingNode: 0.000053/0.000053 on InferenceFrameInfo for ==(::Type, nothing::Nothing) with 0 direct children
- InferenceTimingNode: 0.000054/0.000054 on InferenceFrameInfo for sizeof(::Main.OptimizeMeFixed.Container{Any}) with 0 direct children
- InferenceTimingNode: 0.000061/0.000061 on InferenceFrameInfo for Base.typeinfo_eltype(::Type) with 0 direct children
- InferenceTimingNode: 0.000075/0.000380 on InferenceFrameInfo for show(::IOContext{IOBuffer}, ::Any) with 1 direct children
- InferenceTimingNode: 0.000445/0.000445 on InferenceFrameInfo for Pair{Symbol, DataType}(::Any, ::Any) with 0 direct children
- InferenceTimingNode: 0.000663/0.000663 on InferenceFrameInfo for print(::IOContext{Base.TTY}, ::String, ::String, ::Vararg{String, N} where N) with 0 direct children
- InferenceTimingNode: 0.000560/0.001049 on InferenceFrameInfo for Base.var"#sprint#386"(::IOContext{Base.TTY}, ::Int64, sprint::typeof(sprint), ::Function, ::Main.OptimizeMeFixed.Object) with 4 direct children
- InferenceTimingNode: 0.000441/0.001051 on InferenceFrameInfo for Pair(::Symbol, ::Type) with 1 direct children
- InferenceTimingNode: 0.000627/0.001140 on InferenceFrameInfo for Base.var"#sprint#386"(::IOContext{Base.TTY}, ::Int64, sprint::typeof(sprint), ::Function, ::Main.OptimizeMeFixed.Container{Any}) with 4 direct children
- InferenceTimingNode: 0.000321/0.001598 on InferenceFrameInfo for show(::IOContext{IOBuffer}, ::UInt16) with 4 direct children
- InferenceTimingNode: 0.000190/0.012516 on InferenceFrameInfo for show(::IOContext{IOBuffer}, ::Vector{Int64}) with 3 direct children
- InferenceTimingNode: 0.021179/0.033940 on InferenceFrameInfo for Base.Ryu.writeshortest(::Vector{UInt8}, ::Int64, ::Float32, ::Bool, ::Bool, ::Bool, ::Int64, ::UInt8, ::Bool, ::UInt8, ::Bool, ::Bool) with 29 direct children
- InferenceTimingNode: 0.000083/0.035496 on InferenceFrameInfo for show(::IOContext{IOBuffer}, ::Tuple{String, Int64}) with 1 direct children
- InferenceTimingNode: 0.000188/0.092555 on InferenceFrameInfo for show(::IOContext{IOBuffer}, ::Float32) with 1 direct children
-```
-
-You can see it's the most expensive remaining root, weighing in at nearly 100ms.
-This method is defined in the `Base.Ryu` module,
-
-```julia
-julia> node = nodes[end]
-InferenceTimingNode: 0.000188/0.092555 on InferenceFrameInfo for show(::IOContext{IOBuffer}, ::Float32) with 1 direct children
-
-julia> Method(node)
-show(io::IO, x::T) where T<:Union{Float16, Float32, Float64} in Base.Ryu at ryu/Ryu.jl:111
-```
-
-Now, we could add this to `warmup` and at least solve the inference problem.
-However, on the flamegraph you might note that this is followed shortly by a couple of calls to `Ryu.writeshortest` (the third-most expensive to infer), followed by a long gap.
-That hints that other steps, like native code generation, may be expensive.
-Since these are base Julia methods, and `Float32` is a common type, it would make sense to file an issue or pull request suggesting that Julia ship with these precompiled--that would cache not only the type-inference but also the native code, and thus represents a far more complete solution.
-
-Later, we'll see how `parcel` can generate such precompile directives automatically, so this is not a step you need to implement entirely on your own.
-
-Another `show` `MethodInstance`, `show(::IOContext{IOBuffer}, ::Tuple{String, Int64})`, seems too specific to be worth worrying about, so we call it quits here.
-
-### [Advanced analysis: `Cthulhu.ascend`](@id ascend-itrig)
-
-One thing that hasn't yet been covered is that when you really need more insight, you can use `ascend`:
-
-```julia
-julia> itrig = itrigs[5]
-Inference triggered to call MethodInstance for show(::IOContext{IOBuffer}, ::Float32) from _show_default (./show.jl:412) with specialization MethodInstance for _show_default(::IOContext{IOBuffer}, ::Any)
-
-julia> ascend(itrig)
-Choose a call for analysis (q to quit):
- >   show(::IOContext{IOBuffer}, ::Float32)
-       _show_default(::IOContext{IOBuffer}, ::Any) at ./show.jl:412
-         show_default at ./show.jl:395 => show(::IOContext{IOBuffer}, ::Any) at ./show.jl:390
-           #sprint#386(::IOContext{Base.TTY}, ::Int64, ::typeof(sprint), ::Function, ::Main.OptimizeMeFixed.Container{Any}) at ./strings/io.jl:103
-             sprint##kw at ./strings/io.jl:101 => alignment at ./show.jl:2528 => alignment(::IOContext{Base.TTY}, ::Vector{Main.OptimizeMeFixed.Container{Any}}, ::UnitRange{Int64}, ::UnitRange{Int64}, ::
-               print_matrix(::IOContext{Base.TTY}, ::AbstractVecOrMat{T} where T, ::String, ::String, ::String, ::String, ::String, ::String, ::Int64, ::Int64) at ./arrayshow.jl:197
-                 print_matrix at ./arrayshow.jl:169 => print_array at ./arrayshow.jl:323 => show(::IOContext{Base.TTY}, ::MIME{Symbol("text/plain")}, ::Vector{Main.OptimizeMeFixed.Container{Any}}) at ./a
-                   (::REPL.var"#38#39"{REPL.REPLDisplay{REPL.LineEditREPL}, MIME{Symbol("text/plain")}, Base.RefValue{Any}})(::Any) at /home/tim/src/julia-master/usr/share/julia/stdlib/v1.6/REPL/src/REPL
-                     with_repl_linfo(::Any, ::REPL.LineEditREPL) at /home/tim/src/julia-master/usr/share/julia/stdlib/v1.6/REPL/src/REPL.jl:462
-v                      display(::REPL.REPLDisplay, ::MIME{Symbol("text/plain")}, ::Any) at /home/tim/src/julia-master/usr/share/julia/stdlib/v1.6/REPL/src/REPL.jl:213
-
-```
-
-`ascend` was covered in much greater detail in [fixing invalidations](@ref invalidations), and you can read about using it on that page.
-Here, one twist is that some lines contain content like
-
-```
-show_default at ./show.jl:395 => show(::IOContext{IOBuffer}, ::Any) at ./show.jl:390
-```
-
-This indicates that `show_default` was inlined into `show`.
-`ascend` needs the full non-inlined `MethodInstance` to descend into, so the tree only includes such nodes.
-However, within Cthulhu you can toggle optimization and thereby descend into some of these inlined methods, or see the full consequence of their inlining into the caller.
-
-## [A note on analyzing test suites](@id test-suites)
-
-If you're doing a package analysis, it's convenient to use the package's `runtests.jl` script as a way to cover much of the package's functionality.
-SnoopCompile has a couple of enhancements designed to make it easier to ignore inference triggers that come from the test suite itself.
-First, `suggest.(itrigs)` may show something like this:
-
-```
-    ./broadcast.jl:1315: inlineable (ignore this one)
-    ./broadcast.jl:1315: inlineable (ignore this one)
-    ./broadcast.jl:1315: inlineable (ignore this one)
-    ./broadcast.jl:1315: inlineable (ignore this one)
-```
-
-This indicates a broadcasting operation in the `@testset` itself.
-Second, while it's a little dangerous (because `suggest` cannot entirely be trusted), you can filter these out:
-
-```julia
-julia> itrigsel = [itrig for itrig in itrigs if !isignorable(suggest(itrig))];
-
-julia> length(itrigs)
-222
-
-julia> length(itrigsel)
-71
-```
-
-While there is some risk of discarding triggers that provide clues about the origin of other triggers (e.g., they would have shown up in the same branch of the `trigger_tree`), the shorter list may help direct your attention to the "real" issues.
-
-## Results from the improvements
-
-An improved version of `OptimizeMe` can be found in `OptimizeMeFixed.jl` in the same directory.
-Let's see where we stand:
-
-```julia
-julia> tinf = @snoop_inference OptimizeMeFixed.main()
-3.14 is great
-2.718 is jealous
-...
- Object x: 7
-InferenceTimingNode: 0.888522055/1.496965222 on InferenceFrameInfo for Core.Compiler.Timings.ROOT() with 15 direct children
-```
-
-We've substantially shrunk the overall inclusive time from 2.68s to about 1.5s.
-Some of this came from our single `precompile` directive, for `warmup`.
-But even more of it came from limiting specialization (using `Container{Any}` instead of `Container`) and by making some operations easier on type inference (e.g., our changes for the `vcat` pipeline).
-
-On the next page, we'll wrap all this up with more explicit `precompile` directives.
diff --git a/docs/src/snoop_inference_parcel.md b/docs/src/snoop_inference_parcel.md
deleted file mode 100644
index 164b83d1..00000000
--- a/docs/src/snoop_inference_parcel.md
+++ /dev/null
@@ -1,215 +0,0 @@
-# [Using `@snoop_inference` results for precompilation](@id precompilation)
-
-Improving inferrability, specialization, and precompilability may sometimes feel like "eating your vegetables": really good for you, but it sometimes feels like work. (Depending on tastes, of course; I love vegetables.)
-While we've already gotten some payoff, now we're going to collect an additional reward for our hard work: the "dessert" of adding `precompile` directives.
-It's worth emphasizing that if we hadn't done the analysis of inference triggers and made improvements to our package, the benefit of adding `precompile` directives would have been substantially smaller.
-
-## Running work
-
-One of the simplest ways to force precompilation is to execute code.
 This has several advantages:
-
-- It is typically more robust across Julia versions
-- It automatically handles architecture differences like 32- vs 64-bit machines
-- It precompiles even the runtime-dispatch dependencies of a command
-  if the dependent methods are in the same package. This typically
-  results in much shorter precompile files than those that explicitly
-  use `precompile`.
-
-This approach looks like the following:
-
-```
-module MyPkg
-
-# All of your code that defines `MyPkg` goes here
-
-# precompile as the final step of the module definition:
-if ccall(:jl_generating_output, Cint, ()) == 1   # if we're precompiling the package
-    let
-        x = rand(Int, 5)
-        my_function(x)   # this will force precompilation of `my_function(::Vector{Int})`
-    end
-end
-
-end # module MyPkg
-```
-
-When your module is being precompiled (`[ Info: Precompiling MyPkg [...]`), just before the module "closes," your block of work will be executed. This forces compilation, and these compiled MethodInstances will be cached.
-
-After adding such directives, it's recommended to check the flamegraph again and see if there are any major omissions. You may need to add similar directives to some of the packages you depend on: precompilation is only effective if performed from the module that owns the method. (One advantage of `parcel` is that it automatically assigns `precompile` directives to the appropriate package.)
-
-!!! note
-    The work done inside this block is only executed when the package is
-    being precompiled, not when it is loaded with `using
-    MyPkg`. Precompilation essentially takes a "snapshot" of the
-    module; `using` just reloads that snapshot, it does not re-execute
-    all the commands used to produce that snapshot.
-
-    The only role for the `ccall` is to prevent this work from being done
-    if you've started Julia with `--compiled-modules=no`.
-
-!!! warning
-    This style of precompilation may be undesirable or impossible if
-    your statements have side effects like opening new windows. In such
-    cases, you may be able to use it for lower-level calls.
-
-## Parcel
-
-`precompile` directives have to be emitted by the module that owns the method.
-SnoopCompile comes with a tool, `parcel`, that splits out the "root-most" precompilable MethodInstances into their constituent modules.
-In our case, since we've made almost every call precompilable, this will typically correspond to the bottom row of boxes in the flame graph.
-In cases where you have some non-precompilable MethodInstances, they will include MethodInstances from higher up in the call tree.
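-
-In outline, a typical `parcel` workflow looks something like this (a sketch only; here `workload()` and the output path are stand-ins for whatever commands exercise your package and wherever you want the files written):
-
-```julia
-using SnoopCompileCore
-tinf = @snoop_inference workload()            # profile inference while running representative code
-using SnoopCompile
-ttot, pcs = SnoopCompile.parcel(tinf)         # assign precompilable MethodInstances to their owning modules
-SnoopCompile.write("/tmp/precompiles", pcs)   # emit one file of `precompile` directives per module
-```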
-
-Let's use `SnoopCompile.parcel` on `OptimizeMeFixed` in its current state:
-
-```julia
-julia> ttot, pcs = SnoopCompile.parcel(tinf);
-
-julia> ttot
-0.6084431670000001
-
-julia> pcs
-4-element Vector{Pair{Module, Tuple{Float64, Vector{Tuple{Float64, Core.MethodInstance}}}}}:
- Core => (0.000135179, [(0.000135179, MethodInstance for (NamedTuple{(:sizehint,), T} where T<:Tuple)(::Tuple{Int64}))])
- Base => (0.028383533000000002, [(3.2456e-5, MethodInstance for getproperty(::IOBuffer, ::Symbol)), (4.7474e-5, MethodInstance for ==(::Type, ::Nothing)), (5.7944e-5, MethodInstance for typeinfo_eltype(::Type)), (0.00039092299999999994, MethodInstance for show(::IOContext{IOBuffer}, ::Any)), (0.000433143, MethodInstance for IOContext(::IOBuffer, ::IOContext{Base.TTY})), (0.000484984, MethodInstance for Pair{Symbol, DataType}(::Any, ::Any)), (0.000742383, MethodInstance for print(::IOContext{Base.TTY}, ::String, ::String, ::Vararg{String, N} where N)), (0.001293705, MethodInstance for Pair(::Symbol, ::Type)), (0.0018914350000000003, MethodInstance for show(::IOContext{IOBuffer}, ::UInt16)), (0.010604793000000001, MethodInstance for show(::IOContext{IOBuffer}, ::Tuple{String, Int64})), (0.012404293, MethodInstance for show(::IOContext{IOBuffer}, ::Vector{Int64}))])
- Base.Ryu => (0.15733664599999997, [(0.05721630600000001, MethodInstance for writeshortest(::Vector{UInt8}, ::Int64, ::Float32, ::Bool, ::Bool, ::Bool, ::Int64, ::UInt8, ::Bool, ::UInt8, ::Bool, ::Bool)), (0.10012033999999997, MethodInstance for show(::IOContext{IOBuffer}, ::Float32))])
- Main.OptimizeMeFixed => (0.4204474180000001, [(0.4204474180000001, MethodInstance for main())])
-```
-
-This tells us that a total of ~0.6s was spent on inference.
-`parcel` discovered precompilable MethodInstances for four modules, `Core`, `Base`, `Base.Ryu`, and `OptimizeMeFixed`.
-These are listed in increasing order of inference time.
-
-Let's look specifically at `OptimizeMeFixed`, since that's under our control:
-
-```julia
-julia> pcmod = pcs[end]
-Main.OptimizeMeFixed => (0.4204474180000001, Tuple{Float64, Core.MethodInstance}[(0.4204474180000001, MethodInstance for main())])
-
-julia> tmod, tpcs = pcmod.second;
-
-julia> tmod
-0.4204474180000001
-
-julia> tpcs
-1-element Vector{Tuple{Float64, Core.MethodInstance}}:
- (0.4204474180000001, MethodInstance for main())
-```
-
-0.42s of that time is due to `OptimizeMeFixed`, and `parcel` discovered a single MethodInstance to precompile, `main()`.
-
-We could look at the other modules (packages) similarly.
-
-## SnoopCompile.write
-
-You can generate files that contain ready-to-use `precompile` directives using `SnoopCompile.write`:
-
-```julia
-julia> SnoopCompile.write("/tmp/precompiles_OptimizeMe", pcs)
-Core: no precompile statements out of 0.000135179
-Base: precompiled 0.026194226 out of 0.028383533000000002
-Base.Ryu: precompiled 0.15733664599999997 out of 0.15733664599999997
-Main.OptimizeMeFixed: precompiled 0.4204474180000001 out of 0.4204474180000001
-```
-
-!!! tip
-    For packages that support just Julia 1.6 and higher, you may be able to slim down the precompile file by
-    adding `has_bodyfunction=true` to the arguments for `write`.
-    This setting applies to all packages in `pcs`, so you may need to call `write` twice (with both `false` and `true`) and select the appropriate precompile file for each package.
-
-You'll now find a directory `/tmp/precompiles_OptimizeMe`, and inside you'll find three files, for `Base`, `Base.Ryu`, and `OptimizeMeFixed`, respectively.
-The contents of the last of these should be recognizable:
-
-```julia
-function _precompile_()
-    ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
-    Base.precompile(Tuple{typeof(main)})   # time: 0.4204474
-end
-```
-
-The first `ccall` line ensures we only pay the cost of running these `precompile` directives if we're building the package; this is relevant mostly if you're running Julia with `--compiled-modules=no`, which can be a convenient way to disable precompilation and examine packages in their "native state."
-(It would also matter if you've set `__precompile__(false)` at the top of your module, but if so why are you reading this?)
-
-This file is ready to be moved into the `OptimizeMe` repository and `include`d into your module definition.
-Since we added `warmup` manually, you could consider moving `precompile(warmup, ())` into this function.
-
-In general, it's recommended to run precompilation from inside a block
-
-```julia
-if Base.VERSION >= v"1.4.2"
-    include("precompile.jl")
-    _precompile_()
-end
-```
-
-because earlier versions of Julia occasionally crashed on certain precompile directives.
-It's also perfectly fine to omit the function call, and use
-
-```julia
-if Base.VERSION >= v"1.4.2"
-    Base.precompile(Tuple{typeof(main)})   # time: 0.4204474
-    precompile(warmup, ())
-end
-```
-
-directly in the `OptimizeMeFixed` module, usually as the last block of the module definition.
-
-You might also consider submitting some of the other files (or their `precompile` directives) to the packages you depend on.
-In some cases, the specific argument type combinations may be too "niche" to be worth specializing; one such case is found here, a `show` method for `Tuple{String, Int64}` for `Base`.
-But in other cases, these may be very worthy additions to the package.
-
-## Final results
-
-Let's check out the final results of adding these `precompile` directives to `OptimizeMeFixed`.
-First, let's build both modules as precompiled packages:
-
-```julia
-julia> push!(LOAD_PATH, ".")
-4-element Vector{String}:
- "@"
- "@v#.#"
- "@stdlib"
- "."
-
-julia> using OptimizeMe
-[ Info: Precompiling OptimizeMe [top-level]
-
-julia> using OptimizeMeFixed
-[ Info: Precompiling OptimizeMeFixed [top-level]
-```
-
-Now in fresh sessions,
-
-```julia
-julia> @time (using OptimizeMe; OptimizeMe.main())
-3.14 is great
-2.718 is jealous
-⋮
-Object x: 7
-  3.159908 seconds (10.63 M allocations: 582.091 MiB, 5.19% gc time, 99.67% compilation time)
-```
-
-versus
-
-```julia
-julia> @time (using OptimizeMeFixed; OptimizeMeFixed.main())
-3.14 is great
-2.718 is jealous
-⋮
- Object x: 7
-  1.840034 seconds (5.38 M allocations: 289.402 MiB, 5.03% gc time, 96.70% compilation time)
-```
-
-We've cut down on the latency by nearly a factor of two.
-Moreover, if Julia someday caches generated code, we're well-prepared to capitalize on the benefits, because the same improvements in "code ownership" are almost certain to pay dividends there too.
-
-If you inspect the results, you may sometimes suffer a few disappointments: some methods that we expected to precompile don't "take."
-At the moment there appears to be a small subset of methods that fail to precompile, and the reasons are not yet widely understood.
-At present, the best advice seems to be to comment out any precompile directives that don't "take," since otherwise they increase the build time for the package without material benefit.
-These failures may be addressed in future versions of Julia.
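-
-One way to catch a subset of these failures is to check the return value of `precompile`, which is `false` when the requested signature cannot be compiled at all (a directive can also return `true` yet still fail to persist in the cache, so this check is not exhaustive). A minimal sketch, using the two calls from this demo:
-
-```julia
-# Warn about any precompile directive that fails outright
-for (f, argtypes) in ((main, ()), (warmup, ()))
-    precompile(f, argtypes) || @warn "precompilation failed" f argtypes
-end
-```
-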
-It's also worth appreciating how much we have succeeded in reducing latency, with the awareness that we may be able to get even greater benefit in the future.
-
-## Summary
-
-`@snoop_inference` collects enough data to learn which methods are triggering inference, how heavily methods are being specialized, and so on.
-Examining your code from the standpoint of inference and specialization may be unfamiliar at first, but like other aspects of package development (testing, documentation, and release compatibility management), it can lead to significant improvements in the quality of life for you and your users.
-By optimizing your packages and then adding `precompile` directives, you can often cut down substantially on latency.
diff --git a/docs/src/snoop_invalidations.md b/docs/src/snoop_invalidations.md
deleted file mode 100644
index 5daa32f7..00000000
--- a/docs/src/snoop_invalidations.md
+++ /dev/null
@@ -1,649 +0,0 @@
-# [Snooping on and fixing invalidations: `@snoop_invalidations`](@id invalidations)
-
-!!! compat
-    `@snoop_invalidations` is available on `Julia 1.6.0-DEV.154` or above, but the results can be relevant for all Julia versions.
-
-Invalidations occur when there is a danger that new methods would supersede older methods in previously-compiled code.
-For safety, Julia's compiler *invalidates* that old code, marking it as unsuitable for use; the next time you call
-that method, it will have to be compiled again from scratch. (If no one ever needs that method again, there is no
-major loss.)
-
-Some packages define new methods that force invalidation of previously-compiled code. If your package, or any of your
-dependencies, triggers many invalidations, it has several bad effects:
-
-- any invalidated methods you need for the functionality in your package will have to be recompiled.
-  This will lead to a direct (and occasionally large) slowdown for your package.
-- invalidations by your dependencies (packages you rely on) can block precompilation of methods in your package,
-  preventing you from taking advantage of some of the other features of SnoopCompile.
-- even if you don't need the invalidated code for your package, any invalidations triggered by your package
-  might harm packages that depend on yours.
-
-For these reasons, it's advisable to begin by analyzing invalidations.
-On recent Julia versions, most packages do not trigger a large number of invalidations; often, all that is needed is a quick glance at invalidations before moving on to the next step.
-Occasionally, checking for invalidations can save you a lot of confusion and frustration at later steps, so it is well worth taking a look.
-
-Readers who want more background and context are encouraged to read [this blog post](https://julialang.org/blog/2020/08/invalidations/).
-
-!!! note
-    Invalidations occur only for compiled code; method definitions themselves cannot be invalidated.
-    As a consequence, it's possible to have latent invalidation risk; this risk can become exposed
-    if you use some intermediate functionality before loading your package, or if your dependencies someday add `precompile`
-    directives. So even if you've checked for invalidations previously, sometimes it's worth taking a fresh look.
-
-## Recording invalidations
-
-```@meta
-DocTestFilters = r"(in|@) [a-zA-Z0-9]* (REPL\[\d+\]|none):\d+"
-DocTestSetup = quote
-    using SnoopCompile
-end
-```
-
-To record the invalidations caused by defining new methods, use [`@snoop_invalidations`](@ref).
-`@snoop_invalidations` is exported by `SnoopCompile`, but the recommended approach is to record invalidations using the minimalistic `SnoopCompileCore` package, and then load `SnoopCompile` to do the analysis. -_**Remember**_ to run julia with the `--startup-file="no"` flag set, if you load packages such as [`Revise`](https://github.com/timholy/Revise.jl) in your startup file. -Otherwise invalidations relating to those packages will also show up. - -```julia -using SnoopCompileCore -invalidations = @snoop_invalidations begin - # package loads and/or method definitions that might invalidate other code -end -using SnoopCompile # now that we've collected the data, load the complete package to analyze the results -``` - -!!! note - `SnoopCompileCore` was split out from `SnoopCompile` to reduce the risk of invalidations from loading `SnoopCompile` itself. - Once a `MethodInstance` gets invalidated, it doesn't show up in future `@snoop_invalidations` results, so anything that - gets invalidated in order to provide `@snoop_invalidations` would be omitted from the results. - `SnoopCompileCore` is a very small package with no dependencies and which avoids extending any of Julia's own functions, - so it cannot invalidate any other code. - -## Analyzing invalidations - -### A first example - -We'll walk through this process with the following example: - -```jldoctest invalidations -julia> f(::Real) = 1; - -julia> callf(container) = f(container[1]); - -julia> call2f(container) = callf(container); -``` - -Because code doesn't get compiled until it gets run, and invalidations only affect compiled code, let's run this with three different container types: - -```jldoctest invalidations -julia> c64 = [1.0]; c32 = [1.0f0]; cabs = AbstractFloat[1.0]; # Vector{Float64}, Vector{Float32}, and Vector{AbstractFloat}, respectively - -julia> call2f(c64) -1 - -julia> call2f(c32) -1 - -julia> call2f(cabs) -1 -``` - -!!! warning - If you're following along, be sure you actually execute these methods, or you won't obtain the results below. - -Now we'll define a new `f` method, one specialized for `Float64`. -So we can see the consequences for the compiled code, we'll make this definition while snooping on the compiler with `@snoop_invalidations`: - -```jldoctest invalidations -julia> using SnoopCompileCore - -julia> invalidations = @snoop_invalidations f(::Float64) = 2; -``` - -As should be apparent, running `call2f` on `c64` should produce a different result than formerly, so Julia certainly -needs to invalidate that code. Let's see what that looks like. The simplest thing we can do is list or count invalidations: - -```jldoctest invalidations -julia> using SnoopCompile - -julia> length(uinvalidated(invalidations)) # collect the unique MethodInstances & count them -6 -``` - -The length of this set is your simplest insight into the extent of invalidations triggered by this method definition. - -If you want to fix invalidations, it's crucial to know *why* certain `MethodInstance`s were invalidated. 
-For that, it's best to use a tree structure, in which children are invalidated because their parents get invalidated: - -```jldoctest invalidations; filter=[r"(in Main at|@ Main) (REPL\[\d+\]|none)"] -julia> trees = invalidation_trees(invalidations) -1-element Vector{SnoopCompile.MethodInvalidations}: - inserting f(::Float64) @ Main none:1 invalidated: - backedges: 1: superseding f(::Real) @ Main none:1 with MethodInstance for f(::Float64) (2 children) - 2: superseding f(::Real) @ Main none:1 with MethodInstance for f(::AbstractFloat) (2 children) - 1 mt_cache -``` - -The output, `trees`, is a vector of `MethodInvalidations`, a data type defined in `SnoopCompile`; each of these is the set of invalidations triggered by a particular method definition. -In this case, we only defined one method, so we can get at most one `MethodInvalidation`. -`@snoop_invalidations using SomePkg` might result in a list of such objects, each connected to a particular method defined in a particular package (either `SomePkg` itself or one of its dependencies). - -In this case, "`inserting f(::Float64)`" indicates that we added a method with signature `f(::Float64)`, and that this method triggered invalidations. -(Invalidations can also be triggered by method deletion, although this should not happen in typical usage.) -Next, notice the `backedges` line, and the fact that there are two items listed for it. -This indicates that there were two proximal triggers for the invalidation, both of which superseded the method `f(::Real)`. -One of these had been compiled specifically for `Float64`, due to our `call2f(c64)`. -The other had been compiled specifically for `AbstractFloat`, due to our `call2f(cabs)`. - -You can look at these invalidation trees in greater detail: - -```jldoctest invalidations -julia> method_invalidations = trees[1]; # invalidations stemming from a single method - -julia> root = method_invalidations.backedges[1] # get the first triggered invalidation -MethodInstance for f(::Float64) at depth 0 with 2 children - -julia> show(root) -MethodInstance for f(::Float64) (2 children) - MethodInstance for callf(::Vector{Float64}) (1 children) - ⋮ - -julia> show(root; minchildren=0) -MethodInstance for f(::Float64) (2 children) - MethodInstance for callf(::Vector{Float64}) (1 children) - MethodInstance for call2f(::Vector{Float64}) (0 children) -``` - -The indentation here reveals that `call2f` called `callf` which called `f`, -and shows the entire "chain" of invalidations triggered by this method definition. -Examining `root2 = method_invalidations.backedges[2]` yields similar results, but for `Vector{AbstractFloat}`. - -### `mt_backedges` invalidations - -`MethodInvalidations` can have a second field, `mt_backedges`. -These are invalidations triggered via the `MethodTable` for a particular function. -When extracting `mt_backedges`, in addition to a root `MethodInstance` these also indicate a particular signature that triggered the invalidation. -We can illustrate this by returning to the `call2f` example above: - -```jldoctest invalidations; filter=[r"(in Main at|@ Main) (REPL\[\d+\]|none)"] -julia> call2f(["hello"]) -ERROR: MethodError: no method matching f(::String) -[...] 
- -julia> invalidations = @snoop_invalidations f(::AbstractString) = 2; - -julia> trees = invalidation_trees(invalidations) -1-element Vector{SnoopCompile.MethodInvalidations}: - inserting f(::AbstractString) @ Main none:1 invalidated: - mt_backedges: 1: signature Tuple{typeof(f), String} triggered MethodInstance for callf(::Vector{String}) (1 children) - - -julia> sig, root = trees[1].mt_backedges[end]; - -julia> sig -Tuple{typeof(f), String} - -julia> root -MethodInstance for callf(::Vector{String}) at depth 0 with 1 children -``` - -You can see that the invalidating signature, `f(::String)`, is more specific than the signature of the defined method, but that it is what was minimally needed by `callf(::Vector{String})`. - -`mt_backedges` invalidations often reflect "unhandled" conditions in methods that have already been compiled. - -### A more complex example - -The structure of these trees can be considerably more complicated. For example, if `callf` -also got called by some other method, and that method had also been executed (forcing it to be compiled), -then `callf` would have multiple children. -This is often seen with more complex, real-world tests. -As a medium-complexity example, try the following: - -!!! info - Any demonstration involving real-world packages might be altered from what is shown here by new releases of the relevant packages. - -```julia -julia> using Revise - -julia> using SnoopCompileCore - -julia> invalidations = @snoop_invalidations using FillArrays; - -julia> using SnoopCompile - -julia> trees = invalidation_trees(invalidations) -3-element Vector{SnoopCompile.MethodInvalidations}: - inserting all(f::Function, x::FillArrays.AbstractFill) in FillArrays at /pathto/.julia/packages/FillArrays/NjFh2/src/FillArrays.jl:556 invalidated: - backedges: 1: superseding all(f::Function, a::AbstractArray; dims) in Base at reducedim.jl:880 with MethodInstance for all(::Base.var"#388#389"{_A} where _A, ::AbstractArray) (3 children) - 2: superseding all(f, itr) in Base at reduce.jl:918 with MethodInstance for all(::Base.var"#388#389"{_A} where _A, ::Any) (3 children) - - inserting any(f::Function, x::FillArrays.AbstractFill) in FillArrays at /pathto/.julia/packages/FillArrays/NjFh2/src/FillArrays.jl:555 invalidated: - backedges: 1: superseding any(f::Function, a::AbstractArray; dims) in Base at reducedim.jl:877 with MethodInstance for any(::typeof(ismissing), ::AbstractArray) (1 children) - 2: superseding any(f, itr) in Base at reduce.jl:871 with MethodInstance for any(::typeof(ismissing), ::Any) (1 children) - 3: superseding any(f, itr) in Base at reduce.jl:871 with MethodInstance for any(::LoweredCodeUtils.var"#11#12"{_A} where _A, ::Any) (2 children) - 4: superseding any(f::Function, a::AbstractArray; dims) in Base at reducedim.jl:877 with MethodInstance for any(::LoweredCodeUtils.var"#11#12"{_A} where _A, ::AbstractArray) (4 children) - - inserting broadcasted(::Base.Broadcast.DefaultArrayStyle{N}, op, r::FillArrays.AbstractFill{T,N,Axes} where Axes) where {T, N} in FillArrays at /pathto/.julia/packages/FillArrays/NjFh2/src/fillbroadcast.jl:8 invalidated: - backedges: 1: superseding broadcasted(::S, f, args...) where S<:Base.Broadcast.BroadcastStyle in Base.Broadcast at broadcast.jl:1265 with MethodInstance for broadcasted(::Base.Broadcast.BroadcastStyle, ::typeof(JuliaInterpreter._Typeof), ::Any) (1 children) - 2: superseding broadcasted(::S, f, args...) 
where S<:Base.Broadcast.BroadcastStyle in Base.Broadcast at broadcast.jl:1265 with MethodInstance for broadcasted(::Base.Broadcast.BroadcastStyle, ::typeof(string), ::AbstractArray) (177 children) -``` - -Your specific results may differ from this, depending on which version of Julia and of packages you are using. -In this case, you can see that three methods (one for `all`, one for `any`, and one for `broadcasted`) triggered invalidations. -Perusing this list, you can see that methods in `Base`, `LoweredCodeUtils`, and `JuliaInterpreter` (the latter two were loaded by `Revise`) got invalidated by methods defined in `FillArrays`. - -The most consequential ones (the ones with the most children) are listed last, and should be where you direct your attention first. -That last entry looks particularly problematic, so let's extract it: - -```julia -julia> methinvs = trees[end]; - -julia> root = methinvs.backedges[end] -MethodInstance for broadcasted(::Base.Broadcast.BroadcastStyle, ::typeof(string), ::AbstractArray) at depth 0 with 177 children - -julia> show(root; maxdepth=10) -MethodInstance for broadcasted(::Base.Broadcast.BroadcastStyle, ::typeof(string), ::AbstractArray) (177 children) - MethodInstance for broadcasted(::typeof(string), ::AbstractArray) (176 children) - MethodInstance for #unpack#104(::Bool, ::typeof(Pkg.PlatformEngines.unpack), ::String, ::String) (175 children) - MethodInstance for (::Pkg.PlatformEngines.var"#unpack##kw")(::NamedTuple{(:verbose,),Tuple{Bool}}, ::typeof(Pkg.PlatformEngines.unpack), ::String, ::String) (174 children) - MethodInstance for #download_verify_unpack#109(::Nothing, ::Bool, ::Bool, ::Bool, ::Bool, ::typeof(Pkg.PlatformEngines.download_verify_unpack), ::String, ::Nothing, ::String) (165 children) - MethodInstance for (::Pkg.PlatformEngines.var"#download_verify_unpack##kw")(::NamedTuple{(:ignore_existence, :verbose),Tuple{Bool,Bool}}, ::typeof(Pkg.PlatformEngines.download_verify_unpack), ::String, ::Nothing, ::String) (33 children) - MethodInstance for (::Pkg.Artifacts.var"#39#40"{Bool,String,Nothing})(::String) (32 children) - MethodInstance for create_artifact(::Pkg.Artifacts.var"#39#40"{Bool,String,Nothing}) (31 children) - MethodInstance for #download_artifact#38(::Bool, ::Bool, ::typeof(Pkg.Artifacts.download_artifact), ::Base.SHA1, ::String, ::Nothing) (30 children) - MethodInstance for (::Pkg.Artifacts.var"#download_artifact##kw")(::NamedTuple{(:verbose, :quiet_download),Tuple{Bool,Bool}}, ::typeof(Pkg.Artifacts.download_artifact), ::Base.SHA1, ::String, ::Nothing) (23 children) - MethodInstance for (::Pkg.Artifacts.var"#download_artifact##kw")(::NamedTuple{(:verbose, :quiet_download),Tuple{Bool,Bool}}, ::typeof(Pkg.Artifacts.download_artifact), ::Base.SHA1, ::String) (22 children) - ⋮ - ⋮ - MethodInstance for (::Pkg.PlatformEngines.var"#download_verify_unpack##kw")(::NamedTuple{(:ignore_existence,),Tuple{Bool}}, ::typeof(Pkg.PlatformEngines.download_verify_unpack), ::String, ::Nothing, ::String) (130 children) - MethodInstance for (::Pkg.Types.var"#94#97"{Pkg.Types.Context,String,Pkg.Types.RegistrySpec})(::String) (116 children) - MethodInstance for #mktempdir#21(::String, ::typeof(mktempdir), ::Pkg.Types.var"#94#97"{Pkg.Types.Context,String,Pkg.Types.RegistrySpec}, ::String) (115 children) - MethodInstance for mktempdir(::Pkg.Types.var"#94#97"{Pkg.Types.Context,String,Pkg.Types.RegistrySpec}, ::String) (114 children) - MethodInstance for mktempdir(::Pkg.Types.var"#94#97"{Pkg.Types.Context,String,Pkg.Types.RegistrySpec}) (113 children) - 
MethodInstance for clone_or_cp_registries(::Pkg.Types.Context, ::Vector{Pkg.Types.RegistrySpec}, ::String) (112 children) - ⋮ - ⋮ - ⋮ -``` - -Here you can see a much more complex branching structure. -From this, you can see that methods in `Pkg` are the most significantly affected; -you could expect that loading `FillArrays` might slow down your next `Pkg` operation (perhaps depending on which operation you choose) executed in this same session. - -Again, if you're following along, it's possible that you'll see something quite different, if subsequent development has protected `Pkg` against this form of invalidation. - -## Filtering invalidations - -!!! info - The experimental tool [`SnoopCompile.precompile_blockers`](@ref) may be - able to help you identify just the invalidations you need to fix - for your use-case. - -Some method definitions trigger widespread invalidation. -If you don't have time to fix all of them, you might want to focus on a specific set of invalidations. -For instance, you might be the author of `PkgA` and you've noted that loading `PkgB` invalidates a lot of `PkgA`'s code. -In that case, you might want to find just those invalidations triggered in your package. -You can find them with [`filtermod`](@ref): - -```julia -trees = invalidation_trees(@snoop_invalidations using PkgB) -ftrees = filtermod(PkgA, trees) -``` - -By default, `filtermod` only selects trees where the root method was defined in the specified module. -`filtermod(PkgA, trees; recursive=true)` will return all trees that lead to any method defined -in `PkgA`. - -A more selective yet exhaustive tool is [`findcaller`](@ref), which allows you to find the path through the trees to a particular method: - -```julia -m = @which f(data) # look for the "path" that invalidates this method -f(data) # run once to force compilation -using SnoopCompile -trees = invalidation_trees(@snoop_invalidations using SomePkg) -invs = findcaller(m, trees) # select the branch that invalidated a compiled instance of `m` -``` - -When you don't know which method to choose, but know an operation that got slowed down by loading `SomePkg`, you can use `@snoopi` to find methods that needed to be recompiled. See [`findcaller`](@ref) for further details. - - -## Fixing invalidations - -In addition to the text below, there is a -[video](https://www.youtube.com/watch?v=7VbXbI6OmYo) illustrating many -of the same package features. The video also walks through a real-world -example fixing invalidations that stemmed from inference problems in -some of `Pkg`'s code. - -### `ascend` - -SnoopCompile, partnering with the remarkable [Cthulhu.jl](https://github.com/JuliaDebug/Cthulhu.jl), -provides a tool called `ascend` to simplify diagnosing and fixing invalidations. -To demonstrate this tool, let's use it on our test methods defined above. -For best results, you'll want to copy those method definitions into a file: - -```julia -f(::Real) = 1 -callf(container) = f(container[1]) -call2f(container) = callf(container) - -c64 = [1.0]; c32 = [1.0f0]; cabs = AbstractFloat[1.0]; -call2f(c64) -call2f(c32) -call2f(cabs) - -using SnoopCompileCore -invalidations = @snoop_invalidations f(::Float64) = 2 -using SnoopCompile -trees = invalidation_trees(invalidations) -method_invalidations = trees[1] -``` - -and `include` it into a fresh session. (The full functionality of `ascend` doesn't work for methods defined at the REPL, but does if the methods are defined in a file.) -In this demo, I called that file `/tmp/snoop_invalidations.jl`. 
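-
-Then, in the fresh session, loading it is just:
-
-```julia
-julia> include("/tmp/snoop_invalidations.jl");
-```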
- - -We start with - -```julia -julia> root = method_invalidations.backedges[end] -MethodInstance for f(::AbstractFloat) at depth 0 with 2 children -``` - -(It's common to start from the last element of `backedges` or `mt_backedges` since these have the largest number of children and are therefore most consequential.) -Then: - -```julia -julia> ascend(root) -Choose a call for analysis (q to quit): - > f(::AbstractFloat) - callf(::Vector{AbstractFloat}) - call2f(::Vector{AbstractFloat}) -``` - -This is an interactive menu: press the down arrow to go down, the up arrow to go up, and `Enter` to select an item for more detailed analysis. -In large trees, you may also want to "fold" nodes of the tree (collapsing it so that the children are no longer displayed), particularly if you are working your way through a long series of invalidations and want to hide ones you've already dealt with. You toggle folding using the space bar, and folded nodes are printed with a `+` in front of them. - -For example, if we press the down arrow once, we get - -```julia -julia> ascend(root) -Choose a call for analysis (q to quit): - f(::AbstractFloat) - > callf(::Vector{AbstractFloat}) - call2f(::Vector{AbstractFloat}) -``` - -Now hit `Enter` to select it: - -```julia -Choose caller of MethodInstance for f(::AbstractFloat) or proceed to typed code: - > "/tmp/snoop_invalidations.jl", callf: lines [2] - Browse typed code -``` - -This is showing you another menu, with only two options (a third is to go back by hitting `q`). -The first entry shows you the option to open the "offending" source file in `callf` at the position of the call to the parent node of `callf`, which in this case is `f`. -(Sometimes there will be more than one call to the parent within the method, in which case instead of showing `[1]` it might show `[1, 17, 39]` indicating each separate location.) -Selecting this option, when available, is typically the best way to start because you can sometimes resolve the problem just by inspection of the source. - -If you hit the down arrow - -```julia -Choose caller of MethodInstance for f(::AbstractFloat) or proceed to typed code: - "/tmp/snoop_invalidations.jl", callf: lines [2] - > Browse typed code -``` - -and then hit `Enter`, this is what you see: - -```julia -│ ─ %-1 = invoke callf(::Vector{AbstractFloat})::Int64 -Variables - #self#::Core.Const(callf, false) - container::Vector{AbstractFloat} - -Body::Int64 - @ /tmp/snoop_invalidations.jl:2 within `callf' -1 ─ %1 = Base.getindex(container, 1)::AbstractFloat -│ %2 = Main.f(%1)::Int64 -└── return %2 - -Select a call to descend into or ↩ to ascend. [q]uit. [b]ookmark. -Toggles: [o]ptimize, [w]arn, [d]ebuginfo, [s]yntax highlight for Source/LLVM/Native. -Show: [S]ource code, [A]ST, [L]LVM IR, [N]ative code -Advanced: dump [P]arams cache. - - • %1 = invoke getindex(::Vector{AbstractFloat},::Int64)::AbstractFloat - %2 = call #f(::AbstractFloat)::Int64 - ↩ -``` - -This is output from Cthulhu, and you should see its documentation for more information. -(See also [this video](https://www.youtube.com/watch?v=qf9oA09wxXY).) -While it takes a bit of time to master Cthulhu, it is an exceptionally powerful tool for diagnosing and fixing inference issues. - -### "Dead ends": finding runtime callers with MethodAnalysis - -When a call is made by runtime dispatch and the world of available methods to handle the call does not narrow the types -beyond what is known to the caller, the call-chain terminates. 
-Here is a real-world example (one that may already be "fixed" by the time you read this) from analyzing invalidations triggered by specializing `Base.unsafe_convert(::Type{Ptr{T}}, ::Base.RefValue{S})` for specific types `S` and `T`: - -``` -julia> ascend(root) -Choose a call for analysis (q to quit): - > unsafe_convert(::Type{Ptr{Nothing}}, ::Base.RefValue{_A} where _A) - _show_default(::IOBuffer, ::Any) - show_default(::IOBuffer, ::Function) - show_function(::IOBuffer, ::Function, ::Bool) - print(::IOBuffer, ::Function) - show_default(::IOBuffer, ::ProcessFailedException) - show(::IOBuffer, ::ProcessFailedException) - print(::IOBuffer, ::ProcessFailedException) - show_default(::IOBuffer, ::Sockets.IPAddr) - show(::IOBuffer, ::Sockets.IPAddr) -``` - -Unfortunately for our investigations, none of these "top level" callers have defined backedges. (Overall, it's very fortunate that they don't, in that runtime dispatch without backedges avoids any need to invalidate the caller; the alternative would be extremely long chains of completely unnecessary invalidation, which would have many undesirable consequences.) - -If you want to fix such "short chains" of invalidation, one strategy is to identify callers by brute force search enabled by the [MethodAnalysis.jl](https://github.com/timholy/MethodAnalysis.jl) package. -For example, one can discover the caller of `show(::IOBuffer, ::Sockets.IPAddr)` with - -```julia -using MethodAnalysis # best from a fresh Julia session -mis = methodinstances(); # collect all *existing* MethodInstances (any future compilation will be ignored) -# Create a predicate that finds these argument types -using Sockets -argmatch(typs) = length(typs) >= 2 && typs[1] === IOBuffer && typs[2] === Sockets.IPAddr -# Find any callers -callers = findcallers(show, argmatch, mis) -``` - -which yields a single hit in `print(::IOBuffer, ::IPAddr)`. -This too lacks any backedges, so a second application `findcallers(print, argmatch, mis)` links to `print_to_string(::IPAddr)`. -This MethodInstance has a backedge to `string(::IPAddr)`, which has backedges to the method `Distributed.connect_to_worker(host::AbstractString, port::Integer)`. -A bit of digging shows that this calls `Sockets.getaddrinfo` to look up an IP address, and this is inferred to return an `IPAddr` but the concrete type is unknown. -A potential fix for this situation is described below. - -This does not always work; for example, trying something similar for `ProcessExitedException` fails, likely because the call was made with even less type information. -We might be able to find it with a more general predicate, for example - -``` -argmatch(typs) = length(typs) >= 2 && typs[1] === IOBuffer && ProcessExitedException <: typs[2] -``` - -but this returns a lot of candidates and it is difficult to guess which of these might be the culprit(s). -Finally, `findcallers` only detects method calls that are "hard-wired" into type-inferred code; if the call we're seeking was made from toplevel, or if the function itself was a runtime variable, there is no hope that `findcallers` will detect it. - -### Tips for fixing invalidations - -Invalidations occur in situations like our `call2f(c64)` example, where we changed our mind about what value `f` should return for `Float64`. -Julia could not have returned the newly-correct answer without recompiling the call chain. - -Aside from cases like these, most invalidations occur whenever new types are introduced, -and some methods were previously compiled for abstract types. 
-In some cases, this is inevitable, and the resulting invalidations simply need to be accepted as a consequence of a dynamic, updatable language. -(As recommended above, you can often minimize invalidations by loading all your code at the beginning of your session, before triggering the compilation of more methods.) -However, in many circumstances an invalidation indicates an opportunity to improve code. -In our first example, note that the call `call2f(c32)` did not get invalidated: this is because the compiler -knew all the specific types, and new methods did not affect any of those types. -The main tips for writing invalidation-resistant code are: - -- use [concrete types](https://docs.julialang.org/en/v1/manual/performance-tips/#man-performance-abstract-container-1) wherever possible -- write inferrable code -- don't engage in [type-piracy](https://docs.julialang.org/en/v1/manual/style-guide/#Avoid-type-piracy-1) (our `c64` example is essentially like type-piracy, where we redefined behavior for a pre-existing type) - -Since these tips also improve performance and allow programs to behave more predictably, -these guidelines are not intrusive. -Indeed, searching for and eliminating invalidations can help you improve the quality of your code. - -#### Fixing `Core.Box` - -[Julia issue 15276](https://github.com/JuliaLang/julia/issues/15276) is one of the more surprising forms of inference failure; it is the most common cause of a `Core.Box` annotation. -If other variables depend on the `Box`ed variable, then a single `Core.Box` can lead to widespread inference problems. -For this reason, these are also among the first inference problems you should tackle. - -Read [this explanation of why this happens and what you can do to fix it](https://docs.julialang.org/en/v1/manual/performance-tips/#man-performance-captured). -If you are directed to find `Core.Box` inference triggers via [`suggest`](@ref), you may need to explore around the call site a bit-- -the inference trigger may be in the closure itself, but the fix needs to go in the method that creates the closure. - -Use of `ascend` is highly recommended for fixing `Core.Box` inference failures. - -#### Adding type annotations - -In cases where invalidations occur, but you can't use concrete types (there are indeed many valid uses of `Vector{Any}`), -you can often prevent the invalidation using some additional knowledge. -One common example is extracting information from an [`IOContext`](https://docs.julialang.org/en/v1/manual/networking-and-streams/#IO-Output-Contextual-Properties-1) structure, which is roughly defined as - -```julia -struct IOContext{IO_t <: IO} <: AbstractPipe - io::IO_t - dict::ImmutableDict{Symbol, Any} -end -``` - -There are good reasons to use a value-type of `Any`, but that makes it impossible for the compiler to infer the type of any object looked up in an `IOContext`. -Fortunately, you can help! -For example, the documentation specifies that the `:color` setting should be a `Bool`, and since it appears in documentation it's something we can safely enforce. -Changing - -``` -iscolor = get(io, :color, false) -``` - -to - -``` -iscolor = get(io, :color, false)::Bool # assert that the rhs is Bool-valued -``` - -will throw an error if it isn't a `Bool`, and this allows the compiler to take advantage of the type being known in subsequent operations. - -We've already seen another relevant example above, where `getaddrinfo(::AbstractString)` was inferred to return an `IPAddr`, which is an abstract type. 
- -Since there are only two such types supported by the Sockets library, -one potential fix is to annotate the returned value from `getaddrinfo` to be `Union{IPv4,IPv6}`. -This will allow Julia to [union-split](https://julialang.org/blog/2018/08/union-splitting/) future operations made using the returned value. - -Before turning to a more complex example, it's worth noting that this trick applied to field accesses of abstract types is often one of the simplest ways to fix widespread inference problems. -This is such an important case that it is described in the section below. - -As a more detailed example, suppose you're writing code that parses Julia's `Expr` type: - -```julia -julia> ex = :(Array{Float32,3}) -:(Array{Float32, 3}) - -julia> dump(ex) -Expr - head: Symbol curly - args: Array{Any}((3,)) - 1: Symbol Array - 2: Symbol Float32 - 3: Int64 3 -``` - -`ex.args` is a `Vector{Any}`. -However, for a `:curly` expression only certain types will be found among the arguments; you could write key portions of your code as - -```julia -a = ex.args[2] -if a isa Symbol - # inside this block, Julia knows `a` is a Symbol, and so methods called on `a` will be resistant to invalidation - foo(a) -elseif a isa Expr && length((a::Expr).args) > 2 - a::Expr # sometimes you have to help inference by adding a type-assert - x = bar(a) # `bar` is now resistant to invalidation -elseif a isa Integer - # even though you've not made this fully-inferrable, you've at least reduced the scope for invalidations - # by limiting the subset of `foobar` methods that might be called - y = foobar(a) -end -``` - -Other tricks include replacing broadcasting on `v::Vector{Any}` with `Base.mapany(f, v)`--`mapany` avoids trying to narrow the type of `f(v[i])` and just assumes it will be `Any`, thereby avoiding invalidations of many `convert` methods. - -Adding type-assertions and fixing inference problems are the most common approaches for fixing invalidations. -You can discover these manually, but using Cthulhu is highly recommended. - -#### Inferrable field access for abstract types - -When invalidations happen for methods that manipulate fields of abstract types, often there is a simple solution: create an "interface" for the abstract type specifying that certain fields must have certain types. -Here's an example: - -``` -abstract type AbstractDisplay end - -struct Monitor <: AbstractDisplay - height::Int - width::Int - maker::String -end - -struct Phone <: AbstractDisplay - height::Int - width::Int - maker::Symbol -end - -function Base.show(@nospecialize(d::AbstractDisplay), x) - str = string(x) - w = d.width - if length(str) > w # do we have to truncate to fit the display width? - ... -``` - -In this `show` method, we've deliberately chosen to prevent specialization on the specific type of `AbstractDisplay` (to reduce the total number of times we have to compile this method). -As a consequence, Julia's inference generally will not realize that `d.width` returns an `Int`--it might be able to discover that by exhaustively checking all subtypes, but if there are a lot of such subtypes then such checks would slow compilation considerably.
- -Fortunately, you can help by defining an interface for generic `AbstractDisplay` objects: - -``` -function Base.getproperty(d::AbstractDisplay, name::Symbol) - if name === :height - return getfield(d, :height)::Int - elseif name === :width - return getfield(d, :width)::Int - elseif name === :maker - return getfield(d, :maker)::Union{String,Symbol} - end - return getfield(d, name) -end -``` - -Julia's [constant propagation](https://en.wikipedia.org/wiki/Constant_folding) will ensure that most accesses of those fields will be determined at compile-time, so this simple change robustly fixes many inference problems. - -#### Handling edge cases - -You can sometimes get invalidations from failing to handle "formal" possibilities. -For example, operations with regular expressions might return a `Union{Nothing, RegexMatch}`. -You can sometimes get poor type inference by writing code that fails to take account of the possibility that `nothing` might be returned. -For example, a comprehension - -```julia -ms = [m.match for m in match.((rex,), my_strings)] -``` -might be replaced with -```julia -ms = [m.match for m in match.((rex,), my_strings) if m !== nothing] -``` -and return a better-typed result. diff --git a/docs/src/snoopi.md b/docs/src/snoopi.md deleted file mode 100644 index b59b9f28..00000000 --- a/docs/src/snoopi.md +++ /dev/null @@ -1,375 +0,0 @@ -# [Snooping on inference: `@snoopi`](@id macro-snoopi) - -If you can't use `@snoop_inference` due to Julia version constraints, the most useful choice is `@snoopi`, which is available on Julia 1.2 or higher. - -Julia can cache inference results, so you can use `@snoopi` to generate `precompile` -directives for your package. Executing these directives when the package is compiled -may reduce compilation (inference) time when the package is used. - -Here's a quick demo: - -```julia -using SnoopCompile - -a = rand(Float16, 5) - -julia> inf_timing = @snoopi sum(a) -1-element Array{Tuple{Float64,Core.MethodInstance},1}: - (0.011293888092041016, MethodInstance for sum(::Array{Float16,1})) -``` - -We defined the argument `a`, and then called `sum(a)` while "snooping" on inference. -(The `i` in `@snoopi` means "inference.") -The return is a list of "top level" methods that got compiled, together with the amount of -time spent on inference. -In this case it was just a single method, which required approximately 11ms of inference time. -(Inferring `sum` required inferring all the methods that it calls, but these are -subsumed into the top level inference of `sum` itself.) -Note that the method that got called, - -```julia -julia> @which sum(a) -sum(a::AbstractArray) in Base at reducedim.jl:652 -``` - -is much more general (i.e., defined for `AbstractArray`) than the `MethodInstance` -(defined for `Array{Float16,1}`). This is because precompilation requires the -types of the arguments to specialize the code appropriately. - -The information obtained from `@snoopi` can be used in several ways, primarily to reduce latency during usage of your package: - -- to help you understand which calls take the most inference time -- to help you write `precompile` directives that run inference on specific calls during package precompilation, so that you don't pay this cost repeatedly each time you use the package -- to help you identify inference problems that prevent successful or comprehensive precompilation - -If you're starting a project to try to reduce latency in your package, broadly speaking there are two paths you can take: - -1. 
you can use SnoopCompile, perhaps together with [CompileBot](https://github.com/aminya/CompileBot.jl), - to automatically generate lists of precompile directives that may reduce latency; -2. you can use SnoopCompile primarily as an analysis tool, and then intervene manually to reduce latency. - -Beginners often leap at option 1, but experience shows there are good reasons to consider option 2. -To avoid introducing too much complexity early on, we'll defer this discussion to the end of this page, but readers who are serious about reducing latency should be sure to read [Understanding precompilation and its limitations](@ref). - -!!! note - Because invalidations can prevent effective precompilation, developers analyzing their - packages with `@snoopi` are encouraged to use Julia versions (1.6 and higher) that have a lower risk - of invalidations in Base and the standard library. - -## [Precompile scripts](@id pcscripts) - -You can use `@snoopi` to come up with a list of precompile-worthy functions. -A recommended approach is to write a script that "exercises" the functionality -you'd like to precompile. -One option is to use your package's `"runtests.jl"` file, or you can write a custom -script for this purpose. -Here's an example for the -[FixedPointNumbers package](https://github.com/JuliaMath/FixedPointNumbers.jl): - -``` -using FixedPointNumbers - -x = N0f8(0.2) -y = x + x -y = x - x -y = x*x -y = x/x -y = Float32(x) -y = Float64(x) -y = 0.3*x -y = x*0.3 -y = 2*x -y = x*2 -y = x/15 -y = x/8.0 -``` - -Save this as a file `"snoopfpn.jl"` and navigate at the Julia REPL to that directory, -and then do - -```julia -julia> using SnoopCompile - -julia> inf_timing = @snoopi tmin=0.01 include("snoopfpn.jl") -2-element Array{Tuple{Float64,Core.MethodInstance},1}: - (0.03108978271484375, MethodInstance for *(::Normed{UInt8,8}, ::Normed{UInt8,8})) - (0.04189491271972656, MethodInstance for Normed{UInt8,8}(::Float64)) -``` - -Here, note the `tmin=0.01`, which causes any methods that take less than 10ms of inference -time to be discarded. - -!!! note - If you're testing this, you might get different results depending on - the speed of your machine. Moreover, if FixedPointNumbers has - already precompiled these method and type combinations---perhaps - by incorporating a precompile file produced by SnoopCompile---then - those methods will be absent. For packages whose precompile - directives are executed only when `ccall(:jl_generating_output, - Cint, ()) == 1`, you can start Julia with `--compiled-modules=no` - to disable them. Alternatively, you can `dev` the package and - comment them out. - -You can inspect these results and write your own precompile file, or use the automated -tools provided by SnoopCompile. - -## [Producing precompile directives automatically](@id auto) - -You can take the output of `@snoopi` and "parcel" it into packages: - -```julia -julia> pc = SnoopCompile.parcel(inf_timing) -Dict{Symbol,Array{String,1}} with 1 entry: - :FixedPointNumbers => ["precompile(Tuple{typeof(*),Normed{UInt8,8},Normed{UInt8,8}})", "precompile(Tuple{Type{Normed{UInt8,8}},Float64})"] -``` - -This splits the calls up into a dictionary, `pc`, indexed by the package which "owns" -each call. -(In this case there is only one, `FixedPointNumbers`, but in more complex cases there may -be several.) 
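-If you want a quick look at what `parcel` collected before writing anything to disk, you can treat `pc` like any ordinary `Dict`; a minimal sketch (the loop below is purely illustrative, not part of SnoopCompile's API):
-
-```julia
-# Each key is a package name; each value is a vector of precompile-directive strings
-for (pkg, directives) in pc
-    println(pkg, " => ", length(directives), " directives")
-end
-```
-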
You can then write the results to files: - -```julia -julia> SnoopCompile.write("/tmp/precompile", pc) -``` - -If you look in the `/tmp/precompile` directory, you'll see one or more -files, named by their parent package, that may be suitable for `include`ing into the package. -In this case: - -``` -/tmp/precompile$ cat precompile_FixedPointNumbers.jl -function _precompile_() - ccall(:jl_generating_output, Cint, ()) == 1 || return nothing - precompile(Tuple{typeof(*),Normed{UInt8,8},Normed{UInt8,8}}) - precompile(Tuple{Type{Normed{UInt8,8}},Float64}) -end -``` - -If you copy this file to a `precompile.jl` file in the `src` directory, -you can incorporate it into the package like this: - -```julia -module FixedPointNumbers - -# All the usual commands that define the module go here - -# ... followed by: - -include("precompile.jl") -_precompile_() - -end # module FixedPointNumbers -``` - -The listed method/type combinations should have their inference results cached. -Load the package once to precompile it, and then in a fresh Julia session try this: - -```julia -julia> using SnoopCompile - -julia> inf_timing = @snoopi tmin=0.01 include("snoopfpn.jl") -0-element Array{Tuple{Float64,Core.MethodInstance},1} -``` - -The fact that no methods were returned is a sign of success: Julia didn't need to call -inference on those methods, because it used the inference results from the cache file. - -!!! note - Sometimes, `@snoopi` will show method & type combinations that you precompiled. - This is a sign that despite your attempts, Julia declined to cache the inference - results for those methods. - You can either delete those directives from the precompile file, or hope that - they will become useful in a future version of Julia. - Note that having many "useless" precompile directives can slow down precompilation. - -!!! note - As you develop your package, it's possible you'll modify or delete some of the - methods that appear in your `"precompile.jl"` file. - This will *not* result in an error; by default `precompile` fails silently. - If you want to be certain that your precompile directives don't go stale, - you can check that `precompile` returns `true` and otherwise issue a warning. - By default, [`SnoopCompile.write`](@ref) generates - a macro, `@warnpcfail`, and you can use it by - changing `precompile(args...)` to `@warnpcfail precompile(args...)`. - - -If you find that some precompile directives are -ineffective (they appear in a new `@snoopi` despite being precompiled) and their -inference time is substantial, sometimes a bit of manual investigation of the callees -can lead to insights. For example, you might be able to introduce a precompile directive in a -dependent package that can mitigate the total time. -(`@snoop_inference` makes the analysis and resolution of these issues more straightforward.) - -!!! tip - For packages that support just Julia 1.6 and higher, you may be able to slim down the precompile file by - adding `has_bodyfunction=true` to the arguments for `parcel`. - This setting applies for all packages in `inf_timing`, so you may need to call `parcel` twice (with both `false` and `true`) and select the appropriate precompile file for each package. - -## Producing precompile directives manually - -While this "automated" approach is often useful, sometimes it makes more sense to -inspect the results and write your own precompile directives.
-For example, for FixedPointNumbers a more elegant and comprehensive precompile file -might be - -```julia -function _precompile_() - ccall(:jl_generating_output, Cint, ()) == 1 || return nothing - for T in (N0f8, N0f16) # Normed types we want to support - for f in (+, -, *, /) # operations we want to support - precompile(Tuple{typeof(f),T,T}) - for S in (Float32, Float64, Int) # other number types we want to support - precompile(Tuple{typeof(f),T,S}) - precompile(Tuple{typeof(f),S,T}) - end - end - for S in (Float32, Float64) - precompile(Tuple{Type{T},S}) - precompile(Tuple{Type{S},T}) - end - end -end -``` - -This covers `+`, `-`, `*`, `/`, and conversion for various combinations of types. -The results from `@snoopi` can suggest method/type combinations that might be useful to -precompile, but often you can generalize its suggestions in useful ways. - -## Analyzing omitted methods - -There are some method signatures that cannot be precompiled. -For example, suppose you have two packages, `A` and `B`, that are independent of one another. -Then `A.f([B.Object(1)])` cannot be precompiled, because `A` does not know about `B.Object`, -and `B` does not know about `A.f`, unless both `A` and `B` get included into a third package. - -Such problematic method signatures are removed automatically. -If you want to be informed about these removals, you can use Julia's logging framework -while running `parcel`: - -``` -julia> using Base.CoreLogging - -julia> logger = SimpleLogger(IOBuffer(), CoreLogging.Debug); - -julia> pc = with_logger(logger) do - SnoopCompile.parcel(inf_timing) - end - -julia> msgs = String(take!(logger.stream)) -``` - -The omitted method signatures will be logged to the string `msgs`. - - -## Understanding precompilation and its limitations - -Suppose your package includes the following method: - -```julia -""" - idx = index_midsum(a) - -Return the index of the first item more than "halfway to the cumulative sum," -meaning the smallest integer so that `sum(a[begin:idx]) >= sum(a)/2`. -""" -function index_midsum(a::AbstractVector) - ca = cumsum(vcat(0, a)) # cumulative sum of items in a, starting from 0 - s = ca[end] # the sum of all elements - return findfirst(x->x >= s/2, ca) - 1 # compensate for inserting 0 -end -``` -Now, suppose that you'd like to reduce latency in using this method, and you know that an important use case is when `a` is a `Vector{Int}`. -Therefore, you might precompile it: - -```julia -julia> precompile(index_midsum, (Vector{Int},)) -true -``` -This will cause Julia to infer this method for the given argument types. If you add such statements to your package, it potentially saves your users from having to wait for it to be inferred each time they use your package. - -!!! note - The `true` indicates that Julia was successfully able to find a method supporting this signature and precompile it. - See the note about `@warnpcfail` above for ways to exploit this in your package. 
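-For concreteness, here is a hedged sketch of the `@warnpcfail` pattern mentioned in the note above, applied to `index_midsum` (this assumes the macro has already been generated by [`SnoopCompile.write`](@ref)):
-
-```julia
-# Instead of failing silently, a stale directive now emits a warning at precompile time:
-@warnpcfail precompile(index_midsum, (Vector{Int},))
-```
-
-(The examples below use the plain `precompile` form.)
-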
- - -But if you execute these lines in the REPL, and then check how well it worked, you might see something like the following: -```julia -julia> using SnoopCompile - -julia> tinf = @snoopi index_midsum([1,2,3,4,100]) -3-element Vector{Tuple{Float64, Core.MethodInstance}}: - (0.00048613548278808594, MethodInstance for cat_similar(::Int64, ::Type, ::Tuple{Int64})) - (0.010090827941894531, MethodInstance for (::Base.var"#cat_t##kw")(::NamedTuple{(:dims,), Tuple{Val{1}}}, ::typeof(Base.cat_t), ::Type{Int64}, ::Int64, ::Vararg{Any, N} where N)) - (0.016659975051879883, MethodInstance for __cat(::Vector{Int64}, ::Tuple{Int64}, ::Tuple{Bool}, ::Int64, ::Vararg{Any, N} where N)) -``` -Even though we'd already said `precompile(index_midsum, (Vector{Int},))` in this session, somehow we needed *more* inference of various concatenation methods. -Why does this happen? -A detailed investigation (e.g., using [Cthulhu](https://github.com/JuliaDebug/Cthulhu.jl) or `@code_warntype`) would reveal that `vcat(0, a)` is not inferrable "all the way down," and hence the `precompile` directive couldn't predict everything that was going to be needed. - -No problem, you say: let's just precompile those methods too. The most expensive is the last one. You might not know where `__cat` is defined, but you can find out with -```julia -julia> mi = tinf[end][2] # get the MethodInstance -MethodInstance for __cat(::Vector{Int64}, ::Tuple{Int64}, ::Tuple{Bool}, ::Int64, ::Vararg{Any, N} where N) - -julia> mi.def # get the Method -__cat(A, shape::Tuple{Vararg{Int64, M}}, catdims, X...) where M in Base at abstractarray.jl:1599 - -julia> mi.def.module # which module was this method defined in? -Base -``` - -!!! note - When using `@snoopi` you might sometimes see entries like - `MethodInstance for (::SomeModule.var"#10#12"{SomeType})(::AnotherModule.AnotherType)`. - These typically correspond to closures/anonymous functions defined with `->` or `do` blocks, - but it may not be immediately obvious where these come from. - `mi.def` will show you the file/line number that these are defined on. - You can either convert them into named functions to make them easier to precompile, - or you can fix inference problems that prevent automatic precompilation (as illustrated below). - -Armed with this knowledge, let's start a fresh session (so that nothing is precompiled yet), and in addition to defining `index_midsum` and precompiling it, we also execute - -```julia -julia> precompile(Base.__cat, (Vector{Int64}, Tuple{Int64}, Tuple{Bool}, Int, Vararg{Any, N} where N)) -true -``` - -Now if you try that `tinf = @snoopi index_midsum([1,2,3,4,100])` line, you'll see that the `__cat` call is omitted, suggesting success. - -However, if you copy both `precompile` directives into your package source files and then check it with `@snoopi` again, -you may be in for a rude surprise: the `__cat` precompile directive doesn't "work." -That turns out to be because your package doesn't "own" that `__cat` method--the module is `Base` rather than `YourPackage`--and because inference cannot determine that it's needed by `index_midsum(::Vector{Int})`, Julia doesn't know which `*.ji` file to use to -store its precompiled form. - -How to fix this? -Fundamentally, the problem is that `vcat` call: if we can write `index_midsum` in a way so that inference succeeds, then all these problems go away. -(You can use `ascend(mi)`, with Cthulhu.jl, where `mi` was obtained above, to discover that `__cat` gets called from `vcat`.
See [`Cthulhu.ascend`](@ref ascend-itrig) for more information.) -It turns out that `vcat` is inferrable if all the arguments have the same type, so just changing `vcat(0, a)` to `vcat([zero(eltype(a))], a)` fixes the problem. -(Alternatively, you could make a copy and then use `pushfirst!`.) -In a fresh Julia session: - -```julia -function index_midsum(a::AbstractVector) - ca = cumsum(vcat([zero(eltype(a))], a)) # cumulative sum of items in a, starting from 0 - s = ca[end] # the sum of all elements - return findfirst(x->x >= s/2, ca) - 1 # compensate for inserting 0 -end - -julia> precompile(index_midsum, (Vector{Int},)) -true - -julia> using SnoopCompile - -julia> tinf = @snoopi index_midsum([1,2,3,4,100]) -Tuple{Float64, Core.MethodInstance}[] -``` - -Tada! No additional inference was needed, ensuring that your users will not suffer any latency due to type-inference of this particular method/argument combination. -In addition to identifying a call deserving of precompilation, `@snoopi` helped us identify a weakness in its implementation. -Fixing that weakness reduced latency, made the code more resistant to invalidation, and may improve runtime performance. - -In other cases, manual inspection of the results from `@snoopi` may lead you in a different direction: you may discover that a huge number of specializations are being created for a method that doesn't need them. -Typical examples are methods that take types or functions as inputs: for example, there is no reason to recompile `methods(f)` for each separate `f`. -In such cases, by far your best option is to add `@nospecialize` annotations to one or more of the arguments of that method. Such changes can have a dramatic impact on the latency of your package. - -The ability to make interventions like these--which can both reduce latency and improve runtime speed--is a major reason to consider `@snoopi` primarily as an analysis tool rather than just a utility to blindly generate lists of precompile directives. diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md deleted file mode 100644 index 7f6a9577..00000000 --- a/docs/src/tutorial.md +++ /dev/null @@ -1,257 +0,0 @@ -# [Tutorial on the foundations](@id tutorial) - -Certain concepts and types will appear repeatedly, so it's worth -spending a little time to familiarize yourself at the outset. -You can find a more expansive version of this page in [this blog post](https://julialang.org/blog/2021/01/precompile_tutorial/). - -## Cut to the Chase: A copy-paste analysis of invalidations - -The following is a quick "grab and go" script for analyzing invalidations. -Insert package loads (`using` or `import` statements) and/or method definitions into the `@snoop_invalidations` block, -and put the workload you want to be fast in the `@snoop_inference` block. -The resulting plot shows the distribution of the invalidations sorted by the number of children affected. -Generally, invalidations with many children matter more than those -with few children, and thus this shows how many "bad actors" need to be investigated. `show(trees[end])` shows the method that leads to the most -invalidations, with `show(trees[end-1])` being the second most, and so forth. -While the plot shows total invalidations (`trees`), only the ones in `staletrees` affect the workload in `@snoop_inference`.
- -```julia -using SnoopCompileCore -invalidations = @snoop_invalidations using PkgA, PkgB; -tinf = @snoop_inference begin - some_workload() -end -using SnoopCompile -trees = invalidation_trees(invalidations) -staletrees = precompile_blockers(trees, tinf) - -@show length(uinvalidated(invalidations)) # show total invalidations - -show(trees[end]) # show the most invalidating method - -# Count number of children (number of invalidations per invalidated method) -n_invalidations = map(SnoopCompile.countchildren, trees) - -# (optional) plot the number of children per method invalidations -import Plots -Plots.plot( - 1:length(trees), - n_invalidations; - markershape=:circle, - xlabel="i-th method invalidation", - label="Number of children per method invalidations" -) - -# (optional) report invalidations summary -using PrettyTables # needed for `report_invalidations` to be defined -SnoopCompile.report_invalidations(; - invalidations, - process_filename = x -> last(split(x, ".julia/packages/")), - n_rows = 0, # no-limit (show all invalidations) - ) -``` - -## `MethodInstance`s, type-inference, and backedges - -Our first goal is to understand how code connects together. -We'll try some experiments using the following: - -```julia -double(x::Real) = 2x -calldouble(container) = double(container[1]) -calldouble2(container) = calldouble(container) -``` - -```@meta -DocTestSetup = quote - double(x::Real) = 2x - calldouble(container) = double(container[1]) - calldouble2(container) = calldouble(container) -end -``` - -Let's create a `container` and run this code: - -```jldoctest tutorial -julia> c64 = [1.0] -1-element Vector{Float64}: - 1.0 - -julia> calldouble2(c64) -2.0 -``` - -Using the [MethodAnalysis](https://github.com/timholy/MethodAnalysis.jl) package, we can get some insights into how Julia represents this code and its compilation dependencies: - -```jldoctest tutorial; setup=:(calldouble2(c64)) -julia> using MethodAnalysis - -julia> mi = methodinstance(double, (Float64,)) -MethodInstance for double(::Float64) - -julia> using AbstractTrees - -julia> print_tree(mi) -MethodInstance for double(::Float64) -└─ MethodInstance for calldouble(::Vector{Float64}) - └─ MethodInstance for calldouble2(::Vector{Float64}) -``` - -This indicates that the result for type-inference on `calldouble2(::Vector{Float64})` depended on the result for `calldouble(::Vector{Float64})`, which in turn depended on `double(::Float64)`. 
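-If you're curious where this tree comes from, each `MethodInstance` stores its callers in an internal `backedges` field (an implementation detail whose layout can change across Julia versions); a minimal sketch of peeking at it directly:
-
-```julia
-mi = methodinstance(double, (Float64,))
-# `backedges` is undefined when nothing depends on `mi`, so guard the access
-isdefined(mi, :backedges) && mi.backedges
-```
-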
- -Now let's create a new container, one with abstract element type, so that Julia's type-inference cannot accurately predict the type of elements in the container: - -```jldoctest tutorial -julia> cabs = AbstractFloat[1.0f0] # put a Float32 in a Vector{AbstractFloat} -1-element Vector{AbstractFloat}: - 1.0f0 - -julia> calldouble2(cabs) -2.0f0 ``` - -Now let's look at the available instances: - -```jldoctest tutorial; setup=:(calldouble2(c64); calldouble2(cabs)) -julia> mis = methodinstances(double) -3-element Vector{Core.MethodInstance}: - MethodInstance for double(::Float64) - MethodInstance for double(::AbstractFloat) - MethodInstance for double(::Float32) - -julia> print_tree(mis[1]) -MethodInstance for double(::Float64) -└─ MethodInstance for calldouble(::Vector{Float64}) - └─ MethodInstance for calldouble2(::Vector{Float64}) - -julia> print_tree(mis[2]) -MethodInstance for double(::AbstractFloat) - -julia> print_tree(mis[3]) -MethodInstance for double(::Float32) -``` - -`double(::Float64)` has backedges to `calldouble` and `calldouble2`, but the latter two do not because `double` was only called via runtime dispatch. However, `calldouble` has backedges to `calldouble2` - -```julia -julia> mis = methodinstances(calldouble) -2-element Vector{Core.MethodInstance}: - MethodInstance for calldouble(::Vector{Float64}) - MethodInstance for calldouble(::Vector{AbstractFloat}) - -julia> print_tree(mis[1]) -MethodInstance for calldouble(::Vector{Float64}) -└─ MethodInstance for calldouble2(::Vector{Float64}) - -julia> print_tree(mis[2]) -MethodInstance for calldouble(::Vector{AbstractFloat}) -└─ MethodInstance for calldouble2(::Vector{AbstractFloat}) -``` - -because `Vector{AbstractFloat}` is a concrete type, whereas `AbstractFloat` is not. - -If we create `c32 = [1.0f0]` and then `calldouble2(c32)`, we would also see backedges from `double(::Float32)` all the way back to `calldouble2(::Vector{Float32})`. - -## Precompilation - -During *package precompilation*, Julia creates a `*.ji` file typically stored in `.julia/compiled/v1.x/`, where `1.x` is your version of Julia. -Your `*.ji` file might just have definitions of constants, types, and methods, but optionally you can also include the results of type-inference. -This happens automatically if you run code while your package is being built, but generally the recommended procedure is to add *precompile directives*. - -Let's turn the example above into a package.
In a fresh session, - -```julia -(@v1.6) pkg> generate SnoopCompileDemo - Generating project SnoopCompileDemo: - SnoopCompileDemo/Project.toml - SnoopCompileDemo/src/SnoopCompileDemo.jl - -julia> open("SnoopCompileDemo/src/SnoopCompileDemo.jl", "w") do io - write(io, """ - module SnoopCompileDemo - - double(x::Real) = 2x - calldouble(container) = double(container[1]) - calldouble2(container) = calldouble(container) - - precompile(calldouble2, (Vector{Float32},)) - precompile(calldouble2, (Vector{Float64},)) - precompile(calldouble2, (Vector{AbstractFloat},)) - - end - """) - end -282 - -julia> push!(LOAD_PATH, "SnoopCompileDemo/") -4-element Vector{String}: - "@" - "@v#.#" - "@stdlib" - "SnoopCompileDemo/" - -julia> using SnoopCompileDemo -[ Info: Precompiling SnoopCompileDemo [44c70eed-03a3-46c0-8383-afc033fb6a27] - -julia> using MethodAnalysis - -julia> methodinstances(SnoopCompileDemo.double) -3-element Vector{Core.MethodInstance}: - MethodInstance for double(::Float32) - MethodInstance for double(::Float64) - MethodInstance for double(::AbstractFloat) -``` - -Because of those `precompile` statements, the `MethodInstance`s exist after loading the package even though we haven't run the code in this session--not because it precompiled them when the package loaded, but because they were precompiled during the `Precompiling SnoopCompileDemo...` phase, stored to the `*.ji` file, and then reloaded whenever we use the package. -You can also verify that the same backedges get created as when we ran this code interactively above. - -By having these `MethodInstance`s "pre-loaded" we can save some of the time needed to run type-inference: not much time in this case because the code is so simple, but for more complex methods the savings can be substantial. - -This code got cached in `SnoopCompileDemo.ji`. It's worth noting that even though the `precompile` directive got issued from this package, it might save `MethodInstances` for methods defined in other packages. -For example, Julia does not come pre-built with the inferred code for `Int * Float32`: in a fresh session, - -```julia -julia> using MethodAnalysis - -julia> mi = methodinstance(*, (Int, Float32)) - -``` -returns `nothing` (the `MethodInstance` doesn't exist), whereas if we've loaded `SnoopCompileDemo` then - -```julia -julia> mi = methodinstance(*, (Int, Float32)) -MethodInstance for *(::Int64, ::Float32) - -julia> mi.def -*(x::Number, y::Number) in Base at promotion.jl:322 -``` - -So even though the method is defined in `Base`, because `SnoopCompileDemo` needed this code it got stashed in `SnoopCompileDemo.ji`. - -*The ability to cache `MethodInstance`s from code defined in other packages or libraries is fundamental to latency reduction; however, it has significant limitations.* Most crucially, `*.ji` files can only hold code they "own," meaning code connected either: - -- directly to a method defined in the package, or -- through a chain of backedges to methods owned by the package - -If we add - -```julia -precompile(*, (Int, Float16)) -``` - -to the definition of `SnoopCompileDemo.jl`, nothing happens: - -```julia -julia> mi = methodinstance(*, (Int, Float16)) - # nothing -``` - -because there is no "chain of ownership" to `SnoopCompileDemo`. -Consequently, we can't precompile methods defined in other modules in and of themselves; we can only do it if those methods are linked by backedges to this package.
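-To illustrate with a hypothetical workaround (the helper `mytimes` is invented for this sketch, not part of the demo package): if `SnoopCompileDemo` itself contained an inferrable call to `*`, the chain of ownership would exist and the specialization could be cached:
-
-```julia
-# Because `mytimes` is owned by SnoopCompileDemo, precompiling it creates the
-# backedge chain that lets the `*(::Int, ::Float16)` specialization it calls
-# be cached in the package's *.ji file.
-mytimes(x, y) = x * y
-precompile(mytimes, (Int, Float16))
-```
-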
- -Because backedges are created during successful type-inference, the consequence is that *precompilation works better when type inference succeeds.* -For some packages, time invested in improving inferrability can make your `precompile` directives work better. - -```@meta -DocTestSetup = nothing -``` diff --git a/docs/src/tutorials/invalidations.md b/docs/src/tutorials/invalidations.md new file mode 100644 index 00000000..8d4d1f9d --- /dev/null +++ b/docs/src/tutorials/invalidations.md @@ -0,0 +1,282 @@ +# Tutorial on `@snoop_invalidations` + +## What are invalidations? + +In this context, *invalidation* means discarding previously-compiled code. Invalidations occur because of interactions between independent pieces of code. Invalidations are essential to make Julia fast, interactive, and correct: you *need* invalidations if you want to be able to define some methods, run (compile) some code, and then in the same session define new methods that might lead to different answers if you were to recompile the code in the presence of the new methods. + +Invalidations can happen just from loading packages. Packages are precompiled in isolation, but you can load many packages into a single interactive session. It's impossible for the individual packages to anticipate the full "world of methods" in your interactive session, so sometimes Julia has to discard code that was compiled in a smaller world because it's at risk for being incorrect in the larger world. + +The downside of invalidations is that they make latency worse, as code must be recompiled when you first run it. The benefits of precompilation are partially lost, and the work done during precompilation is partially wasted. + +While some invalidations are unavoidable, in practice a good developer can often design packages to minimize the number and/or impact of invalidations. Invalidation-resistant code is often faster, with smaller binary size, than code that is vulnerable to invalidation. + +A good first step is to measure what's being invalidated, and why. + +## Learning to observe, diagnose, and fix invalidations + +We'll illustrate invalidations by creating two packages, where loading the second package invalidates some code that was compiled in the first one. We'll then go over approaches for "fixing" invalidations (i.e., preventing them from occurring). + +!!! tip + Since SnoopCompile's tools are interactive, you are strongly encouraged to try these examples yourself as you read along. + +### Add SnoopCompileCore, SnoopCompile, and helper packages to your environment + +Here, we'll add these packages to your [default environment](https://pkgdocs.julialang.org/v1/environments/). (With the exception of `AbstractTrees`, these "developer tool" packages should not be added to the Project file of any real packages unless you're extending the tool itself.) From your default environment (i.e., in package mode you should see something like `(@v1.10) pkg>`), do + +``` +using Pkg +Pkg.add(["SnoopCompileCore", "SnoopCompile", "AbstractTrees", "Cthulhu"]); +``` + +### Create the demonstration packages + +We're going to implement a toy version of the card game [blackjack](https://www.wikihow.com/Play-Blackjack), where players take cards with the aim of collecting 21 points. The higher you go the better, *unless* you go over 21 points, in which case you "go bust" (i.e., you lose). Because our real goal is to illustrate invalidations, we'll create a "blackjack ecosystem" that involves an interaction between two packages.
+ +While [PkgTemplates](https://github.com/JuliaCI/PkgTemplates.jl) is recommended for creating packages, here we'll just use the basic capabilities in `Pkg`. +To create the (empty) packages, the code below executes the following steps: +- navigate to a temporary directory and create both packages +- make the first package (`Blackjack`) depend on [PrecompileTools](https://github.com/JuliaLang/PrecompileTools.jl) (we're interested in reducing latency!) +- make the second package (`BlackjackFacecards`) depend on the first one (`Blackjack`) + +```@repl tutorial-invalidations +oldproj = Base.active_project() # hide +cd(mktempdir()) +using Pkg +Pkg.generate("Blackjack"); +Pkg.activate("Blackjack") +Pkg.add("PrecompileTools"); +Pkg.generate("BlackjackFacecards"); +Pkg.activate("BlackjackFacecards") +Pkg.develop(PackageSpec(path=joinpath(pwd(), "Blackjack"))); +``` + +Now it's time to create the code for `Blackjack`. Normally, you'd do this with an editor, but to make it reproducible here we'll use code to create these packages. The package code we'll create below defines the following: +- a `score` function to assign a numeric value to a card +- `tallyscores`, which computes the total score for a hand of cards +- `playgame` which uses a simple strategy to decide whether to take another card from the deck and add it to the hand + +To reduce latency on first use, we then precompile `playgame`. In a real application, we'd also want a function to manage the `deck` of cards, but for brevity we'll omit this and do it manually. + +```@repl tutorial-invalidations +write(joinpath("Blackjack", "src", "Blackjack.jl"), """ + module Blackjack + + using PrecompileTools + + export playgame + + const deck = [] # the deck of cards that can be dealt + + # Compute the score of one card + score(card::Int) = card + + # Add up the score in a hand of cards + function tallyscores(cards) + s = 0 + for card in cards + s += score(card) + end + return s + end + + # Play the game! We use a simple strategy to decide whether to draw another card. + function playgame() + myhand = [] + while tallyscores(myhand) <= 14 && !isempty(deck) + push!(myhand, pop!(deck)) # "Hit me!" + end + myscore = tallyscores(myhand) + return myscore <= 21 ? myscore : "Busted" + end + + # Precompile `playgame`: + @setup_workload begin + push!(deck, 8, 10) # initialize the deck + @compile_workload begin + playgame() + end + end + + end + """) +``` + +Suppose you use `Blackjack` and like it, but you notice it doesn't support face cards. Perhaps you're nervous about contributing to the `Blackjack` package (you shouldn't be!), and so you decide to start your own package that extends its functionality. You create `BlackjackFacecards` to add scoring of the jack, queen, king, and ace (for simplicity we'll make the ace always worth 11): + +```@repl tutorial-invalidations +write(joinpath("BlackjackFacecards", "src", "BlackjackFacecards.jl"), """ + module BlackjackFacecards + + using Blackjack + + # Add a new `score` method: + Blackjack.score(card::Char) = card ∈ ('J', 'Q', 'K') ? 10 : + card == 'A' ? 11 : error(card, " not known") + + end + """) +``` + +!!! warning + Because `BlackjackFacecards` "owns" neither `Char` nor `score`, this is [piracy](https://docs.julialang.org/en/v1/manual/style-guide/#Avoid-type-piracy-1) and should generally be avoided. Piracy is one way to cause invalidations, but it's not the only one. `BlackjackFacecards` could avoid committing piracy by defining a `struct Facecard ...
end` and defining `score(card::Facecard)` instead of `score(card::Char)`. However, this would *not* fix the invalidations--all the factors described below are unchanged. + +Now we're ready! + +### Recording invalidations + +Here are the steps executed by the code below: +- load `SnoopCompileCore` +- load `Blackjack` and `BlackjackFacecards` while *recording invalidations* with the `@snoop_invalidations` macro. +- load `SnoopCompile` and `AbstractTrees` for analysis + +```@repl tutorial-invalidations +using SnoopCompileCore +invs = @snoop_invalidations using Blackjack, BlackjackFacecards; +using SnoopCompile, AbstractTrees +``` + +!!! tip + If you get errors like `Package SnoopCompileCore not found in current path`, a likely explanation is that + you didn't add it to your default environment. In the example above, we're in the `BlackjackFacecards` environment + so we can develop the package, but you also need access to `SnoopCompile` and `SnoopCompileCore`. Having these in your [default environment](https://docs.julialang.org/en/v1/manual/code-loading/#Environment-stacks) lets them be found even if they aren't part of the current environment. + +### Analyzing invalidations + +Now we're ready to see what, if anything, got invalidated: + +```@repl tutorial-invalidations +trees = invalidation_trees(invs) +``` + +This has only one "tree" of invalidations. `trees` is a `Vector` so we can index it: + +```@repl tutorial-invalidations +tree = trees[1] +``` + +Each tree stems from a single *cause* described in the top line. For this tree, the cause was adding the new method `score(::Char)` in `BlackjackFacecards`. + +Each *cause* is associated with one or more *victims* of invalidation, a list here named `mt_backedges`. Let's extract the final (and in this case, only) victim: + +```@repl tutorial-invalidations +sig, victim = tree.mt_backedges[end]; +``` + +!!! note + `mt_backedges` stands for "MethodTable backedges." In other cases you may see a second type of invalidation, just called `backedges`. With these, there is no `sig`, and so you'll use just `victim = tree.backedges[i]`. + +First let's look at the problematic method `sig`nature: + +```@repl tutorial-invalidations +sig +``` + +This is a type-tuple, i.e., `Tuple{typeof(f), typesof(args)...}`. We see that `score` was called on an object of (inferred) type `Any`. **Calling a function with unknown argument types makes code vulnerable to invalidation, and insertion of the new `score` method "exploited" this vulnerability.** + +`victim` shows which compiled code got invalidated: + +```@repl tutorial-invalidations +victim +``` + +But this is not the full extent of what got invalidated: + +```@repl tutorial-invalidations +print_tree(victim) +``` + +Invalidations propagate throughout entire call trees, here up to `playgame()`: anything that calls code that may no longer be correct is itself at risk for being incorrect. +In general, victims with lots of "children" deserve the greatest attention. + +While `print_tree` can be useful, Cthulhu's `ascend` is a far more powerful tool for gaining deeper insight: + +```julia +julia> using Cthulhu + +julia> ascend(victim) +Choose a call for analysis (q to quit): + > tallyscores(::Vector{Any}) + playgame() +``` + +This is an interactive REPL menu, described more completely (via text and video) at [ascend](https://github.com/JuliaDebug/Cthulhu.jl?tab=readme-ov-file#usage-ascend). + +There are quite a few other tools for working with `invs` and `trees`; see the [Reference](@ref).
If your list of invalidations is dauntingly large, you may be interested in [precompile_blockers](@ref). + +### Why the invalidations occur + +`tallyscores` and `playgame` were compiled in `Blackjack`, a "world" where the `score` method defined in `BlackjackFacecards` does not yet exist. When you load the `BlackjackFacecards` package, Julia must ask itself: now that this new `score` method exists, am I certain that I would compile `tallyscores` the same way? If the answer is "no," Julia invalidates the old compiled code, and compiles a fresh version with full awareness of the new `score` method in `BlackjackFacecards`. + +Why would the compilation of `tallyscores` change? Evidently, `cards` is a `Vector{Any}`, and this means that `tallyscores` can't guess what kind of object `card` might be, and thus it can't guess what kind of objects are passed into `score`. The crux of the invalidation is thus: +- when `Blackjack` is compiled, inference does not know which `score` method will be called. However, at the time of compilation the only `score` method is for `Int`. Thus Julia will reason that anything that isn't an `Int` is going to trigger an error anyway, and so you might as well optimize `tallyscores` expecting all cards to be `Int`s. +- however, when `BlackjackFacecards` is loaded, suddenly there are two `score` methods supporting both `Int` and `Char`. Now Julia's guess that all `cards` will probably be `Int`s doesn't seem so likely to be true, and thus `tallyscores` should be recompiled. + +Thus, invalidations arise from optimization based on what methods and types are "in the world" at the time of compilation (sometimes called *world-splitting*). This form of optimization can have performance benefits, but it also leaves your code vulnerable to invalidation. + +### Fixing invalidations + +In broad strokes, there are three ways to prevent invalidation. + +#### Method 1: defer compilation until the full world is known + +The first and simplest technique is to ensure that the full range of possibilities (the entire "world of code") is present before any compilation occurs. In this case, probably the best approach would be to merge the `BlackjackFacecards` package into `Blackjack` itself. Or, if you are a maintainer of the "Blackjack ecosystem" and have reasons for thinking that keeping the packages separate makes sense, you could alternatively move the `PrecompileTools` workload to `BlackjackFacecards`. Either approach should prevent the invalidations from occurring. + +#### Method 2: improve inferability + +The second way to prevent invalidations is to improve the inferability of the victim(s). If `Int` and `Char` really are the only possible kinds of cards, then in `playgame` it would be better to declare + +```julia +myhand = Union{Int,Char}[] +``` +and similarly for `deck` itself. That untyped `[]` is what makes `myhand` (and thus `cards`, when passed to `tallyscores`) a `Vector{Any}`, and the possibilities for `card` are endless. By constraining the possible types, we allow inference to know more clearly what methods might be called. More tips on fixing invalidations through improving inference can be found in [Techniques for fixing inference problems](@ref). + +In this particular case, just annotating `Union{Int,Char}[]` isn't sufficient on its own, because the `score` method for `Char` doesn't yet exist, so Julia doesn't know what to call.
However, in most real-world cases this change alone would be sufficient: usually all the needed methods exist; it's just a question of reassuring Julia that no other options are even possible. + +!!! note + This fix leverages [union-splitting](https://julialang.org/blog/2018/08/union-splitting/), which is conceptually related to "world-splitting." However, union-splitting is far more effective at fixing inference problems, as it guarantees that no other possibilities will *ever* exist, no matter how many other methods get defined. + +!!! tip + Many vulnerabilities can be fixed by improving inference. In complex code, it's easy to unwittingly write things in ways that defeat Julia's type inference. Tools that help you discover inference problems, like SnoopCompile and [JET](@ref), help you discover these unwitting "mistakes." + +While in real life it's usually a bad idea to "blame the victim," it's typically the right attitude for fixing invalidations. Keep in mind, though, that the source of the problem may not be the immediate victim: in this case, it was a poor container choice in `playgame` that put `tallyscores` in the bad position of having to operate on a `Vector{Any}`. + +Improving inferability is probably the most broadly-applicable technique, and when applicable it usually gives the best outcomes: not only is your code more resistant to invalidation, but it's likely faster and compiles to smaller binaries. However, of the three approaches it is also the one that requires the deepest understanding of Julia's type system, and thus may be difficult for some coders to use. + +There are cases where there is no good way to make the code inferable, in which case other strategies are needed. + +#### Method 3: disable Julia's speculative optimization + +The third option is to prevent Julia's speculative optimization: one could replace `score(card)` with `invokelatest(score, card)`: + +```julia +function tallyscores(cards) + s = 0 + for card in cards + s += invokelatest(score, card) + end + return s +end +``` + +This forces Julia to always look up the appropriate method of `score` while the code is running, and thus prevents the speculative optimizations that leave the code vulnerable to invalidation. However, the cost is that your code may run somewhat more slowly, particularly here where the call is inside a loop. + +If you plan to define at least two `score` methods, another way to turn off this optimization would be to declare + +```julia +Base.Experimental.@max_methods 1 function score end +``` + +before defining any `score` methods. You can read the documentation on `@max_methods` to learn more about how it works. + +!!! tip + Most of us learn best by doing. Try at least one of these methods of fixing the invalidation, and use SnoopCompile to verify that it works. + +### Undoing the damage from invalidations + +If you can't prevent the invalidation, an alternative approach is to recompile the invalidated code. For example, one could repeat the precompile workload from `Blackjack` in `BlackjackFacecards`. While this will mean that the whole "stack" will be compiled twice and cached twice (which is wasteful), it should be effective in reducing latency for users. + +PrecompileTools also has a `@recompile_invalidations` macro. This isn't generally recommended for use in packages (you can end up with long compile times for things you don't need), but it can be useful in personal "Startup packages" where you want to reduce latency for a particular project you're working on.
See the PrecompileTools documentation for details.
+
+```@repl tutorial-invalidations
+Pkg.activate(oldproj) # hide
+```
diff --git a/docs/src/jet.md b/docs/src/tutorials/jet.md
similarity index 85%
rename from docs/src/jet.md
rename to docs/src/tutorials/jet.md
index 5d403b34..af220d35 100644
--- a/docs/src/jet.md
+++ b/docs/src/tutorials/jet.md
@@ -1,22 +1,17 @@
-# [JET integration](@id JET)
+# Tutorial on JET integration
-[JET](https://github.com/aviatesk/JET.jl) is a powerful tool for analyzing call graphs.
-Some of its functionality overlaps that of SnoopCompile's, that is, JET also provides mechanisms to detect potential errors.
-Conversely, JET is a purely static-analysis tool and lacks SnoopCompile's ability to "bridge" across runtime dispatch.
-In summary, JET doesn't need Julia to restart to find inference failures, but JET will only find the first inference failure.
-SnoopCompile has to run in a fresh session, but finds all inference failures.
-
-For this reason, the combination of the tools provides capabilities that neither package has on its own.
-Specifically, one can use SnoopCompile to collect data on the callgraph and JET to perform the error analysis.
+[JET](https://github.com/aviatesk/JET.jl) is a powerful tool for analyzing your code.
+As described [elsewhere](@ref JET), some of its functionality overlaps SnoopCompile's, but its mechanism of action is very different. The combination of JET and SnoopCompile provides capabilities that neither package has on its own.
+Specifically, one can use SnoopCompile to collect data on the full callgraph and JET to perform the exhaustive analysis of individual nodes.
The integration between the two packages is bundled into SnoopCompile, specifically [`report_callee`](@ref),
-[`report_callees`](@ref), and [`report_caller`](@ref). These take [`InferenceTrigger`](@ref) (see the page on [inference failures](@ref inferrability)) and use them to generate JET reports.
+[`report_callees`](@ref), and [`report_caller`](@ref). These take [`InferenceTrigger`](@ref)s (see the page on [inference failures](@ref inferrability)) and use them to generate JET reports. These tools focus on error analysis rather than optimization, as SnoopCompile can already identify runtime dispatch.
We can demonstrate both the need and use of these tools with a simple extended example.
## JET usage
-JET provides a useful report for the following call:
+As a basic introduction to JET, let's analyze the following call from JET's own documentation:
```jldoctest jet; filter=[r"@ reduce.*", r"(in|@)", r"(REPL\[\d+\]|none)"]
julia> using JET
@@ -62,7 +57,7 @@ ERROR: MethodError: no method matching zero(::Type{Any})
(This can be circumvented with `sum(Any[]; init=0)`.)
-This is the kind of bug that can "lurk" undetected for a long time, and JET excels at exposing them.
+This is the kind of bug that can lurk undetected for a long time, and JET excels at exposing such bugs.
## JET limitations
@@ -85,15 +80,13 @@ Because we "hid" the type of `list` from inference, JET couldn't tell what speci
## JET/SnoopCompile integration
-The resolution to this problem is to use SnoopCompile to do the "data collection" and JET to do the analysis.
+A resolution to this problem is to use SnoopCompile to do the "data collection" and JET to do the analysis.
The key reason is that SnoopCompile is a dynamic analyzer, and is capable of bridging across runtime dispatch.
As always, you need to do the data collection in a fresh session where the calls have not previously been inferred.
After restarting Julia, we can do this: ```julia -julia> using SnoopCompile, JET, Cthulhu - -julia> using JET # this is necessary to enable the integration +julia> using SnoopCompileCore julia> list = Any[1,2,3]; @@ -101,8 +94,9 @@ julia> lc = Any[list]; # "hide" `list` inside a Vector{Any} julia> callsum(listcontainer) = sum(listcontainer[1]); -julia> tinf = @snoop_inference callsum(lc) -InferenceTimingNode: 0.039239/0.046793 on Core.Compiler.Timings.ROOT() with 2 direct children +julia> tinf = @snoop_inference callsum(lc); + +julia> using SnoopCompile, JET, Cthulhu julia> tinf.children 2-element Vector{SnoopCompileCore.InferenceTimingNode}: @@ -133,7 +127,7 @@ julia> report_callees(inference_triggers(tinf)) ││││││││││││││││└──────────────────── ``` -Because SnoopCompile collected the runtime-dispatched `sum` call, we can pass it to JET. +Because SnoopCompileCore collected the runtime-dispatched `sum` call, we can pass it to JET. `report_callees` filters those calls which generate JET reports, allowing you to focus on potential errors. !!! note diff --git a/docs/src/pgdsgui.md b/docs/src/tutorials/pgdsgui.md similarity index 80% rename from docs/src/pgdsgui.md rename to docs/src/tutorials/pgdsgui.md index 8b8f94a5..17d0dc61 100644 --- a/docs/src/pgdsgui.md +++ b/docs/src/tutorials/pgdsgui.md @@ -1,6 +1,14 @@ # [Profile-guided despecialization](@id pgds) -As indicated in the [workflow](@ref), one of the important early steps is to evaluate and potentially adjust method specialization. +Julia's multiple dispatch allows developers to create methods for specific argument types. On top of this, the Julia compiler performs *automatic specialization*: + +``` +function countnonzeros(A::AbstractArray) + ... +end +``` + +will be compiled separately for `Vector{Int}`, `Matrix{Float64}`, `SubArray{...}`, and so on, if it gets called for each of these types. Each specialization (each `MethodInstance` with different argument types) costs extra inference and code-generation time, so while specialization often improves runtime performance, that has to be weighed against the cost in latency. There are also cases in which [overspecialization can hurt both run-time and compile-time performance](https://docs.julialang.org/en/v1/manual/performance-tips/#The-dangers-of-abusing-multiple-dispatch-(aka,-more-on-types-with-values-as-parameters)). @@ -11,16 +19,27 @@ The name is a reference to a related technique, [profile-guided optimization](ht Both PGO and PGDS use runtime profiling to help guide decisions about code optimization. PGO is often used in languages whose default mode is to avoid specialization, whereas PGDS seems more appropriate for a language like Julia which specializes by default. -While PGO is sometimes an automatic part of the compiler that optimizes code midstream during execution, PGDS is a tool for making static changes in code. +While PGO is sometimes an automatic part of the compiler that optimizes code midstream during execution, SnoopCompile's PGDS is a tool for making static changes (edits) to code. Again, this seems appropriate for a language where specialization typically happens prior to the first execution of the code. 
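+If you want to see the automatic specialization described above in action, you can flesh out the `countnonzeros` stub from earlier and inspect the compiled specializations yourself. This is a self-contained sketch: the body is hypothetical, and `Base.specializations` requires Julia 1.10 or higher.
+
+```julia
+function countnonzeros(A::AbstractArray)
+    n = 0
+    for a in A
+        n += !iszero(a)
+    end
+    return n
+end
+
+countnonzeros([1, 2, 0])            # compiles a specialization for Vector{Int}
+countnonzeros([1.0 0.0; 0.0 2.0])   # compiles another for Matrix{Float64}
+
+# List the MethodInstances created by the calls above:
+collect(Base.specializations(only(methods(countnonzeros))))
+```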
+### Add SnoopCompileCore, SnoopCompile, and helper packages to your environment
+
+We'll add these packages to your [default environment](https://pkgdocs.julialang.org/v1/environments/) so you can use them while in the package environment:
+
+```
+using Pkg
+Pkg.add(["SnoopCompileCore", "SnoopCompile", "PyPlot"]);
+```
+
+PyPlot is used for the PGDS interface in part to reduce interference with native-Julia plotting packages like Makie--it's a little awkward to depend on a package that you might be simultaneously modifying!
+
## Using the PGDS graphical user interface
To illustrate the use of PGDS, we'll examine an example in which some methods get specialized for hundreds of types. To keep this example short, we'll create functions that operate on types themselves.
!!! note
-    For a `DataType` `T`, `T.name` returns a `Core.TypeName`, and `T.name.name` returns the name as a `Symbol`.
+    As background to this example, for a `DataType` `T`, `T.name` returns a `Core.TypeName`, and `T.name.name` returns the name as a `Symbol`.
    `Base.unwrap_unionall(T)` preserves `DataType`s as-is, but converts a `UnionAll` type into a `DataType`.
```julia
@@ -56,12 +75,11 @@ mappushes(f, src) = mappushes!(f, [], src)
There are two stages to PGDS: first (and preferably starting in a fresh Julia session), we profile type-inference:
```julia
-julia> using SnoopCompile
+julia> using SnoopCompileCore
julia> Ts = subtypes(Any);   # get a long list of different types
-julia> tinf = @snoop_inference mappushes(spelltype, Ts)
-InferenceTimingNode: 4.476700/5.591207 on InferenceFrameInfo for Core.Compiler.Timings.ROOT() with 587 direct children
+julia> tinf = @snoop_inference mappushes(spelltype, Ts);
```
Then, *in the same session*, profile the runtime:
@@ -78,6 +96,8 @@ get a realistic view of where your code spends its time during actual use.
Now let's launch the PGDS GUI:
```julia
+julia> using SnoopCompile
+
julia> import PyPlot # the GUI is dependent on PyPlot, must load it before the next line
julia> mref, ax = pgdsgui(tinf);
@@ -85,7 +105,7 @@ julia> mref, ax = pgdsgui(tinf);
You should see something like this:
-![pgdsgui](assets/pgds_spec.png)
+![pgdsgui](../assets/pgds_spec.png)
In this graph, each dot corresponds to a single method; for this method, we plot inference time (vertical axis) against the run time (horizontal axis).
The coloration of each dot encodes the number of specializations (the number of distinct `MethodInstance`s) for that method;
@@ -151,14 +171,14 @@ end
```
!!! warning
-    `where` type-parameters force specialization, regardless of `@nospecialize`: in `spelltype(@nospecialize(::Type{T})) where T`, the `@nospecialize` has no impact and you'll get full specialization on `T`.
-    Instead, use `@nospecialize(T::Type)` as shown.
+    `where` type-parameters force specialization: in `spelltype(@nospecialize(::Type{T})) where T`, the `@nospecialize` has no impact and you'll get full specialization on `T`.
+    Instead, use `@nospecialize(T::Type)` (without the `where` statement) as shown.
If we now rerun that demo, you should see a plot of the same kind as shown above, but with different costs for each dot.
The differences are best appreciated comparing them side-by-side ([`pgdsgui`](@ref) allows you to specify a particular axis into which to plot):
-![pgdsgui-compare](assets/pgds_compareplots.png)
+![pgdsgui-compare](../assets/pgds_compareplots.png)
The results with `@nospecialize` are shown on the right.
You can see that:
@@ -173,11 +193,30 @@ Reducing specialization, when appropriate, can often yield your biggest reductio
!!! tip
    When you add `@nospecialize`, sometimes it's beneficial to compensate for the loss of inferrability by adding some type assertions.
    This topic will be discussed in greater detail in the next section, but for the example above we can improve runtime performance by annotating the return type of `Base.unwrap_unionall(T)`: `name = (Base.unwrap_unionall(T)::DataType).name.name`.
-    Then, later lines in `spell` know that `name` is a `Symbol`.
+    Then, later lines in `spelltype` know that `name` is a `Symbol`.
With this change, the unspecialized variant outperforms the specialized variant in *both compile-time and run-time*.
The reason is that the specialized variant of `spelltype` needs to be called by runtime dispatch, whereas for the unspecialized variant there's only one `MethodInstance`, so its dispatch is handled at compile time.
+### Blocking inference: `Base.@nospecializeinfer`
+
+Perhaps surprisingly, `@nospecialize` doesn't prevent Julia's type-inference from inspecting a method. The reason is that it's sometimes useful if the *caller* knows what type will be returned, even if the *callee* doesn't exploit this information. In our `mappushes` example, this isn't an issue, because `Ts` is a `Vector{Any}` and this already defeats inference. But in other cases, the caller may be inferable but (to save inference time) you'd prefer to block inference from inspecting the method.
+
+Beginning with Julia 1.10, you can prevent even inference from "looking at" `@nospecialize`d arguments with `Base.@nospecializeinfer`:
+
+```
+Base.@nospecializeinfer function spelltype(@nospecialize(T::Type))
+    name = (Base.unwrap_unionall(T)::DataType).name.name
+    str = ""
+    for c in string(name)
+        str *= c
+    end
+    return str
+end
+```
+
+Note that the `::DataType` annotation described in the tip above is still effective and recommended. `@nospecializeinfer` directly affects only arguments that are marked with `@nospecialize`, and in this case the type-assertion prevents type uncertainty from propagating to the remainder of the function.
+
### Argument standardization
While not immediately relevant to the example above, a very important technique that falls within the domain of reducing specialization is *argument standardization*: instead of
@@ -204,7 +243,7 @@ The "standardizing method" `foo(x, y)` is short and therefore quick to compile,
    Without it, `foo(x, y)` might call itself in an infinite loop, ultimately triggering a StackOverflowError.
    StackOverflowErrors are a particularly nasty form of error, and the typeassert ensures that you get a simple `TypeError` instead.
-    In other contexts, such typeasserts would also have the effect of fixing inference problems even if the type of `x` is not well-inferred (this will be discussed in more detail [later](@ref typeasserts)), but in this case dispatch to `foo(x::X, y::Y)` would have ensured the same outcome.
+    In other contexts, such typeasserts would also have the effect of fixing inference problems even if the type of `x` is not well-inferred, but in this case dispatch to `foo(x::X, y::Y)` would have ensured the same outcome.
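Here is a minimal sketch of the standardization pattern under discussion (a hypothetical `X` and `foo`; the full example in the documentation source is elided from this diff):
```julia
struct X
    val::Int
end
Base.convert(::Type{X}, x::Real) = X(Int(x))   # how inputs get "standardized"
# The expensive-to-compile core, compiled only once, for X:
function foo(x::X, y)
    # ... imagine a lengthy body here ...
    return x.val + y
end
# The short "standardizing method": the typeassert guards against an
# infinite self-call if `convert` fails to return an `X`.
foo(x, y) = foo(convert(X, x)::X, y)
foo(3, 1.0)    # standardizes 3 to X(3), then runs the core method
```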
There are of course cases where you can't implement your code in this way: after all, part of the power of Julia is the ability of generic methods to "do the right thing" for a wide variety of types.
But in cases where you're doing a standard task, e.g., writing some data to a file, there's really no good reason to recompile your `save` method for a filename encoded as a `String` and again for a `SubString{String}` and again for a `SubstitutionString` and again for an `AbstractString` and ...: after all, the core of the `save` method probably isn't sensitive to the precise encoding of the filename.
In such cases, it should be safe to convert all filenames to `String`, thereby reducing the diversity of input arguments for expensive-to-compile methods.
diff --git a/docs/src/snoop_inference.md b/docs/src/tutorials/snoop_inference.md
similarity index 62%
rename from docs/src/snoop_inference.md
rename to docs/src/tutorials/snoop_inference.md
index ce493880..ebd1389a 100644
--- a/docs/src/snoop_inference.md
+++ b/docs/src/tutorials/snoop_inference.md
@@ -1,23 +1,23 @@
-# Snooping on inference: `@snoop_inference`
+# Tutorial on `@snoop_inference`
-!!! compat
-    `@snoop_inference` is available on `Julia 1.6.0-DEV.1190` or above, but the results can be relevant for all Julia versions.
+Inference is the first step of *type-specialized* compilation, and it may occur when you *run* code. `@snoop_inference` collects data on what inference is doing, giving you greater insight into what is being inferred and how long it takes.
-Currently, `precompile` only caches results for type-inference, not other stages in code generation.
-For that reason, efforts at reducing latency should be informed by measuring the amount of time spent on type-inference.
-Moreover, because all code needs to be type-inferred before undergoing later stages of code generation, monitoring this "entry point" can give you an overview of the entire compile chain.
+Compilation is needed only for "fresh" code; running the demos below on code you've already used will yield misleading results. When analyzing inference, you're advised to always start from a fresh session. See also the [comparison between SnoopCompile and JET](@ref JET).
-The rich data collected by `@snoop_inference` are useful for several different purposes;
-on this page, we'll describe the basic tool and show how it can be used to profile inference.
-On later pages we'll show other ways to use the data to reduce the amount of type-inference or cache its results.
+### Add SnoopCompileCore, SnoopCompile, and helper packages to your environment
-## Collecting the data
+Here, we'll add these packages to your [default environment](https://pkgdocs.julialang.org/v1/environments/). (With the exception of `AbstractTrees`, these "developer tool" packages should not be added to the Project file of any real packages unless you're extending the tool itself.)
-Like [`@snoop_invalidations`](@ref), `@snoop_inference` is exported by both `SnoopCompileCore` and `SnoopCompile`, but in this case there is not as much reason to do the data collection by a very minimal package. Consequently here we'll just load `SnoopCompile` at the outset.
+```
+using Pkg
+Pkg.add(["SnoopCompileCore", "SnoopCompile", "AbstractTrees", "ProfileView"]);
+```
+
+## Setting up the demo
To see `@snoop_inference` in action, we'll use the following demo:
-```jldoctest flatten-demo
+```jldoctest flatten-demo; filter=r"Main\.var\"Main\"\."
module FlattenDemo
    struct MyType{T} x::T end
@@ -42,23 +42,29 @@ FlattenDemo
```
The main call, `packintype`, stores the input in a `struct`, and then calls functions that extract the field value and perform arithmetic on the result.
-To profile inference on this call, we simply do the following: -```jldoctest flatten-demo; setup=:(using SnoopCompile), filter=r"([0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?|WARNING: replacing module FlattenDemo\.\n)" -julia> tinf = @snoop_inference FlattenDemo.packintype(1) +## [Collecting the data](@id sccshow) + +To profile inference on this call, do the following: + +```jldoctest flatten-demo; filter=r"([0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?|WARNING: replacing module FlattenDemo\.\n)" +julia> using SnoopCompileCore + +julia> tinf = @snoop_inference FlattenDemo.packintype(1); + +julia> using SnoopCompile + +julia> tinf InferenceTimingNode: 0.002712/0.003278 on Core.Compiler.Timings.ROOT() with 1 direct children ``` !!! tip - Inference gets called only on the *first* invocation of a method with those specific types. You have to redefine the `FlattenDemo` module (by just re-executing the command we used to define it) if you want to collect data with `@snoop_inference` on the same code a second time. - - To make it easier to perform these demonstrations and use them for documentation purposes, `SnoopCompile` includes a function [`SnoopCompile.flatten_demo()`](@ref) that redefines the module and returns `tinf`. + Don't omit the semicolon on the `tinf = @snoop_inference ...` line, or you may get an enormous amount of output. The compact display on the final line is possible only because `SnoopCompile` defines nice `Base.show` methods for the data returned by `@snoop_inference`. These methods cannot be defined in `SnoopCompileCore` because it has a fundamental design constraint: loading `SnoopCompileCore` is not allowed to invalidate any code. Moving those `Base.show` methods to `SnoopCompileCore` would violate that guarantee. This may not look like much, but there's a wealth of information hidden inside `tinf`. ## A quick check for potential invalidations - After running `@snoop_inference`, it's generally recommended to check the output of [`staleinstances`](@ref): ```julia julia> staleinstances(tinf) @@ -66,20 +72,11 @@ SnoopCompileCore.InferenceTiming[] ``` If you see this, all's well. -A non-empty list might indicate method invalidations, which can be checked (in a fresh session) by running the identical workload with [`@snoop_invalidations`](@ref). - -!!! warning - Rampant invalidation can make the process of analyzing `tinf` more confusing: "why am I getting reinference of this `MethodInstance` when I `precompile`d it?" Routine use of `staleinstances` at the beginning can save you some head-scratching later. - -!!! tip - Your workload may load packages and/or (re)define methods; these can be sources of invalidation and therefore non-empty output - from `staleinstances`. - One trick that may circumvent some invalidation is to load the packages and make the method definitions before launching `@snoop_inference`, because it ensures the methods are in place - before your workload triggers compilation. +A non-empty list might indicate method invalidations, which can be checked (in a fresh session) using the tools described in [Tutorial on `@snoop_invalidations`](@ref). If you do have a lot of invalidations, [`precompile_blockers`](@ref) may be an effective way to reveal those invalidations that affect your particular package and workload. -## Viewing the results +## [Viewing the results](@id flamegraph) Let's start unpacking the output of `@snoop_inference` and see how to get more insight. 
First, notice that the output is an `InferenceTimingNode`: it's the root element of a tree of such nodes, all connected by caller-callee relationships.
@@ -89,7 +86,7 @@ You may have noticed that this `ROOT` node prints with two numbers.
It will be easier to understand their meaning if we first display the whole tree.
We can do that with the [AbstractTrees](https://github.com/JuliaCollections/AbstractTrees.jl) package:
-```jldoctest flatten-demo; filter=r"[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?"
+```jldoctest flatten-demo; filter=[r"[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?", r"Main\.var\"Main\"\."]
julia> using AbstractTrees
julia> print_tree(tinf)
@@ -105,7 +102,7 @@ InferenceTimingNode: 0.002712/0.003278 on Core.Compiler.Timings.ROOT() with 1 di
This tree structure reveals the caller-callee relationships, showing the specific types that were used for each `MethodInstance`.
Indeed, as the calls to `getproperty` reveal, it goes beyond the types and even shows the results of [constant propagation](https://en.wikipedia.org/wiki/Constant_folding);
-the `getproperty(::MyType{Int64}, x::Symbol)` (note `x::Symbol` instead of just plain `::Symbol`) means that the call was `getproperty(y, :x)`, which corresponds to `y.x` in the definition of `extract`.
+the `getproperty(::MyType{Int64}, x::Symbol)` corresponds to `y.x` in the definition of `extract`.
!!! note
    Generally we speak of [call graphs](https://en.wikipedia.org/wiki/Call_graph) rather than call trees.
@@ -147,7 +144,7 @@ julia> ProfileView.view(fg)
You should see something like this:
-![flamegraph](assets/flamegraph-flatten-demo.png)
+![flamegraph](../assets/flamegraph-flatten-demo.png)
Users are encouraged to read the ProfileView documentation to understand how to interpret this, but briefly:
@@ -157,7 +154,7 @@ Users are encouraged to read the ProfileView documentation to understand how to
- right-clicking on a box opens the corresponding method in your editor
- ctrl-click can be used to zoom in
- empty horizontal spaces correspond to activities other than type-inference
-- any boxes colored red (there are none in this particular example, but you'll see some later) correspond to *non-precompilable* `MethodInstance`s, in which the method is owned by one module but the types are from another unrelated module.
+- any boxes colored red (there are none in this particular example, but you'll see some later) correspond to *naively non-precompilable* `MethodInstance`s, in which the method is owned by one module but the types are from another unrelated module. Such `MethodInstance`s are omitted from the precompile cache file unless they've been "marked" by `PrecompileTools.@compile_workload` or an explicit `precompile` directive.
- any boxes colored orange-yellow (there is one in this demo) correspond to methods inferred for specific constants (constant propagation)
You can explore this flamegraph and compare it to the output from `print_tree`.
@@ -165,5 +162,4 @@ You can explore this flamegraph and compare it to the output from `print_tree`.
Finally, [`flatten`](@ref), on its own or together with [`accumulate_by_source`](@ref), allows you to get a sense for the cost of individual `MethodInstance`s or `Method`s.
The tools here allow you to get an overview of where inference is spending its time.
Sometimes, this information alone is enough to show you how to change your code to reduce latency: perhaps your code is spending a lot of time inferring cases that are not needed in practice and could be simplified.
-However, most efforts at latency reduction will probably leverage additional tools (described next) that help identify the main opportunities for intervention.
+This gives you insight into the major contributors to latency.
diff --git a/docs/src/tutorials/snoop_inference_analysis.md b/docs/src/tutorials/snoop_inference_analysis.md
new file mode 100644
index 00000000..34dee1cb
--- /dev/null
+++ b/docs/src/tutorials/snoop_inference_analysis.md
@@ -0,0 +1,264 @@
+# [Using `@snoop_inference` results to improve inferrability](@id inferrability)
+
+Throughout this page, we'll use the `OptimizeMe` demo, which ships with `SnoopCompile`.
+
+!!! note
+    To understand what follows, it's essential to refer to [`OptimizeMe` source code](https://github.com/timholy/SnoopCompile.jl/blob/master/examples/OptimizeMe.jl) as you follow along.
+
+```@repl fix-inference
+using SnoopCompileCore, SnoopCompile # here we need the SnoopCompile path for the next line (normally you should wait until after data collection is complete)
+include(joinpath(pkgdir(SnoopCompile), "examples", "OptimizeMe.jl"))
+tinf = @snoop_inference OptimizeMe.main();
+fg = flamegraph(tinf)
+```
+
+If you visualize `fg` with ProfileView, you may see something like this:
+
+![flamegraph-OptimizeMe](../assets/flamegraph-OptimizeMe.png)
+
+From the standpoint of precompilation, this has some obvious problems:
+
+- even though we called a single method, `OptimizeMe.main()`, there are many distinct flames separated by blank spaces. This indicates that many calls are being made by runtime dispatch: each separate flame is a fresh entrance into inference.
+- several of the flames are marked in red, indicating that they are not naively precompilable (see the [Tutorial on `@snoop_inference`](@ref)). While `@compile_workload` can handle these flames, an even more robust solution is to eliminate them altogether.
+
+Our goal will be to improve the design of `OptimizeMe` to make it more readily precompilable.
+
+## Analyzing inference triggers
+
+We'll first extract the "triggers" of inference, which is just a repackaging of part of the information contained within `tinf`.
+Specifically, an [`InferenceTrigger`](@ref) captures callee/caller relationships that straddle a fresh entrance to type-inference, allowing you to identify which calls were made by runtime dispatch and what `MethodInstance` they called.
+
+```@repl fix-inference
+itrigs = inference_triggers(tinf)
+```
+
+The number of elements in this `Vector{InferenceTrigger}` tells you how many calls were (1) made by runtime dispatch and (2) for a callee that had not previously been inferred.
+
+!!! tip
+    In the REPL, `SnoopCompile` displays `InferenceTrigger`s with yellow coloration for the callee, red for the caller method, and blue for the caller specialization. This makes it easier to quickly identify the most important information.
+
+In some cases, this might indicate that you'll need to fix each case separately; fortunately, in many cases fixing one problem addresses many others.
+
+### [Method triggers](@id methtrigs)
+
+Most often, it's convenient to organize them by the method triggering the need for inference:
+
+```@repl fix-inference
+mtrigs = accumulate_by_source(Method, itrigs)
+```
+
+The methods triggering the largest number of inference runs are shown at the bottom.
+You can also select methods from a particular module:
+
+```@repl fix-inference
+modtrigs = filtermod(OptimizeMe, mtrigs)
+```
+
+Rather than filter by a single module, you can alternatively call `SnoopCompile.parcel(mtrigs)` to split them out by module.
+In this case, most of the triggers came from `Base`, not `OptimizeMe`.
+However, many of the failures in `Base` were nevertheless indirectly due to `OptimizeMe`: our methods in `OptimizeMe` call `Base` methods with arguments that trigger internal inference failures.
+Fortunately, we'll see that using more careful design in `OptimizeMe` can avoid many of those problems.
+
+!!! tip
+    If you have a longer list of inference triggers than you feel comfortable tackling, filtering by your package's module or using [`precompile_blockers`](@ref) can be a good way to start.
+    Fixing issues in the package itself can end up resolving many of the "indirect" triggers too.
+    Also be sure to note the ability to filter out likely "noise" from [test suites](@ref test-suites).
+
+You can get an overview of each Method trigger with `summary`:
+
+```@repl fix-inference
+mtrig = modtrigs[1]
+summary(mtrig)
+```
+
+You can also say `edit(mtrig)` and be taken directly to the method you're analyzing in your editor.
+You can still "dig deep" into individual triggers:
+
+```@repl fix-inference
+itrig = mtrig.itrigs[1]
+```
+
+This is useful if you want to analyze it with `Cthulhu.ascend`.
+`Method`-based triggers, which may aggregate many different individual triggers, can be useful because tools like [Cthulhu.jl](https://github.com/JuliaDebug/Cthulhu.jl) show you the inference results for the entire `MethodInstance`, allowing you to fix many different inference problems at once.
+
+### Trigger trees
+
+While method triggers are probably the most useful way of organizing these inference triggers, for learning purposes here we'll use a more detailed scheme, which organizes inference triggers in a tree:
+
+```@repl fix-inference
+itree = trigger_tree(itrigs)
+using AbstractTrees
+print_tree(itree)
+```
+
+This gives you a big-picture overview of how the inference failures arose.
+The parent-child relationships are based on the backtraces at the entrance to inference,
+and the nodes are organized in the order in which inference occurred.
+Inspection of these trees can be informative; for example, here we notice a lot of method specializations for `Container{T}` for different `T`.
+
+We're going to march through these systematically.
+
+### `suggest` and fixing `Core.Box`
+
+You may have noticed above that `summary(mtrig)` generated a red `has Core.Box` message. Assuming that `itrig` is still the first (and it turns out, only) trigger from this method, let's look at this again, explicitly using [`suggest`](@ref), the tool that generated this hint:
+
+```@repl fix-inference
+suggest(itrig)
+```
+
+You can see that SnoopCompile recommends tackling this first; depending on how much additional code is affected, fixing a `Core.Box` allows inference to work better and may resolve other triggers.
+
+This message also directs readers to a section of [this documentation](@ref Fixing-Core.Box) that links to a page of the Julia manual describing the underlying problem.
The Julia documentation suggests a couple of fixes, of which the best (in this case) is to use the `let` statement to rebind the variable and end any "conflict" with the closure:
+
+```
+function abmult(r::Int, ys)
+    if r < 0
+        r = -r
+    end
+    let r = r # Julia #15276
+        return map(x -> howbig(r * x), ys)
+    end
+end
+```
+
+
+
+### `suggest` and a fix involving manual `eltype` specification
+
+Let's look at the other Method-trigger rooted in `OptimizeMe`:
+
+```@repl fix-inference
+mtrig = modtrigs[2]
+summary(mtrig)
+itrig = mtrig.itrigs[1]
+```
+
+If you use Cthulhu's `ascend(itrig)` you might see something like this:
+
+![ascend-lotsa](../assets/ascend_optimizeme1.png)
+
+The first thing to note here is that `cs` is inferred as an `AbstractVector`--fixing this to make it a concrete type should be our next goal. There's a second, more subtle hint: in the call menu at the bottom, the selected call is marked `< semi-concrete eval >`. This is a hint that a method is being called with a non-concrete type.
+
+What might that non-concrete type be?
+
+```@repl fix-inference
+isconcretetype(OptimizeMe.Container)
+```
+
+The statement `Container.(list)` is thus creating an `AbstractVector` with a non-concrete element type.
+You can see in greater detail what happens, inference-wise, in this snippet from `print_tree(itree)`:
+
+```
+   ├─ similar(::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}, Tuple{Base.OneTo{Int64}}, Type{Main.OptimizeMe.Container}, Tuple{Base.Broadcast.Extruded{Vector{Any}, Tuple{Bool}, Tuple{Int64}}}}, ::Type{Main.OptimizeMe.Container{Int64}})
+   ├─ setindex!(::Vector{Main.OptimizeMe.Container{Int64}}, ::Main.OptimizeMe.Container{Int64}, ::Int64)
+   ├─ Base.Broadcast.copyto_nonleaf!(::Vector{Main.OptimizeMe.Container{Int64}}, ::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}, Tuple{Base.OneTo{Int64}}, Type{Main.OptimizeMe.Container}, Tuple{Base.Broadcast.Extruded{Vector{Any}, Tuple{Bool}, Tuple{Int64}}}}, ::Base.OneTo{Int64}, ::Int64, ::Int64)
+   │  ├─ similar(::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}, Tuple{Base.OneTo{Int64}}, Type{Main.OptimizeMe.Container}, Tuple{Base.Broadcast.Extruded{Vector{Any}, Tuple{Bool}, Tuple{Int64}}}}, ::Type{Main.OptimizeMe.Container})
+   │  └─ Base.Broadcast.restart_copyto_nonleaf!(::Vector{Main.OptimizeMe.Container}, ::Vector{Main.OptimizeMe.Container{Int64}}, ::Base.Broadcast.Broadcasted
+```
+
+In rough terms, what this means is the following:
+- since the first item in `list` is an `Int`, the output initially gets created as a `Vector{Container{Int}}`
+- however, `copyto_nonleaf!` runs into trouble when it goes to copy the second item, which is a `Container{UInt8}`
+- hence, `copyto_nonleaf!` re-allocates the output array to be a generic `Vector{Container}` and then calls `restart_copyto_nonleaf!`.
+
+We can prevent all this hassle with one simple change: rewrite that line as
+
+```
+cs = Container{Any}.(list)
+```
+
+We use `Container{Any}` here because there is no more specific element type--other than an unreasonably-large `Union`--that can hold all the items in `list`.
+
+If you make these edits manually, you'll see that we've gone from dozens of `itrigs` (38 on Julia 1.10, you may get a different number on other Julia versions) down to about a dozen (13 on Julia 1.10). Real progress!
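+To convince yourself of the difference this makes, you can compare the element types of the two broadcasts directly (a quick check; `list` as defined in `OptimizeMe`):
+
+```julia
+Container = OptimizeMe.Container   # bring the type into scope for this check
+list = Any[1, 0x01, 0xffff, 2.0f0, 'a', [0], ("key", 42)]
+
+eltype(Container.(list))        # Container (abstract: element-type narrowing failed)
+eltype(Container{Any}.(list))   # Container{Any}
+isconcretetype(Container{Any})  # true: inference knows exactly what is stored
+```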
+ +### Replacing hard-to-infer calls with lower-level APIs + +We note that many of the remaining triggers are somehow related to `show`, for example: + +``` +Inference triggered to call show(::IOContext{Base.TTY}, ::MIME{Symbol("text/plain")}, ::Vector{Main.OptimizeMe.Container{Any}}) from #55 (/cache/build/builder-amdci4-0/julialang/julia-release-1-dot-10/usr/share/julia/stdlib/v1.10/REPL/src/REPL.jl:273) with specialization (::REPL.var"#55#56"{REPL.REPLDisplay{REPL.LineEditREPL}, MIME{Symbol("text/plain")}, Base.RefValue{Any}})(::Any) +``` + +In this case we see that the calling method is `#55`. This is a `gensym`, or generated symbol, indicating that the method was generated during Julia's lowering pass, and might indicate a macro, a `do` block or other anonymous function, the generator for a `@generated` function, etc. + +`edit(itrig)` (or equivalently, `edit(node)` where `node` is a child of `itree`) takes us to this method in `Base`, for which key lines are + +```julia +function display(d::REPLDisplay, mime::MIME"text/plain", x) + x = Ref{Any}(x) + with_repl_linfo(d.repl) do io + ⋮ + show(io, mime, x[]) + ⋮ +end +``` + +The generated method corresponds to the `do` block here. +The call to `show` comes from `show(io, mime, x[])`. +This implementation uses a clever trick, wrapping `x` in a `Ref{Any}(x)`, to prevent specialization of the method defined by the `do` block on the specific type of `x`. +This trick is designed to limit the number of `MethodInstance`s inferred for this `display` method. + +A great option is to replace the call to `display` with an explicit + +``` +show(stdout, MIME("text/plain"), cs) +``` + +There's one extra detail: the type of `stdout` is not fixed (and therefore not known), because one can use a terminal, a file, `devnull`, etc., as `stdout`. If you want to prevent all runtime dispatch from this call, you'd need to supply an `io::IO` object of known type as the first argument. It could, for example, be passed in to `lotsa_containers` from `main`: + +``` +function lotsa_containers(io::IO) + ⋮ + println(io, "lotsa containers:") + show(io, MIME("text/plain"), cs) +end +``` + +However, if you want it to go to `stdout`--and to allow users to redirect `stdout` to a target of their choosing--then an `io` argument may have to be of unknown type when called from `main`. + +### When you need to rely on `@compile_workload` + +Most of the remaining triggers are difficult to fix because they occur in deliberately-`@nospecialize`d portions of Julia's internal code for displaying arrays. In such cases, adding a `PrecompileTools.@compile_workload` is your best option. Here we use an interesting trick: + +``` +@compile_workload begin + lotsa_containers(devnull) # use `devnull` to suppress output + abmult(rand(-5:5), rand(3)) +end +precompile(lotsa_containers, (Base.TTY,)) +``` + +During the workload, we pass `devnull` as the `io` object to `lotsa_containers`: this suppresses the output so you don't see anything during precompilation. However, `devnull` is not a `Base.TTY`, the standard type of `stdout`. Nevertheless, this is effective because we can see that many of the callees in the remaining inference-triggers do not depend on the `io` object. + +To really ice the cake, we also add a manual `precompile` directive. (`precompile` doesn't execute the method, it just compiles it.) This doesn't "step through" runtime dispatch, but at least it precompiles the entry point. 
+Thus, at least `lotsa_containers` will be precompiled for the most likely `IO` type encountered in practice. + +With these changes, we've fixed nearly all the latency problems in `OptimizeMe`, and made it much less vulnerable to invalidation as well. You can see the final code in the [`OptimizeMeFixed` source code](https://github.com/timholy/SnoopCompile.jl/blob/master/examples/OptimizeMeFixed.jl). Note that this would have to be turned into a real package for the `@compile_workload` to have any effect. + +## [A note on analyzing test suites](@id test-suites) + +If you're doing a package analysis, it's convenient to use the package's `runtests.jl` script as a way to cover much of the package's functionality. +SnoopCompile has a couple of enhancements designed to make it easier to ignore inference triggers that come from the test suite itself. +First, `suggest.(itrigs)` may show something like this: + +``` + ./broadcast.jl:1315: inlineable (ignore this one) + ./broadcast.jl:1315: inlineable (ignore this one) + ./broadcast.jl:1315: inlineable (ignore this one) + ./broadcast.jl:1315: inlineable (ignore this one) +``` + +This indicates a broadcasting operation in the `@testset` itself. +Second, while it's a little dangerous (because `suggest` cannot entirely be trusted), you can filter these out: + +```julia +julia> itrigsel = [itrig for itrig in itrigs if !isignorable(suggest(itrig))]; + +julia> length(itrigs) +222 + +julia> length(itrigsel) +71 +``` + +While there is some risk of discarding triggers that provide clues about the origin of other triggers (e.g., they would have shown up in the same branch of the `trigger_tree`), the shorter list may help direct your attention to the "real" issues. diff --git a/docs/src/tutorials/snoop_inference_parcel.md b/docs/src/tutorials/snoop_inference_parcel.md new file mode 100644 index 00000000..f072cc4c --- /dev/null +++ b/docs/src/tutorials/snoop_inference_parcel.md @@ -0,0 +1,69 @@ +# [Using `@snoop_inference` to emit manual precompile directives](@id precompilation) + +In a few cases, it may be inconvenient or impossible to precompile using a [workload](https://julialang.github.io/PrecompileTools.jl/stable/#Tutorial:-forcing-precompilation-with-workloads). Some examples might be: +- an application that opens graphical windows +- an application that connects to a database +- an application that creates, deletes, or rewrites files on disk + +In such cases, one alternative is to create a manual list of precompile directives using Julia's `precompile(f, argtypes)` function. + +!!! warning + Manual precompile directives are much more likely to "go stale" as the package is developed---`precompile` does not throw an error if a method for the given `argtypes` cannot be found. They are also more likely to be dependent on the Julia version, operating system, or CPU architecture. Whenever possible, it's safer to use a workload. + +`precompile` directives have to be emitted by the module that owns the method and/or types. +SnoopCompile comes with a tool, `parcel`, that splits out the "root-most" precompilable MethodInstances into their constituent modules. +This will typically correspond to the bottom row of boxes in the [flame graph](@ref flamegraph). +In cases where you have some that are not naively precompilable, they will include MethodInstances from higher up in the call tree. 
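+Before diving in, it may help to see the raw behavior of `precompile` that the warning above refers to (a self-contained sketch with a hypothetical function `f`):
+
+```julia
+f(x::Int) = x + 1
+
+precompile(f, (Int,))     # returns true: a matching method exists and is compiled
+precompile(f, (String,))  # returns false: no matching method, but no error either!
+```
+
+The silent `false` is how manual directives "go stale": if the signature of `f` later changes, old directives simply stop doing anything.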
+
+Let's use `SnoopCompile.parcel` on our [`OptimizeMe`](@ref inferrability) demo:
+
+```@repl parcel-inference
+using SnoopCompileCore, SnoopCompile # here we need the SnoopCompile path for the next line (normally you should wait until after data collection is complete)
+include(joinpath(pkgdir(SnoopCompile), "examples", "OptimizeMe.jl"))
+tinf = @snoop_inference OptimizeMe.main();
+ttot, pcs = SnoopCompile.parcel(tinf);
+ttot
+pcs
+```
+
+`ttot` shows the total amount of time spent on type-inference.
+`parcel` discovered precompilable MethodInstances for four modules (`Core`, `Base.Multimedia`, `Base`, and `OptimizeMe`) that might benefit from precompile directives.
+These are listed in increasing order of inference time.
+
+Let's look specifically at `OptimizeMe`, since that's under our control:
+
+```@repl parcel-inference
+pcmod = pcs[end]
+tmod, tpcs = pcmod.second;
+tmod
+tpcs
+```
+
+This indicates the amount of time spent specifically on `OptimizeMe`, plus the list of calls that could be precompiled in that module.
+
+We could look at the other modules (packages) similarly.
+
+## SnoopCompile.write
+
+You can generate files that contain ready-to-use `precompile` directives using `SnoopCompile.write`:
+
+```@repl parcel-inference
+SnoopCompile.write("/tmp/precompiles_OptimizeMe", pcs)
+```
+
+You'll now find a directory `/tmp/precompiles_OptimizeMe`, and inside you'll find files for modules that could have precompile directives added manually.
+The contents of the last of these should be recognizable:
+
+```julia
+function _precompile_()
+    ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
+    Base.precompile(Tuple{typeof(main)})   # time: 0.4204474
+end
+```
+
+The first `ccall` line ensures we only pay the cost of running these `precompile` directives if we're building the package; this is relevant mostly if you're running Julia with `--compiled-modules=no`, which can be a convenient way to disable precompilation and examine packages in their "native state."
+(It would also matter if you've set `__precompile__(false)` at the top of your module, but if so why are you reading this?)
+
+This file is ready to be moved into the `OptimizeMe` repository and `include`d into your module definition.
+
+You might also consider submitting some of the other files (or their `precompile` directives) to the packages you depend on.
diff --git a/docs/src/tutorials/snoop_llvm.md b/docs/src/tutorials/snoop_llvm.md
new file mode 100644
index 00000000..a1b2eacb
--- /dev/null
+++ b/docs/src/tutorials/snoop_llvm.md
@@ -0,0 +1,37 @@
+# Tutorial on `@snoop_llvm`
+
+Julia uses the [LLVM compiler](https://llvm.org/) to generate machine code. Typically, the two main contributors to the overall compile time are inference and LLVM, and thus together `@snoop_inference` and `@snoop_llvm` collect fairly comprehensive data on the compiler.
+
+`@snoop_llvm` has a somewhat different design than `@snoop_inference`: while `@snoop_inference` runs in the same session that you'll be using for analysis (and thus requires that you remember to do the data gathering in a fresh session), `@snoop_llvm` spawns a fresh process to collect the data. The downside is that you get less interactivity, as the data have to be written out in intermediate form as text files.
+
+### Add SnoopCompileCore and SnoopCompile to your environment
+
+Here, we'll add these packages to your [default environment](https://pkgdocs.julialang.org/v1/environments/).
+
+```
+using Pkg
+Pkg.add(["SnoopCompileCore", "SnoopCompile"]);
+```
+
+## Collecting the data
+
+Here's a simple demonstration of usage:
+
+```@repl tutorial-llvm
+using SnoopCompileCore
+@snoop_llvm "func_names.csv" "llvm_timings.yaml" begin
+    using InteractiveUtils
+    @eval InteractiveUtils.peakflops()
+end
+
+using SnoopCompile
+times, info = SnoopCompile.read_snoop_llvm("func_names.csv", "llvm_timings.yaml", tmin_secs = 0.025);
+```
+
+The `@snoop_llvm` block writes two files, `"func_names.csv"` and `"llvm_timings.yaml"`, to your current working directory; `read_snoop_llvm` then parses them, keeping entries that took at least `tmin_secs`. Let's look at what was read from these files:
+
+```@repl tutorial-llvm
+times
+info
+```
+
diff --git a/examples/OptimizeMe.jl b/examples/OptimizeMe.jl
index 20952407..1a7a2504 100644
--- a/examples/OptimizeMe.jl
+++ b/examples/OptimizeMe.jl
@@ -1,7 +1,8 @@
"""
-OptimizeMe is a demonstration module used in illustrating how to improve code and generate effective `precompile` directives.
-It has deliberate weaknesses in its design, and the analysis of these weaknesses via `@snoop_inference` is discussed
-in the documentation.
+OptimizeMe is a module used to demonstrate how to make code more precompilable
+and more resistant to invalidation. It has deliberate weaknesses in its design,
+and the analysis and resolution of these weaknesses via `@snoop_inference` is
+discussed in the documentation.
"""
module OptimizeMe
@@ -16,34 +17,21 @@ function lotsa_containers()
    display(cs)
end
-concat_string(c1::Container, c2::Container) = string(c1.value) * ' ' * string(c2.value)
+howbig(str::AbstractString) = length(str)
+howbig(x::Char) = 1
+howbig(x::Unsigned) = x
+howbig(x::Real) = abs(x)
-function contain_concrete(item1, item2)
-    c1 = Container(item1)
-    c2 = Container(item2)
-    return concat_string(c1, c2)
-end
-
-function contain_list(list)
-    cs = Container.(list)
-    return concat_string(cs...)
-end
-
-struct Object
-    x::Int
-end
-
-function makeobjects()
-    xs = [1:5; 7]
-    return Object.(xs)
+function abmult(r::Int, ys)
+    if r < 0
+        r = -r
+    end
+    return map(x -> howbig(r * x), ys)
end
function main()
    lotsa_containers()
-    println(contain_concrete(3.14, "is great"))
-    list = [2.718, "is jealous"]
-    println(contain_list(list))
-    display(makeobjects())
+    return abmult(rand(-5:5), rand(3))
end
end
""" module OptimizeMeFixed +using PrecompileTools + struct Container{T} value::T end +Base.show(io::IO, c::Container) = print(io, "Container(", c.value, ")") -function lotsa_containers() - list = Any[1, 0x01, 0xffff, 2.0f0, 'a', [0], ("key", 42)] +function lotsa_containers(io::IO) + list = [1, 0x01, 0xffff, 2.0f0, 'a', [0], ("key", 42)] cs = Container{Any}.(list) - println("lotsa containers:") - display(cs) + println(io, "lotsa containers:") + show(io, MIME("text/plain"), cs) end -concat_string(c1::Container, c2::Container) = string(c1.value) * ' ' * string(c2.value) - -function contain_concrete(item1, item2) - c1 = Container(item1) - c2 = Container(item2) - return concat_string(c1, c2) -end - -function contain_list(list) - length(list) == 2 || throw(DimensionMismatch("list must have length 2")) - item1 = convert(Float64, list[1])::Float64 - item2 = list[2]::String - return contain_concrete(item1, item2) -end +howbig(str::AbstractString) = length(str) +howbig(x::Char) = 1 +howbig(x::Unsigned) = x +howbig(x::Real) = abs(x) -struct Object - x::Int +function abmult(r::Int, ys) + if r < 0 + r = -r + end + let r = r # Julia #15276 + return map(x -> howbig(r * x), ys) + end end -Base.show(io::IO, o::Object) = print(io, "Object x: ", o.x) -function makeobjects() - xs = [1:5; 7:7] - return Object.(xs) +function main() + lotsa_containers(stdout) + return abmult(rand(-5:5), rand(3)) end -# "Stub" callers for precompilability -function warmup() - mime = MIME("text/plain") - io = Base.stdout::Base.TTY - v = [Container{Any}(0)] - show(io, mime, v) - show(IOContext(io), mime, v) - v = [Object(0)] - show(io, mime, v) - show(IOContext(io), mime, v) - return nothing -end -function main() - lotsa_containers() - println(contain_concrete(3.14, "is great")) - list = [2.718, "is jealous"] - println(contain_list(list)) - display(makeobjects()) +@compile_workload begin + lotsa_containers(devnull) # use `devnull` to suppress output + abmult(rand(-5:5), rand(3)) end - -precompile(Tuple{typeof(main)}) # time: 0.4204474 -precompile(Tuple{typeof(warmup)}) +# since `devnull` is not a `Base.TTY`--the standard type of `stdout`--let's also +# use an explicit `precompile` directive. (Note this does not trigger any visible output). +# This doesn't "follow" runtime dispatch but at least it precompiles the entry point. +precompile(lotsa_containers, (Base.TTY,)) end diff --git a/ext/SCPyPlotExt.jl b/ext/SCPyPlotExt.jl index 9c924735..fca26cca 100644 --- a/ext/SCPyPlotExt.jl +++ b/ext/SCPyPlotExt.jl @@ -7,24 +7,6 @@ using PyPlot: PyPlot, plt, PyCall get_bystr(@nospecialize(by)) = by === inclusive ? "Inclusive" : by === exclusive ? "Exclusive" : error("unknown ", by) -""" - methodref, ax = pgdsgui(tinf::InferenceTimingNode; consts::Bool=true, by=inclusive) - methodref = pgdsgui(ax, tinf::InferenceTimingNode; kwargs...) - -Create a scatter plot comparing: - - (vertical axis) the inference time for all instances of each Method, as captured by `tinf`; - - (horizontal axis) the run time cost, as estimated by capturing a `@profile` before calling this function. - -Each dot corresponds to a single method. The face color encodes the number of times that method was inferred, -and the edge color corresponds to the fraction of the runtime spent on runtime dispatch (black is 0%, bright red is 100%). -Clicking on a dot prints the method (or location, if inlined) to the REPL, and sets `methodref[]` to -that method. - -`ax` is the pyplot axis of the scatterplot. - -!!! compat - `pgdsgui` depends on PyPlot via the Requires.jl package. 
You must load both SnoopCompile and PyPlot for this function to be defined. -""" function pgdsgui(ax::PyCall.PyObject, ridata::AbstractVector{Pair{Union{Method,MethodLoc},PGDSData}}; bystr, consts, markersz=25, linewidth=0.5, t0 = 0.001, interactive::Bool=true, kwargs...) methodref = Ref{Union{Method,MethodLoc}}() # returned to the user for inspection of clicked methods function onclick(event) diff --git a/src/SnoopCompile.jl b/src/SnoopCompile.jl index 84783375..29878794 100644 --- a/src/SnoopCompile.jl +++ b/src/SnoopCompile.jl @@ -72,23 +72,41 @@ end include("parcel_snoop_inference.jl") include("inference_demos.jl") -export @snoop_inference, exclusive, inclusive, flamegraph, flatten, accumulate_by_source, collect_for, runtime_inferencetime, staleinstances +export exclusive, inclusive, flamegraph, flatten, accumulate_by_source, collect_for, runtime_inferencetime, staleinstances export InferenceTrigger, inference_triggers, callerinstance, callingframe, skiphigherorder, trigger_tree, suggest, isignorable export report_callee, report_caller, report_callees include("parcel_snoop_llvm.jl") -export read_snoop_llvm, @snoop_llvm +export read_snoop_llvm include("invalidations.jl") -export @snoop_invalidations, uinvalidated, invalidation_trees, filtermod, findcaller +export uinvalidated, invalidation_trees, filtermod, findcaller include("invalidation_and_inference.jl") export precompile_blockers # Write -# include("write.jl") +include("write.jl") # For PyPlot extension +""" + methodref, ax = pgdsgui(tinf::InferenceTimingNode; consts::Bool=true, by=inclusive) + methodref = pgdsgui(ax, tinf::InferenceTimingNode; kwargs...) + +Create a scatter plot comparing: + - (vertical axis) the inference time for all instances of each Method, as captured by `tinf`; + - (horizontal axis) the run time cost, as estimated by capturing a `@profile` before calling this function. + +Each dot corresponds to a single method. The face color encodes the number of times that method was inferred, +and the edge color corresponds to the fraction of the runtime spent on runtime dispatch (black is 0%, bright red is 100%). +Clicking on a dot prints the method (or location, if inlined) to the REPL, and sets `methodref[]` to +that method. + +`ax` is the pyplot axis of the scatterplot. + +!!! compat + `pgdsgui` depends on PyPlot via the Requires.jl package. You must load both SnoopCompile and PyPlot for this function to be defined. 
+""" function pgdsgui end export pgdsgui # For PrettyTables extension diff --git a/src/parcel_snoop_inference.jl b/src/parcel_snoop_inference.jl index 56a22861..6862ef4d 100644 --- a/src/parcel_snoop_inference.jl +++ b/src/parcel_snoop_inference.jl @@ -1053,7 +1053,7 @@ function Location(itrig::InferenceTrigger) end Base.show(io::IO, loc::Location) = print(io, loc.func, " at ", loc.file, ':', loc.line) -InteractiveUtils.edit(loc::Location) = edit(string(loc.file), loc.line) +InteractiveUtils.edit(loc::Location) = edit(Base.fixup_stdlib_path(string(loc.file)), loc.line) const LocationTriggers = TaggedTriggers{Location} @@ -1169,6 +1169,8 @@ function Base.show(io::IO, s::Suggested) show_suggest(io, s.categories, rtcallee, sf) end +Base.haskey(s::Suggested, k::Suggestion) = k in s.categories + function show_suggest(io::IO, categories, rtcallee, sf) showcaller = true showvahint = showannotate = false @@ -1660,7 +1662,14 @@ function suggest!(stree, node) return stree end -Base.show(io::IO, node::SuggestNode) = print_tree(io, node) +function Base.show(io::IO, node::SuggestNode) + if node.s === nothing + print(io, "no inference trigger") + else + show(io, node.s) + end + print(" (", string(length(node.children)), " children)") +end function strip_prefix(io::IO, obj, prefix) print(io, obj) diff --git a/src/utils.jl b/src/utils.jl index 8b2d9e1b..e2335474 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -128,7 +128,7 @@ function add_repr!(list, modgens::Dict{Module, Vector{Method}}, mi::MethodInstan return add_if_evals!(list, topmod, reprcontext(topmod, p), paramrepr, tt, check_eval = check_eval, time=time) end -function handle_kwbody(topmod::Module, m::Method, paramrepr, tt, fstr="fbody"; check_eval = true, has_bodyfunction::Bool=false) +function handle_kwbody(topmod::Module, m::Method, paramrepr, tt, fstr="fbody"; check_eval = true) nameparent = Symbol(match(r"^#([^#]*)#", String(m.name)).captures[1]) if !isdefined(m.module, nameparent) @debug "Module $topmod: skipping $m due to inability to look up kwbody parent" # see example related to issue #237 @@ -140,17 +140,12 @@ function handle_kwbody(topmod::Module, m::Method, paramrepr, tt, fstr="fbody"; c can1, exc1 = can_eval(topmod, whichstr, check_eval) if can1 ttstr = tuplestring(paramrepr) - pcstr = has_bodyfunction ? """ + pcstr = """ let fbody = try Base.bodyfunction($whichstr) catch missing end if !ismissing(fbody) precompile($fstr, $ttstr) end - end""" : """ - let fbody = try __lookup_kwbody__($whichstr) catch missing end - if !ismissing(fbody) - precompile($fstr, $ttstr) - end - end""" # extra indentation because `write` will indent 1st line + end""" can2, exc2 = can_eval(topmod, pcstr, check_eval) if can2 return pcstr diff --git a/test/snoop_inference.jl b/test/snoop_inference.jl index 85e1fd04..356534f8 100644 --- a/test/snoop_inference.jl +++ b/test/snoop_inference.jl @@ -729,10 +729,7 @@ include("testmodules/SnoopBench.jl") io = IOBuffer() SnoopCompile.write(io, tmis; tmin=0.0) str = String(take!(io)) - @test occursin("__lookup_kwbody__", str) - SnoopCompile.write(io, tmis; tmin=0.0, has_bodyfunction=true) - str = String(take!(io)) - @test !occursin("__lookup_kwbody__", str) + @test occursin("bodyfunction", str) A = [a] tinf = @snoop_inference SnoopBench.mappushes(identity, A)