From 1ad36b13466191ae871aa2ed4649907229491130 Mon Sep 17 00:00:00 2001 From: Carine Morel Date: Tue, 25 Jun 2024 18:40:23 +0200 Subject: [PATCH] Create a multicore-magic-dscheck library. --- dune-project | 10 + multicore-magic-dscheck.opam | 29 ++ src/dscheck/dune | 4 + .../multicore_magic_atomic_array_dscheck.ml | 20 ++ src/dscheck/multicore_magic_dscheck.ml | 14 + src/dscheck/multicore_magic_dscheck.mli | 258 ++++++++++++++++++ src/dscheck/transparent_atomic_dscheck.ml | 3 + 7 files changed, 338 insertions(+) create mode 100644 multicore-magic-dscheck.opam create mode 100644 src/dscheck/dune create mode 100644 src/dscheck/multicore_magic_atomic_array_dscheck.ml create mode 100644 src/dscheck/multicore_magic_dscheck.ml create mode 100644 src/dscheck/multicore_magic_dscheck.mli create mode 100644 src/dscheck/transparent_atomic_dscheck.ml diff --git a/dune-project b/dune-project index d6d4ea9..2492432 100644 --- a/dune-project +++ b/dune-project @@ -41,3 +41,13 @@ (and (>= 2.4.1) :with-doc)))) + +(package + (name multicore-magic-dscheck) + (synopsis "Low-level multiscore utilities for OCaml") + (depends + (ocaml + (>= 4.12.0)) + ;; Test dependencies + (multicore-magic + (= :version)))) diff --git a/multicore-magic-dscheck.opam b/multicore-magic-dscheck.opam new file mode 100644 index 0000000..3d2c095 --- /dev/null +++ b/multicore-magic-dscheck.opam @@ -0,0 +1,29 @@ +# This file is generated by dune, edit dune-project instead +opam-version: "2.0" +synopsis: "Low-level multiscore utilities for OCaml" +maintainer: ["Vesa Karvonen "] +authors: ["Vesa Karvonen "] +license: "ISC" +homepage: "https://github.com/ocaml-multicore/multicore-magic" +bug-reports: "https://github.com/ocaml-multicore/multicore-magic/issues" +depends: [ + "dune" {>= "3.14"} + "ocaml" {>= "4.12.0"} + "multicore-magic" {= version} + "odoc" {with-doc} +] +build: [ + ["dune" "subst"] {dev} + [ + "dune" + "build" + "-p" + name + "-j" + jobs + "@install" + "@runtest" {with-test} + "@doc" {with-doc} + ] +] +dev-repo: "git+https://github.com/ocaml-multicore/multicore-magic.git" diff --git a/src/dscheck/dune b/src/dscheck/dune new file mode 100644 index 0000000..af2bb7e --- /dev/null +++ b/src/dscheck/dune @@ -0,0 +1,4 @@ +(library + (name multicore_magic_dscheck) + (package multicore-magic-dscheck) + (libraries multicore-magic dscheck)) diff --git a/src/dscheck/multicore_magic_atomic_array_dscheck.ml b/src/dscheck/multicore_magic_atomic_array_dscheck.ml new file mode 100644 index 0000000..7389b23 --- /dev/null +++ b/src/dscheck/multicore_magic_atomic_array_dscheck.ml @@ -0,0 +1,20 @@ +module Atomic = Dscheck.TracedAtomic + +type 'a t = 'a Atomic.t array + +let[@inline] at (type a) (xs : a t) i : a Atomic.t = + (* ['a t] does not contain [float]s. *) + Obj.magic (Array.unsafe_get (Obj.magic xs : a ref array) i) + +let[@inline] make n v = Array.init n @@ fun _ -> Atomic.make v +let[@inline] init n fn = Array.init n @@ fun i -> Atomic.make (fn i) +let[@inline] of_array xs = init (Array.length xs) (Array.unsafe_get xs) + +external length : 'a array -> int = "%array_length" + +let[@inline] unsafe_fenceless_set xs i v = Obj.magic (at xs i) := v +let[@inline] unsafe_fenceless_get xs i = !(Obj.magic (at xs i)) + +let[@inline] unsafe_compare_and_set xs i b a = + Atomic.compare_and_set (at xs i) b a + diff --git a/src/dscheck/multicore_magic_dscheck.ml b/src/dscheck/multicore_magic_dscheck.ml new file mode 100644 index 0000000..c8c2201 --- /dev/null +++ b/src/dscheck/multicore_magic_dscheck.ml @@ -0,0 +1,14 @@ +include Multicore_magic + + +let[@inline] fenceless_get (atomic : 'a Atomic.t) = + !(Sys.opaque_identity (Obj.magic atomic : 'a ref)) + +let[@inline] fenceless_set (atomic : 'a Atomic.t) value = + (Obj.magic atomic : 'a ref) := value + +let[@inline] fence atomic = Atomic.fetch_and_add atomic 0 |> ignore + + +module Transparent_atomic = Transparent_atomic_dscheck +module Atomic_array = Multicore_magic_atomic_array_dscheck diff --git a/src/dscheck/multicore_magic_dscheck.mli b/src/dscheck/multicore_magic_dscheck.mli new file mode 100644 index 0000000..619b09e --- /dev/null +++ b/src/dscheck/multicore_magic_dscheck.mli @@ -0,0 +1,258 @@ +(** This is a library of magic multicore utilities intended for experts for + extracting the best possible performance from multicore OCaml. + + Hopefully future releases of multicore OCaml will make this library + obsolete! *) + +(** {1 Helpers for using padding to avoid false sharing} *) + +val copy_as_padded : 'a -> 'a +(** Depending on the object, either creates a shallow clone of it or returns it + as is. When cloned, the clone will have extra padding words added after the + last used word. + + This is designed to help avoid + {{:https://en.wikipedia.org/wiki/False_sharing} false sharing}. False + sharing has a negative impact on multicore performance. Accesses of both + atomic and non-atomic locations, whether read-only or read-write, may suffer + from false sharing. + + The intended use case for this is to pad all long lived objects that are + being accessed highly frequently (read or written). + + Many kinds of objects can be padded, for example: + + {[ + let padded_atomic = Multicore_magic.copy_as_padded (Atomic.make 101) + + let padded_ref = Multicore_magic.copy_as_padded (ref 42) + + let padded_record = Multicore_magic.copy_as_padded { + number = 76; + pointer = 1 :: 2 :: 3 :: []; + } + + let padded_variant = Multicore_magic.copy_as_padded (Some 1) + ]} + + Padding changes the length of an array. If you need to pad an array, use + {!make_padded_array}. *) + +val make_padded_array : int -> 'a -> 'a array +(** Creates a padded array. The length of the returned array includes padding. + Use {!length_of_padded_array} to get the unpadded length. *) + +val length_of_padded_array : 'a array -> int +(** Returns the length of an array created by {!make_padded_array} without the + padding. + + {b WARNING}: This is not guaranteed to work with {!copy_as_padded}. *) + +val length_of_padded_array_minus_1 : 'a array -> int +(** Returns the length of an array created by {!make_padded_array} without the + padding minus 1. + + {b WARNING}: This is not guaranteed to work with {!copy_as_padded}. *) + +(** {1 Missing [Atomic] operations} *) + +val fenceless_get : 'a Atomic.t -> 'a +(** Get a value from the atomic without performing an acquire fence. + + Consider the following prototypical example of a lock-free algorithm: + + {[ + let rec prototypical_lock_free_algorithm () = + let expected = Atomic.get atomic in + let desired = (* computed from expected *) in + if not (Atomic.compare_and_set atomic expected desired) then + (* failure, maybe retry *) + else + (* success *) + ]} + + A potential performance problem with the above example is that it performs + two acquire fences. Both the [Atomic.get] and the [Atomic.compare_and_set] + perform an acquire fence. This may have a negative impact on performance. + + Assuming the first fence is not necessary, we can rewrite the example using + {!fenceless_get} as follows: + + {[ + let rec prototypical_lock_free_algorithm () = + let expected = Multicore_magic.fenceless_get atomic in + let desired = (* computed from expected *) in + if not (Atomic.compare_and_set atomic expected desired) then + (* failure, maybe retry *) + else + (* success *) + ]} + + Now only a single acquire fence is performed by [Atomic.compare_and_set] and + performance may be improved. *) + +val fenceless_set : 'a Atomic.t -> 'a -> unit +(** Set the value of an atomic without performing a full fence. + + Consider the following example: + + {[ + let new_atomic = Atomic.make dummy_value in + (* prepare data_structure referring to new_atomic *) + Atomic.set new_atomic data_structure; + (* publish the data_structure: *) + Atomic.exchance old_atomic data_structure + ]} + + A potential performance problem with the above example is that it performs + two full fences. Both the [Atomic.set] used to initialize the data + structure and the [Atomic.exchange] used to publish the data structure + perform a full fence. The same would also apply in cases where + [Atomic.compare_and_set] or [Atomic.set] would be used to publish the data + structure. This may have a negative impact on performance. + + Using {!fenceless_set} we can rewrite the example as follows: + + {[ + let new_atomic = Atomic.make dummy_value in + (* prepare data_structure referring to new_atomic *) + Multicore_magic.fenceless_set new_atomic data_structure; + (* publish the data_structure: *) + Atomic.exchance old_atomic data_structure + ]} + + Now only a single full fence is performed by [Atomic.exchange] and + performance may be improved. *) + +val fence : int Atomic.t -> unit +(** Perform a full acquire-release fence on the given atomic. + + [fence atomic] is equivalent to [ignore (Atomic.fetch_and_add atomic 0)]. *) + +(** {1 Fixes and workarounds} *) + +module Transparent_atomic : sig + (** A replacement for [Stdlib.Atomic] with fixes and performance improvements + + [Stdlib.Atomic.get] is incorrectly subject to CSE optimization in OCaml + 5.0.0 and 5.1.0. This can result in code being generated that can produce + results that cannot be explained with the OCaml memory model. It can also + sometimes result in code being generated where a manual optimization to + avoid writing to memory is defeated by the compiler as the compiler + eliminates a (repeated) read access. This module implements {!get} such + that argument to [Stdlib.Atomic.get] is passed through + [Sys.opaque_identity], which prevents the compiler from applying the CSE + optimization. + + OCaml 5 generates inefficient accesses of ['a Stdlib.Atomic.t array]s + assuming that the array might be an array of [float]ing point numbers. + That is because the [Stdlib.Atomic.t] type constructor is opaque, which + means that the compiler cannot assume that [_ Stdlib.Atomic.t] is not the + same as [float]. This module defines {{!t} the type} as [private 'a ref], + which allows the compiler to know that it cannot be the same as [float], + which allows the compiler to generate more efficient array accesses. This + can both improve performance and reduce size of generated code when using + arrays of atomics. *) + + type !'a t + + val make : 'a -> 'a t + val make_contended : 'a -> 'a t + val get : 'a t -> 'a + val fenceless_get : 'a t -> 'a + val set : 'a t -> 'a -> unit + val fenceless_set : 'a t -> 'a -> unit + val exchange : 'a t -> 'a -> 'a + val compare_and_set : 'a t -> 'a -> 'a -> bool + val fetch_and_add : int t -> int -> int + val incr : int t -> unit + val decr : int t -> unit +end + +(** {1 Missing functionality} *) + +module Atomic_array : sig + (** Array of (potentially unboxed) atomic locations. + + Where available, this uses an undocumented operation exported by the OCaml + 5 runtime, + {{:https://github.com/ocaml/ocaml/blob/7a5d882d22cdd32b6319e9be680bd1a3d67377a9/runtime/memory.c#L313-L338} + [caml_atomic_cas_field]}, which makes it possible to perform sequentially + consistent atomic updates of record fields and array elements. + + Hopefully a future version of OCaml provides more comprehensive and even + more efficient support for both sequentially consistent and relaxed atomic + operations on records and arrays. *) + + type !'a t + (** Represents an array of atomic locations. *) + + val make : int -> 'a -> 'a t + (** [make n value] creates a new array of [n] atomic locations having given + [value]. *) + + val of_array : 'a array -> 'a t + (** [of_array non_atomic_array] create a new array of atomic locations as a + copy of the given [non_atomic_array]. *) + + val init : int -> (int -> 'a) -> 'a t + (** [init n fn] is equivalent to {{!of_array} [of_array (Array.init n fn)]}. *) + + val length : 'a t -> int + (** [length atomic_array] returns the length of the [atomic_array]. *) + + val unsafe_fenceless_get : 'a t -> int -> 'a + (** [unsafe_fenceless_get atomic_array index] reads and returns the value at + the specified [index] of the [atomic_array]. + + ⚠️ The read is {i relaxed} and may be reordered with respect to other reads + and writes in program order. + + ⚠️ No bounds checking is performed. *) + + val unsafe_fenceless_set : 'a t -> int -> 'a -> unit + (** [unsafe_fenceless_set atomic_array index value] writes the given [value] + to the specified [index] of the [atomic_array]. + + ⚠️ The write is {i relaxed} and may be reordered with respect to other + reads and (non-initializing) writes in program order. + + ⚠️ No bounds checking is performed. *) + + val unsafe_compare_and_set : 'a t -> int -> 'a -> 'a -> bool + (** [unsafe_compare_and_set atomic_array index before after] atomically + updates the specified [index] of the [atomic_array] to the [after] value + in case it had the [before] value and returns a boolean indicating whether + that was the case. This operation is {i sequentially consistent} and may + not be reordered with respect to other reads and writes in program order. + + ⚠️ No bounds checking is performed. *) +end + +(** {1 Avoiding contention} *) + +val instantaneous_domain_index : unit -> int +(** [instantaneous_domain_index ()] potentially (re)allocates and returns a + non-negative integer "index" for the current domain. The indices are + guaranteed to be unique among the domains that exist at a point in time. + Each call of [instantaneous_domain_index ()] may return a different index. + + The intention is that the returned value can be used as an index into a + contention avoiding parallelism safe data structure. For example, a naïve + scalable increment of one counter from an array of counters could be done as + follows: + + {[ + let incr counters = + (* Assuming length of [counters] is a power of two and larger than + the number of domains. *) + let mask = Array.length counters - 1 in + let index = instantaneous_domain_index () in + Atomic.incr counters.(index land mask) + ]} + + The implementation ensures that the indices are allocated as densely as + possible at any given moment. This should allow allocating as many counters + as needed and essentially eliminate contention. + + On OCaml 4 [instantaneous_domain_index ()] will always return [0]. *) diff --git a/src/dscheck/transparent_atomic_dscheck.ml b/src/dscheck/transparent_atomic_dscheck.ml new file mode 100644 index 0000000..a17cc84 --- /dev/null +++ b/src/dscheck/transparent_atomic_dscheck.ml @@ -0,0 +1,3 @@ +include Dscheck.TracedAtomic +let fenceless_get = get +let fenceless_set = set \ No newline at end of file