From f7aa64a72fdf669eb3e03b4b52cc1b6c90dbf0c8 Mon Sep 17 00:00:00 2001
From: ltdk
Date: Sat, 5 Oct 2024 21:25:19 -0400
Subject: [PATCH] Further sequester Group/Tag code

---
 src/{raw => control}/bitmask.rs       |   4 +-
 src/{raw => control/group}/generic.rs |   9 +-
 src/control/group/mod.rs              |  35 +++++++
 src/{raw => control/group}/neon.rs    |   9 +-
 src/{raw => control/group}/sse2.rs    |   9 +-
 src/control/mod.rs                    |  10 ++
 src/control/tag.rs                    |  81 ++++++++++++++++
 src/lib.rs                            |   2 +
 src/raw/mod.rs                        | 132 ++++----------------------
 src/util.rs                           |  14 +++
 10 files changed, 169 insertions(+), 136 deletions(-)
 rename src/{raw => control}/bitmask.rs (99%)
 rename src/{raw => control/group}/generic.rs (94%)
 create mode 100644 src/control/group/mod.rs
 rename src/{raw => control/group}/neon.rs (93%)
 rename src/{raw => control/group}/sse2.rs (94%)
 create mode 100644 src/control/mod.rs
 create mode 100644 src/control/tag.rs
 create mode 100644 src/util.rs

diff --git a/src/raw/bitmask.rs b/src/control/bitmask.rs
similarity index 99%
rename from src/raw/bitmask.rs
rename to src/control/bitmask.rs
index 87a5a6462..cfacfce67 100644
--- a/src/raw/bitmask.rs
+++ b/src/control/bitmask.rs
@@ -1,4 +1,4 @@
-use super::imp::{
+use super::group::{
     BitMaskWord, NonZeroBitMaskWord, BITMASK_ITER_MASK, BITMASK_MASK, BITMASK_STRIDE,
 };
 
@@ -102,7 +102,7 @@ impl IntoIterator for BitMask {
 
 /// Iterator over the contents of a `BitMask`, returning the indices of set
 /// bits.
-#[derive(Copy, Clone)]
+#[derive(Clone)]
 pub(crate) struct BitMaskIter(pub(crate) BitMask);
 
 impl Iterator for BitMaskIter {
diff --git a/src/raw/generic.rs b/src/control/group/generic.rs
similarity index 94%
rename from src/raw/generic.rs
rename to src/control/group/generic.rs
index 435164479..223070997 100644
--- a/src/raw/generic.rs
+++ b/src/control/group/generic.rs
@@ -1,5 +1,4 @@
-use super::bitmask::BitMask;
-use super::Tag;
+use super::super::{BitMask, Tag};
 use core::{mem, ptr};
 
 // Use the native word size as the group size. Using a 64-bit group size on
@@ -81,8 +80,7 @@ impl Group {
     #[inline]
     #[allow(clippy::cast_ptr_alignment)]
     pub(crate) unsafe fn load_aligned(ptr: *const Tag) -> Self {
-        // FIXME: use align_offset once it stabilizes
-        debug_assert_eq!(ptr as usize & (mem::align_of::<Self>() - 1), 0);
+        debug_assert_eq!(ptr.align_offset(mem::align_of::<Self>()), 0);
         Group(ptr::read(ptr.cast()))
     }
 
@@ -91,8 +89,7 @@ impl Group {
     #[inline]
     #[allow(clippy::cast_ptr_alignment)]
     pub(crate) unsafe fn store_aligned(self, ptr: *mut Tag) {
-        // FIXME: use align_offset once it stabilizes
-        debug_assert_eq!(ptr as usize & (mem::align_of::<Self>() - 1), 0);
+        debug_assert_eq!(ptr.align_offset(mem::align_of::<Self>()), 0);
         ptr::write(ptr.cast(), self.0);
     }
 
diff --git a/src/control/group/mod.rs b/src/control/group/mod.rs
new file mode 100644
index 000000000..614326048
--- /dev/null
+++ b/src/control/group/mod.rs
@@ -0,0 +1,35 @@
+cfg_if! {
+    // Use the SSE2 implementation if possible: it allows us to scan 16 buckets
+    // at once instead of 8. We don't bother with AVX since it would require
+    // runtime dispatch and wouldn't gain us much anyways: the probability of
+    // finding a match drops off drastically after the first few buckets.
+    //
+    // I attempted an implementation on ARM using NEON instructions, but it
+    // turns out that most NEON instructions have multi-cycle latency, which in
+    // the end outweighs any gains over the generic implementation.
+    if #[cfg(all(
+        target_feature = "sse2",
+        any(target_arch = "x86", target_arch = "x86_64"),
+        not(miri),
+    ))] {
+        mod sse2;
+        use sse2 as imp;
+    } else if #[cfg(all(
+        target_arch = "aarch64",
+        target_feature = "neon",
+        // NEON intrinsics are currently broken on big-endian targets.
+        // See https://github.com/rust-lang/stdarch/issues/1484.
+        target_endian = "little",
+        not(miri),
+    ))] {
+        mod neon;
+        use neon as imp;
+    } else {
+        mod generic;
+        use generic as imp;
+    }
+}
+pub(crate) use self::imp::Group;
+pub(super) use self::imp::{
+    BitMaskWord, NonZeroBitMaskWord, BITMASK_ITER_MASK, BITMASK_MASK, BITMASK_STRIDE,
+};
diff --git a/src/raw/neon.rs b/src/control/group/neon.rs
similarity index 93%
rename from src/raw/neon.rs
rename to src/control/group/neon.rs
index b79f139e8..9374cb388 100644
--- a/src/raw/neon.rs
+++ b/src/control/group/neon.rs
@@ -1,5 +1,4 @@
-use super::bitmask::BitMask;
-use super::Tag;
+use super::super::{BitMask, Tag};
 use core::arch::aarch64 as neon;
 use core::mem;
 use core::num::NonZeroU64;
@@ -52,8 +51,7 @@ impl Group {
     #[inline]
     #[allow(clippy::cast_ptr_alignment)]
     pub(crate) unsafe fn load_aligned(ptr: *const Tag) -> Self {
-        // FIXME: use align_offset once it stabilizes
-        debug_assert_eq!(ptr as usize & (mem::align_of::<Self>() - 1), 0);
+        debug_assert_eq!(ptr.align_offset(mem::align_of::<Self>()), 0);
         Group(neon::vld1_u8(ptr.cast()))
     }
 
@@ -62,8 +60,7 @@ impl Group {
     #[inline]
     #[allow(clippy::cast_ptr_alignment)]
     pub(crate) unsafe fn store_aligned(self, ptr: *mut Tag) {
-        // FIXME: use align_offset once it stabilizes
-        debug_assert_eq!(ptr as usize & (mem::align_of::<Self>() - 1), 0);
+        debug_assert_eq!(ptr.align_offset(mem::align_of::<Self>()), 0);
         neon::vst1_u8(ptr.cast(), self.0);
     }
 
diff --git a/src/raw/sse2.rs b/src/control/group/sse2.rs
similarity index 94%
rename from src/raw/sse2.rs
rename to src/control/group/sse2.rs
index 87af2727b..0d4b10822 100644
--- a/src/raw/sse2.rs
+++ b/src/control/group/sse2.rs
@@ -1,5 +1,4 @@
-use super::bitmask::BitMask;
-use super::Tag;
+use super::super::{BitMask, Tag};
 use core::mem;
 use core::num::NonZeroU16;
 
@@ -58,8 +57,7 @@ impl Group {
     #[inline]
     #[allow(clippy::cast_ptr_alignment)]
     pub(crate) unsafe fn load_aligned(ptr: *const Tag) -> Self {
-        // FIXME: use align_offset once it stabilizes
-        debug_assert_eq!(ptr as usize & (mem::align_of::<Self>() - 1), 0);
+        debug_assert_eq!(ptr.align_offset(mem::align_of::<Self>()), 0);
         Group(x86::_mm_load_si128(ptr.cast()))
     }
 
@@ -68,8 +66,7 @@ impl Group {
     #[inline]
     #[allow(clippy::cast_ptr_alignment)]
     pub(crate) unsafe fn store_aligned(self, ptr: *mut Tag) {
-        // FIXME: use align_offset once it stabilizes
-        debug_assert_eq!(ptr as usize & (mem::align_of::<Self>() - 1), 0);
+        debug_assert_eq!(ptr.align_offset(mem::align_of::<Self>()), 0);
         x86::_mm_store_si128(ptr.cast(), self.0);
     }
 
diff --git a/src/control/mod.rs b/src/control/mod.rs
new file mode 100644
index 000000000..62ef8bfcc
--- /dev/null
+++ b/src/control/mod.rs
@@ -0,0 +1,10 @@
+mod bitmask;
+mod group;
+mod tag;
+
+use self::bitmask::BitMask;
+pub(crate) use self::{
+    bitmask::BitMaskIter,
+    group::Group,
+    tag::{Tag, TagSliceExt},
+};
diff --git a/src/control/tag.rs b/src/control/tag.rs
new file mode 100644
index 000000000..c5b84233b
--- /dev/null
+++ b/src/control/tag.rs
@@ -0,0 +1,81 @@
+use core::{fmt, mem};
+
+/// Single tag in a control group.
+#[derive(Copy, Clone, PartialEq, Eq)]
+#[repr(transparent)]
+pub(crate) struct Tag(pub(super) u8);
+impl Tag {
+    /// Control tag value for an empty bucket.
+    pub(crate) const EMPTY: Tag = Tag(0b1111_1111);
+
+    /// Control tag value for a deleted bucket.
+    pub(crate) const DELETED: Tag = Tag(0b1000_0000);
+
+    /// Checks whether a control tag represents a full bucket (top bit is clear).
+    #[inline]
+    pub(crate) const fn is_full(self) -> bool {
+        self.0 & 0x80 == 0
+    }
+
+    /// Checks whether a control tag represents a special value (top bit is set).
+    #[inline]
+    pub(crate) const fn is_special(self) -> bool {
+        self.0 & 0x80 != 0
+    }
+
+    /// Checks whether a special control value is EMPTY (just check 1 bit).
+    #[inline]
+    pub(crate) const fn special_is_empty(self) -> bool {
+        debug_assert!(self.is_special());
+        self.0 & 0x01 != 0
+    }
+
+    /// Creates a control tag representing a full bucket with the given hash.
+    #[inline]
+    #[allow(clippy::cast_possible_truncation)]
+    pub(crate) const fn full(hash: u64) -> Tag {
+        // Constant for function that grabs the top 7 bits of the hash.
+        const MIN_HASH_LEN: usize = if mem::size_of::<usize>() < mem::size_of::<u64>() {
+            mem::size_of::<usize>()
+        } else {
+            mem::size_of::<u64>()
+        };
+
+        // Grab the top 7 bits of the hash. While the hash is normally a full 64-bit
+        // value, some hash functions (such as FxHash) produce a usize result
+        // instead, which means that the top 32 bits are 0 on 32-bit platforms.
+        // So we use MIN_HASH_LEN constant to handle this.
+        let top7 = hash >> (MIN_HASH_LEN * 8 - 7);
+        Tag((top7 & 0x7f) as u8) // truncation
+    }
+}
+impl fmt::Debug for Tag {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if self.is_special() {
+            if self.special_is_empty() {
+                f.pad("EMPTY")
+            } else {
+                f.pad("DELETED")
+            }
+        } else {
+            f.debug_tuple("full").field(&(self.0 & 0x7F)).finish()
+        }
+    }
+}
+
+/// Extension trait for slices of tags.
+pub(crate) trait TagSliceExt {
+    /// Fills the control with the given tag.
+    fn fill_tag(&mut self, tag: Tag);
+
+    /// Clears out the control.
+    fn fill_empty(&mut self) {
+        self.fill_tag(Tag::EMPTY)
+    }
+}
+impl TagSliceExt for [Tag] {
+    fn fill_tag(&mut self, tag: Tag) {
+        // SAFETY: We have access to the entire slice, so we can write to the entire slice.
+        unsafe { self.as_mut_ptr().write_bytes(tag.0, self.len()) }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index a637ccbef..8364d7b60 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -61,7 +61,9 @@ doc_comment::doctest!("../README.md");
 #[macro_use]
 mod macros;
 
+mod control;
 mod raw;
+mod util;
 
 mod external_trait_impls;
 mod map;
diff --git a/src/raw/mod.rs b/src/raw/mod.rs
index 1c4a5f42e..ce07120e6 100644
--- a/src/raw/mod.rs
+++ b/src/raw/mod.rs
@@ -1,68 +1,19 @@
 use crate::alloc::alloc::{handle_alloc_error, Layout};
+use crate::control::{BitMaskIter, Group, Tag, TagSliceExt};
 use crate::scopeguard::{guard, ScopeGuard};
+use crate::util::{invalid_mut, likely, unlikely};
 use crate::TryReserveError;
 use core::array;
 use core::iter::FusedIterator;
 use core::marker::PhantomData;
 use core::mem;
 use core::ptr::NonNull;
+use core::slice;
 use core::{hint, ptr};
 
-cfg_if! {
-    // Use the SSE2 implementation if possible: it allows us to scan 16 buckets
-    // at once instead of 8. We don't bother with AVX since it would require
-    // runtime dispatch and wouldn't gain us much anyways: the probability of
-    // finding a match drops off drastically after the first few buckets.
-    //
-    // I attempted an implementation on ARM using NEON instructions, but it
-    // turns out that most NEON instructions have multi-cycle latency, which in
-    // the end outweighs any gains over the generic implementation.
-    if #[cfg(all(
-        target_feature = "sse2",
-        any(target_arch = "x86", target_arch = "x86_64"),
-        not(miri),
-    ))] {
-        mod sse2;
-        use sse2 as imp;
-    } else if #[cfg(all(
-        target_arch = "aarch64",
-        target_feature = "neon",
-        // NEON intrinsics are currently broken on big-endian targets.
-        // See https://github.com/rust-lang/stdarch/issues/1484.
-        target_endian = "little",
-        not(miri),
-    ))] {
-        mod neon;
-        use neon as imp;
-    } else {
-        mod generic;
-        use generic as imp;
-    }
-}
-
 mod alloc;
 pub(crate) use self::alloc::{do_alloc, Allocator, Global};
 
-mod bitmask;
-
-use self::bitmask::BitMaskIter;
-use self::imp::Group;
-
-// Branch prediction hint. This is currently only available on nightly but it
-// consistently improves performance by 10-15%.
-#[cfg(not(feature = "nightly"))]
-use core::convert::{identity as likely, identity as unlikely};
-#[cfg(feature = "nightly")]
-use core::intrinsics::{likely, unlikely};
-
-// FIXME: use strict provenance functions once they are stable.
-// Implement it with a transmute for now.
-#[inline(always)]
-#[allow(clippy::useless_transmute)] // clippy is wrong, cast and transmute are different here
-fn invalid_mut<T>(addr: usize) -> *mut T {
-    unsafe { core::mem::transmute(addr) }
-}
-
 #[inline]
 unsafe fn offset_from<T>(to: *const T, from: *const T) -> usize {
     to.offset_from(from) as usize
 }
@@ -102,56 +53,6 @@ trait SizedTypeProperties: Sized {
 impl<T> SizedTypeProperties for T {}
 
-/// Single tag in a control group.
-#[derive(Copy, Clone, PartialEq, Eq, Debug)]
-#[repr(transparent)]
-pub(crate) struct Tag(u8);
-impl Tag {
-    /// Control tag value for an empty bucket.
-    const EMPTY: Tag = Tag(0b1111_1111);
-
-    /// Control tag value for a deleted bucket.
-    const DELETED: Tag = Tag(0b1000_0000);
-
-    /// Checks whether a control tag represents a full bucket (top bit is clear).
-    #[inline]
-    const fn is_full(self) -> bool {
-        self.0 & 0x80 == 0
-    }
-
-    /// Checks whether a control tag represents a special value (top bit is set).
-    #[inline]
-    const fn is_special(self) -> bool {
-        self.0 & 0x80 != 0
-    }
-
-    /// Checks whether a special control value is EMPTY (just check 1 bit).
-    #[inline]
-    const fn special_is_empty(self) -> bool {
-        debug_assert!(self.is_special());
-        self.0 & 0x01 != 0
-    }
-
-    /// Creates a control tag representing a full bucket with the given hash.
-    #[inline]
-    #[allow(clippy::cast_possible_truncation)]
-    const fn full(hash: u64) -> Tag {
-        // Constant for function that grabs the top 7 bits of the hash.
-        const MIN_HASH_LEN: usize = if mem::size_of::<usize>() < mem::size_of::<u64>() {
-            mem::size_of::<usize>()
-        } else {
-            mem::size_of::<u64>()
-        };
-
-        // Grab the top 7 bits of the hash. While the hash is normally a full 64-bit
-        // value, some hash functions (such as FxHash) produce a usize result
-        // instead, which means that the top 32 bits are 0 on 32-bit platforms.
-        // So we use MIN_HASH_LEN constant to handle this.
-        let top7 = hash >> (MIN_HASH_LEN * 8 - 7);
-        Tag((top7 & 0x7f) as u8) // truncation
-    }
-}
-
 /// Primary hash function, used to select the initial bucket to probe from.
 #[inline]
 #[allow(clippy::cast_possible_truncation)]
@@ -1577,13 +1478,12 @@ impl RawTableInner {
         let buckets =
             capacity_to_buckets(capacity).ok_or_else(|| fallibility.capacity_overflow())?;
 
-        let result = Self::new_uninitialized(alloc, table_layout, buckets, fallibility)?;
+        let mut result =
+            Self::new_uninitialized(alloc, table_layout, buckets, fallibility)?;
 
         // SAFETY: We checked that the table is allocated and therefore the table already has
         // `self.bucket_mask + 1 + Group::WIDTH` number of control bytes (see TableLayout::calculate_layout_for)
         // so writing `self.num_ctrl_bytes() == bucket_mask + 1 + Group::WIDTH` bytes is safe.
-        result
-            .ctrl(0)
-            .write_bytes(Tag::EMPTY.0, result.num_ctrl_bytes());
+        result.ctrl_slice().fill_empty();
 
         Ok(result)
     }
@@ -2576,6 +2476,12 @@ impl RawTableInner {
         self.ctrl.as_ptr().add(index).cast()
     }
 
+    /// Gets the slice of all control bytes.
+    fn ctrl_slice(&mut self) -> &mut [Tag] {
+        // SAFETY: We've initialized all control bytes, and have the correct number.
+        unsafe { slice::from_raw_parts_mut(self.ctrl.as_ptr().cast(), self.num_ctrl_bytes()) }
+    }
+
     #[inline]
     fn buckets(&self) -> usize {
         self.bucket_mask + 1
     }
@@ -3111,10 +3017,7 @@ impl RawTableInner {
     #[inline]
     fn clear_no_drop(&mut self) {
         if !self.is_empty_singleton() {
-            unsafe {
-                self.ctrl(0)
-                    .write_bytes(Tag::EMPTY.0, self.num_ctrl_bytes());
-            }
+            self.ctrl_slice().fill_empty();
         }
         self.items = 0;
         self.growth_left = bucket_mask_to_capacity(self.bucket_mask);
@@ -3672,7 +3575,7 @@ impl<T> Clone for RawIterRange<T> {
         Self {
             data: self.data.clone(),
             next_ctrl: self.next_ctrl,
-            current_group: self.current_group,
+            current_group: self.current_group.clone(),
             end: self.end,
         }
     }
@@ -4292,7 +4195,7 @@ mod test_map {
         unsafe {
             // SAFETY: The `buckets` is power of two and we're not
            // trying to actually use the returned RawTable.
-            let table =
+            let mut table =
                 RawTable::<(u64, Vec<i32>)>::new_uninitialized(Global, 8, Fallibility::Infallible)
                     .unwrap();
 
             // SAFETY: We checked that the table is allocated and therefore the table already has
             // `self.bucket_mask + 1 + Group::WIDTH` number of control bytes (see TableLayout::calculate_layout_for)
             // so writing `table.table.num_ctrl_bytes() == bucket_mask + 1 + Group::WIDTH` bytes is safe.
-            table
-                .table
-                .ctrl(0)
-                .write_bytes(Tag::EMPTY.0, table.table.num_ctrl_bytes());
+            table.table.ctrl_slice().fill_empty();
 
             // SAFETY: table.capacity() is guaranteed to be smaller than table.buckets()
             table.table.ctrl(0).write_bytes(0, table.capacity());
diff --git a/src/util.rs b/src/util.rs
new file mode 100644
index 000000000..c8a811732
--- /dev/null
+++ b/src/util.rs
@@ -0,0 +1,14 @@
+// FIXME: Branch prediction hint. This is currently only available on nightly
+// but it consistently improves performance by 10-15%.
+#[cfg(not(feature = "nightly"))]
+pub(crate) use core::convert::{identity as likely, identity as unlikely};
+#[cfg(feature = "nightly")]
+pub(crate) use core::intrinsics::{likely, unlikely};
+
+// FIXME: use strict provenance functions once they are stable.
+// Implement it with a transmute for now.
+#[inline(always)]
+#[allow(clippy::useless_transmute)] // clippy is wrong, cast and transmute are different here
+pub(crate) fn invalid_mut<T>(addr: usize) -> *mut T {
+    unsafe { core::mem::transmute(addr) }
+}
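
Illustrative note (not part of the patch): the control-byte encoding that moves into
src/control/tag.rs can be exercised on its own. The sketch below mirrors the Tag
semantics shown above, EMPTY and DELETED with the top bit set and full tags carrying
the top 7 bits of the hash, using demo hash values of my own choosing; it is a sketch
of the scheme, not hashbrown's actual API, and compiles as an ordinary Rust binary.

use core::mem;

/// Mirror of the patch's `Tag` newtype, trimmed down to the bit tricks.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
#[repr(transparent)]
struct Tag(u8);

impl Tag {
    /// Empty bucket: top bit set, low bit set.
    const EMPTY: Tag = Tag(0b1111_1111);
    /// Deleted bucket (tombstone): top bit set, low bit clear.
    const DELETED: Tag = Tag(0b1000_0000);

    /// Full buckets store the top 7 bits of the hash, so the top bit stays
    /// clear; one compare over a whole Group can then classify its buckets.
    const fn full(hash: u64) -> Tag {
        // Handle hashers that only fill the low `usize` bits of the u64.
        const MIN_HASH_LEN: usize = if mem::size_of::<usize>() < mem::size_of::<u64>() {
            mem::size_of::<usize>()
        } else {
            mem::size_of::<u64>()
        };
        let top7 = hash >> (MIN_HASH_LEN * 8 - 7);
        Tag((top7 & 0x7f) as u8)
    }

    /// Top bit clear means the bucket holds an entry.
    const fn is_full(self) -> bool {
        self.0 & 0x80 == 0
    }

    /// For special (non-full) tags, bit 0 distinguishes EMPTY from DELETED.
    const fn special_is_empty(self) -> bool {
        self.0 & 0x01 != 0
    }
}

fn main() {
    // A full tag keeps only the hash's top 7 bits (a shift by 57 on 64-bit targets).
    let tag = Tag::full(0xdead_beef_dead_beef);
    assert!(tag.is_full());
    #[cfg(target_pointer_width = "64")]
    assert_eq!(tag, Tag((0xdead_beef_dead_beef_u64 >> 57) as u8));

    // EMPTY and DELETED share the set top bit; bit 0 tells them apart.
    assert!(!Tag::EMPTY.is_full());
    assert!(!Tag::DELETED.is_full());
    assert!(Tag::EMPTY.special_is_empty());
    assert!(!Tag::DELETED.special_is_empty());
}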