From ad62b87d8fb486bf75f297bd147e55b2d03da8ba Mon Sep 17 00:00:00 2001 From: Gabriel Eiseman Date: Sun, 11 Feb 2024 03:51:45 -0500 Subject: [PATCH] Avl fixes: - move check wrappers to avl_check so they can be used explicitly in tests - improve avl_attach/increase/decrease to accept duplicates but optionally report via status arg - add avl_attach_exclusive to retain the old behavior - improve/correct docs - remove avl_swap_nodes because it was basically useless: it totally swapped two nodes so the shape/external properties of the tree stayed the same and only their memory addresses actually changed, it basically could only be used to avoid iterator invalidation or swap backing allocators - fix avl_upper_bound failing if key is in the tree - add explanatory comments to avl_sift_down_bounded and avl_reorder_recursive because they are complex - fix avl_reorder and avl_sift_down_bounded being backwards, they were coded as if avl_heapify made a min heap but since it makes a max heap they have to be flipped --- include/crater/avl.h | 42 +++--- include/crater/avl_check.h | 11 ++ src/lib/crater/avl.c | 285 +++++++++++++++++++++---------------- 3 files changed, 196 insertions(+), 142 deletions(-) diff --git a/include/crater/avl.h b/include/crater/avl.h index 5bee5af..27d3b5a 100644 --- a/include/crater/avl.h +++ b/include/crater/avl.h @@ -158,13 +158,22 @@ int cr8r_avl_insert_update(cr8r_avl_node **r, void *key, cr8r_avl_ft*); /// @return a pointer to the new root node of the tree that contained n, in case it changed. Can be NULL. cr8r_avl_node *cr8r_avl_remove_node(cr8r_avl_node *n, cr8r_avl_ft*); +/// Add a given node to an existing tree. +/// +/// The existing tree may contain a node with the same element (see { @link cr8r_avl_attach_exclusive } for a version that disallows this). +/// @param [in] r: the root of the existing tree. Notice this is a pointer, not a pointer to a pointer as many other functions have. Can be NULL. +/// @param [in] n: the node to insert. Should not have links. +/// @param [out] is_duplicate: if this is not null, 0 is written if the node's key is not equal to any existing node, and 1 is written if it is. +/// @return a pointer to the new root, in case it changed. +cr8r_avl_node *cr8r_avl_attach(cr8r_avl_node *r, cr8r_avl_node *n, cr8r_avl_ft*, int *is_duplicate); + /// Add a given node to an existing tree. /// /// The existing tree should not already contain a node with the same element. /// @param [in] r: the root of the existing tree. Notice this is a pointer, not a pointer to a pointer as many other functions have. Can be NULL. /// @param [in] n: the node to insert. Should not have links. -/// @return a pointer to the new root, in case it changed, or NULL if a node with the same element as n already exists. -cr8r_avl_node *cr8r_avl_attach(cr8r_avl_node *r, cr8r_avl_node *n, cr8r_avl_ft*); +/// @return a pointer to the new root, in case it changed, or NULL if n has the same key as an existing node +cr8r_avl_node *cr8r_avl_attach_exclusive(cr8r_avl_node *r, cr8r_avl_node *n, cr8r_avl_ft*); /// Remove a given node from the tree containing it but do not free it. /// @@ -180,12 +189,17 @@ cr8r_avl_node *cr8r_avl_detach(cr8r_avl_node *n, cr8r_avl_ft*); /// @return a pointer to the node matching the given key, or NULL if no match exists. cr8r_avl_node *cr8r_avl_get(cr8r_avl_node *r, void *key, cr8r_avl_ft*); -/// Find the deepest node that is a would be ancestor of a given element, -/// a leaf node which is either the lower bound or upper bound of the given key. +/// Find the deepest node on the search path for a given key +/// If there is a node in the tree with the given key, returns a pointer to that node. +/// Otherwise, returns a pointer leaf node which is either the lower bound or upper bound +/// of the given key. In both cases, the node returned is the deepest node that would +/// be an ancestor of the key if it were inserted into the tree without rebalancing. /// /// @param [in] r: the root of the tree to search /// @param [in] key: element to search for in the tree -/// @return a pointer to the last node in the tree on the search path for the given key. +/// @return a pointer to the last node in the tree on the search path for the given key, +/// which could compare equal to the key, be its lower bound or upper bound, or, if the given +/// root is null, be null cr8r_avl_node *cr8r_avl_search(cr8r_avl_node *r, void *key, cr8r_avl_ft*); /// Find the root of the tree containing a given node. @@ -246,29 +260,25 @@ cr8r_avl_node *cr8r_avl_upper_bound(cr8r_avl_node *r, void *key, cr8r_avl_ft*); /// @param [in] r: the root of the tree, which is invalidated (unless ft->free doesn't invalidate nodes for some reason) void cr8r_avl_delete(cr8r_avl_node *r, cr8r_avl_ft*); -/// Swap two nodes in a tree, swapping their contents and then patching up links -/// -/// @param [in, out] a, b: the nodes to swap -/// @param [in] size: the size of the element. Only this is required rather than the entire function table -void cr8r_avl_swap_nodes(cr8r_avl_node *a, cr8r_avl_node *b, uint64_t size); - /// Restore the binary search tree invariant after decreasing the key for a single node /// /// After decreasing the key in a node, this function should be called, which then moves the node /// if necessary to ensure the tree is a BST. Only the given node should violate the BST condition. -/// Also requires the tree have no duplicate keys. /// @param [in] n: the node that has had its key decreased and may need to be moved -/// @return a pointer to the root of the tree, in case it changes, or NULL if a duplicate key is encountered -cr8r_avl_node *cr8r_avl_decrease(cr8r_avl_node *n, cr8r_avl_ft*); +/// @param [out] is_duplicate: if this is not null, 0 is written if the decreased key is unique within the tree, +/// and 1 is written if it is equal to some existing key +/// @return a pointer to the root of the tree, in case it changes +cr8r_avl_node *cr8r_avl_decrease(cr8r_avl_node *n, cr8r_avl_ft*, int *is_duplicate); /// Restore the binary search tree invariant after increasing the key for a single node /// /// After increasing the key in a node, this function should be called, which then moves the node /// if necessary to ensure the tree is a BST. Only the given node should violate the BST condition. -/// Also requires the tree have no duplicate keys. /// @param [in] n: the node that has had its key increased and may need to be moved +/// @param [out] is_duplicate: if this is not null, 0 is written if the increased key is unique within the tree, +/// and 1 is written if it is equal to some existing key /// @return a pointer to the root of the tree, in case it changes, or NULL if a duplicate key is encountered -cr8r_avl_node *cr8r_avl_increase(cr8r_avl_node *n, cr8r_avl_ft*); +cr8r_avl_node *cr8r_avl_increase(cr8r_avl_node *n, cr8r_avl_ft*, int *is_duplicate); /// Find the first node in a postorder traversal of the tree /// diff --git a/include/crater/avl_check.h b/include/crater/avl_check.h index eef4a31..a75ce8d 100644 --- a/include/crater/avl_check.h +++ b/include/crater/avl_check.h @@ -24,3 +24,14 @@ inline static int cr8r_avl_check_balance(cr8r_avl_node *n){ return -1; } +#ifdef DEBUG +#include +#define CR8R_AVL_ASSERT_LINKS(n) assert(cr8r_avl_check_links(n)) +#define CR8R_AVL_ASSERT_BALANCE(n) assert(cr8r_avl_check_balance(n) != -1) +#define CR8R_AVL_ASSERT_ALL(n) do{CR8R_AVL_ASSERT_LINKS(n);CR8R_AVL_ASSERT_BALANCE(n);}while(0) +#else +#define CR8R_AVL_ASSERT_LINKS(n) +#define CR8R_AVL_ASSERT_BALANCE(n) +#define CR8R_AVL_ASSERT_ALL(n) +#endif + diff --git a/src/lib/crater/avl.c b/src/lib/crater/avl.c index 8aaab83..b384043 100644 --- a/src/lib/crater/avl.c +++ b/src/lib/crater/avl.c @@ -2,19 +2,8 @@ #include #include -#include - -#ifdef DEBUG -#include #include -#define ASSERT_LINKS(n) assert(cr8r_avl_check_links(n)) -#define ASSERT_BALANCE(n) assert(cr8r_avl_check_balance(n) != -1) -#define ASSERT_ALL(n) do{ASSERT_LINKS(n);ASSERT_BALANCE(n);}while(0) -#else -#define ASSERT_LINKS(n) -#define ASSERT_BALANCE(n) -#define ASSERT_ALL(n) -#endif +#include static cr8r_avl_node *cr8r_avl_insert_recursive(cr8r_avl_node *r, void *key, cr8r_avl_ft *ft); inline static cr8r_avl_node *cr8r_avl_insert_rebalance_r(cr8r_avl_node *p, cr8r_avl_node *n); @@ -130,6 +119,46 @@ cr8r_avl_node *cr8r_avl_get(cr8r_avl_node *r, void *key, cr8r_avl_ft *ft){ return r; } +cr8r_avl_node *cr8r_avl_search_dups(cr8r_avl_node *r, void *key, cr8r_avl_ft *ft, int *is_duplicate){ + if(!r){ + if(is_duplicate){ + *is_duplicate = 0; + } + return NULL; + } + int found_dup = 0; + while(1){ + int ord = ft->cmp(&ft->base, r->data, key); + if(ord < 0){ + if(!r->right){ + break; + } + r = r->right; + }else if(ord > 0){ + if(!r->left){ + break; + } + r = r->left; + }else{ + found_dup = 1; + if(r->balance < 0){ + if(!r->right){ + break; + } + r = r->right; + }else if(!r->left){ + break; + }else{ + r = r->left; + } + } + } + if(is_duplicate){ + *is_duplicate = found_dup; + } + return r; +} + cr8r_avl_node *cr8r_avl_search(cr8r_avl_node *r, void *key, cr8r_avl_ft *ft){ if(!r){ return NULL; @@ -169,8 +198,8 @@ cr8r_avl_node *cr8r_avl_upper_bound(cr8r_avl_node *r, void *key, cr8r_avl_ft *ft }else if(ord > 0){ ret = cr8r_avl_upper_bound(r->left, key, ft); return ret ? ret : r; - } - return NULL; + }//otherwise r->data "==" key + return cr8r_avl_next(r); } cr8r_avl_node *cr8r_avl_first(cr8r_avl_node *r){ @@ -189,7 +218,7 @@ cr8r_avl_node *cr8r_avl_last(cr8r_avl_node *r){ int cr8r_avl_insert(cr8r_avl_node **r, void *key, cr8r_avl_ft *ft){ cr8r_avl_node *t = cr8r_avl_insert_recursive(*r, key, ft); - ASSERT_ALL(*r); + CR8R_AVL_ASSERT_ALL(*r); if(t){ *r = t; } @@ -238,7 +267,7 @@ int cr8r_avl_insert_update(cr8r_avl_node **r, void *key, cr8r_avl_ft *ft){ return 0; } *r = cr8r_avl_insert_retrace(t->right); - ASSERT_ALL(*r); + CR8R_AVL_ASSERT_ALL(*r); return CR8R_AVL_INSERTED; } }else if(ord > 0){ @@ -249,7 +278,7 @@ int cr8r_avl_insert_update(cr8r_avl_node **r, void *key, cr8r_avl_ft *ft){ return 0; } *r = cr8r_avl_insert_retrace(t->left); - ASSERT_ALL(*r); + CR8R_AVL_ASSERT_ALL(*r); return CR8R_AVL_INSERTED; } }else{ @@ -260,7 +289,7 @@ int cr8r_avl_insert_update(cr8r_avl_node **r, void *key, cr8r_avl_ft *ft){ int cr8r_avl_remove(cr8r_avl_node **r, void *key, cr8r_avl_ft *ft){ cr8r_avl_node *n = cr8r_avl_get(*r, key, ft); - ASSERT_ALL(*r); + CR8R_AVL_ASSERT_ALL(*r); if(n){ *r = cr8r_avl_remove_node(n, ft); } @@ -485,7 +514,27 @@ void cr8r_avl_delete(cr8r_avl_node *r, cr8r_avl_ft *ft){ ft->free(&ft->base, r); } -cr8r_avl_node *cr8r_avl_attach(cr8r_avl_node *r, cr8r_avl_node *n, cr8r_avl_ft *ft){ +cr8r_avl_node *cr8r_avl_attach(cr8r_avl_node *r, cr8r_avl_node *n, cr8r_avl_ft *ft, int *is_duplicate){ + if(!r){ + if(is_duplicate){ + *is_duplicate = 0; + } + return n; + } + r = cr8r_avl_search_dups(r, n->data, ft, is_duplicate); + int ord = ft->cmp(&ft->base, r->data, n->data); + if(ord > 0){ + r->left = n; + }else if(ord < 0 || r->balance < 0){ + r->right = n; + }else{ + r->left = n; + } + n->parent = r; + return cr8r_avl_insert_retrace(n); +} + +cr8r_avl_node *cr8r_avl_attach_exclusive(cr8r_avl_node *r, cr8r_avl_node *n, cr8r_avl_ft *ft){ if(!r){ return n; } @@ -514,107 +563,40 @@ cr8r_avl_node *cr8r_avl_detach(cr8r_avl_node *n, cr8r_avl_ft *ft){ return r; } -void cr8r_avl_swap_nodes(cr8r_avl_node *a, cr8r_avl_node *b, uint64_t size){ - if(a == b){ - return; - } - cr8r_avl_node a_l = *a, b_l = *b; - char tmp[offsetof(cr8r_avl_node, data) + size]; - memcpy(tmp, a, offsetof(cr8r_avl_node, data) + size); - memcpy(a, b, offsetof(cr8r_avl_node, data) + size); - memcpy(b, tmp, offsetof(cr8r_avl_node, data) + size); - if(a_l.parent == b){ - if(b_l.left == a){ - a->left = b; - }else{ - a->right = b; - } - b->parent = a; - if(a->parent){ - if(a->parent->left == b){ - a->parent->left = a; - }else{ - a->parent->right = a; - } - } - }else if(b_l.parent == a){ - if(a_l.left == b){ - b->left = a; - }else{ - b->right = a; - } - a->parent = b; - if(b->parent){ - if(b->parent->left == a){ - b->parent->left = b; - }else{ - b->parent->right = b; - } - } - }else{ - if(a->parent){ - if(b->parent == a->parent){ - if(a->parent->left == b){ - a->parent->left = a; - b->parent->right = b; - }else{ - a->parent->right = a; - b->parent->left = b; - } - }else{ - if(a->parent->left == b){ - a->parent->left = a; - }else{ - a->parent->right = a; - } - } - } - if(b->parent && b->parent != a->parent){ - if(b->parent->left == a){ - b->parent->left = b; - }else{ - b->parent->right = b; - } - } - } - if(a->left){ - a->left->parent = a; - } - if(a->right){ - a->right->parent = a; - } - if(b->left){ - b->left->parent = b; - } - if(b->right){ - b->right->parent = b; - } - ASSERT_ALL(cr8r_avl_root(a)); - ASSERT_ALL(cr8r_avl_root(b)); -} - -cr8r_avl_node *cr8r_avl_decrease(cr8r_avl_node *n, cr8r_avl_ft *ft){ +cr8r_avl_node *cr8r_avl_decrease(cr8r_avl_node *n, cr8r_avl_ft *ft, int *is_duplicate){ cr8r_avl_node *p = cr8r_avl_prev(n); int ord; if(!p || (ord = ft->cmp(&ft->base, p->data, n->data)) < 0){ + if(is_duplicate){ + *is_duplicate = 0; + } return cr8r_avl_root(n); }else if(!ord){ - return NULL; + if(is_duplicate){ + *is_duplicate = 1; + } + return cr8r_avl_root(n); } p = cr8r_avl_detach(n, ft); - return cr8r_avl_attach(p, n, ft); + return cr8r_avl_attach(p, n, ft, is_duplicate); } -cr8r_avl_node *cr8r_avl_increase(cr8r_avl_node *n, cr8r_avl_ft *ft){ +cr8r_avl_node *cr8r_avl_increase(cr8r_avl_node *n, cr8r_avl_ft *ft, int *is_duplicate){ cr8r_avl_node *s = cr8r_avl_next(n); int ord; if(!s || (ord = ft->cmp(&ft->base, s->data, n->data)) > 0){ + if(is_duplicate){ + *is_duplicate = 0; + } return cr8r_avl_root(n); }else if(!ord){ - return NULL; + if(is_duplicate){ + *is_duplicate = 1; + } + return cr8r_avl_root(n); } s = cr8r_avl_detach(n, ft); - return cr8r_avl_attach(s, n, ft); + return cr8r_avl_attach(s, n, ft, is_duplicate); } cr8r_avl_node *cr8r_avl_first_post(cr8r_avl_node *r){ @@ -644,25 +626,31 @@ void cr8r_avl_sift_down(cr8r_avl_node *r, cr8r_avl_ft *ft){ cr8r_avl_sift_down_bounded(r, NULL, ft); } +// sift down for a partially max heap / partially bst avl tree, where u is the first node (in inorder traversal order) +// in the bst portion (and also an upper bound for the max heap under ft->cmp). +// When u is NULL, this decays to sift down for a fully max heap avl tree. +// Partially max heap / partially bst avl trees only arise during execution of avl_reorder, +// and that function does some extra pointer juggling to ensure that only checking right children for u and stopping once +// it is found is correct; see that function for more info void cr8r_avl_sift_down_bounded(cr8r_avl_node *r, cr8r_avl_node *u, cr8r_avl_ft *ft){ - cr8r_avl_node *min_child; + cr8r_avl_node *max_child; while(1){ - if(r->left && r->left != u){ - if(r->right){ - min_child = ft->cmp(&ft->base, r->left->data, r->right->data) > 0 ? r->left : r->right; + if(r->right && r->right != u){ + if(r->left){ + max_child = ft->cmp(&ft->base, r->left->data, r->right->data) >= 0 ? r->left : r->right; }else{ - min_child = r->left; + max_child = r->right; } - }else if(r->right){ - min_child = r->right; + }else if(r->left){ + max_child = r->left; }else{ break; } - if(ft->cmp(&ft->base, min_child->data, r->data) <= 0){ + if(ft->cmp(&ft->base, max_child->data, r->data) <= 0){ break; } - cr8r_avl_swap_data(min_child, r, ft->base.size); - r = min_child; + cr8r_avl_swap_data(max_child, r, ft->base.size); + r = max_child; } } @@ -712,7 +700,7 @@ cr8r_avl_node *cr8r_avl_heappop_node(cr8r_avl_node **r, cr8r_avl_ft *ft){ s->right->parent = s; } s->balance = (*r)->balance; - ASSERT_ALL(s); + CR8R_AVL_ASSERT_ALL(s); (*r)->left = (*r)->right = NULL; (*r)->balance = 0; cr8r_avl_node *res = *r; @@ -730,32 +718,77 @@ void cr8r_avl_reorder_recursive(cr8r_avl_node *r, cr8r_avl_ft *ft){ if(!r){ return; } - for(cr8r_avl_node *u = cr8r_avl_first(r); u != r;){ + // in this function, r is initially fully a max heap, but as we proceed, we convert + // it to a bst in reverse inorder order. We do this by repeatedly swapping the data + // in the root of the max heap portion with the first unvisited node in reverse inorder order. + // Then we sift down the root of the max heap to preserve the max heap invariant on the + // heap portion of the avl tree. The bst portion trivially has its bst invariant satisfied, + // and we never break the avl invariants because we never actually change the shape of the tree. + // HOWEVER, if leave the links in the tree alone, then the max heap region and the bst region + // are not "nice": in particular if the node we are on in reverse inorder traversal has a left subtree, + // then we have visited it and its right subtree and when we call sift down it would have to + // determine if it ever ran into u or one of u's reverse inorder predecessors and skip any such nodes. + // To avoid this, we use sift_down_bounded and adjust the links in the tree as we go + for(cr8r_avl_node *u = cr8r_avl_last(r); u != r;){ cr8r_avl_swap_data(u, r, ft->base.size); cr8r_avl_sift_down_bounded(r, u, ft); - if(u->right){ - cr8r_avl_reorder_retrace(u->right); - u = cr8r_avl_first(u->right); + if(u->left){ + // if u has a left subtree, the next node in reverse inorder is cr8r_avl_first(u->left), + // but we want sift_down_bounded to skip u and its reverse inorder predecessors (ie the nodes + // in the bst portion of the avl tree). Therefore, we find the FIRST ancestor of u where u is in its right subtree, + // and we change its right pointer to point to u->left instead. This ancestor is guaranteed to exist because + // u is a right descendent of the root (in a reverse inorder traversal this is true until we hit the root, so once we + // hit the root we force it to continue to be true by recursing on the left subtree, see below). + // Changing the first left ancestor of u (first ancestor where u is a right descendent) so its right child is u->left + // breaks the structure of the tree, but we will fix it later. + // Notice that for ancestors where u is in their left subtree, they are reverse inorder predecessors of u, so + // they have already been visited. + cr8r_avl_reorder_retrace(u->left); + u = cr8r_avl_last(u->left); }else{ + // find the first left ancestor of u (which must itself be either the root or in the right subtree of the root, since u is in + // the right subtree of the root). Since u has no left subtree, this is u's successor in reverse inorder traversal. + // We also fix this first left ancestor so that its right child pointer points at its descendent again and not u. u = cr8r_avl_reorder_relink(u); } } - cr8r_avl_reorder_recursive(r->right, ft); -} - + // if u == r, we have reached the root node. At this point, the top of the heap and the current node in reverse inorder traversal + // are the same, so we don't actually have to call avl_swap_data here. + // We also don't need to temporarily set r->left->parent to NULL, because this recursive call will only look for left ancestors of + // u for u in the right subtree of r->left, so the left ancestor search will always terminate at or before r->left. + // Then the recursive call will recurse to r->left->left if needed and so on. + cr8r_avl_reorder_recursive(r->left, ft); +} + +// given n the nonempty left subtree of the just-filled current reverse inorder traversal node u, +// find the first left ancestor of n (the first ancestor where n is a right descendent), +// and change this ancestor's right link to point at n instead. Note that u == n->parent, and the +// original shape of the tree can be recovered later: when we walk up from u, any ancestor that is +// a left child of its parent will still have the correct parent pointer and its parent will still have +// it as a left child. However, the first ancestor that is a right child of its parent will have its parent +// pointer correct but its parent's right child will point to n instead of it. +// this might fail to find a left ancestor, but recursing on the left subtree when u reaches the root ensures +// the root is always a left ancestor. This ensures calling sift_down_bounded on the root with u as the +// bound will see all nodes after u in the reverse inorder traversal of the original tree: when u has a left +// subtree, its entire left subtree is after it in this traversal, and in particular the rightmost element of +// its left subtree is its successor in the traversal. Also, any time we visit a node on the left edge of the +// left subtree, we update the first left ancestor to point at its left subtree as well, if any. void cr8r_avl_reorder_retrace(cr8r_avl_node *n){ cr8r_avl_node *a = n; - while(a == a->parent->right){ + while(a == a->parent->left){ a = a->parent; } - a->parent->left = n; + a->parent->right = n; } +// when u does not have a left child, its reverse inorder successor will instead be its first left ancestor. +// of course, this is also the node whose right child pointer we clobbered when first moving from any +// ancestor of u before its first left ancestor, so now is when we fix it up. cr8r_avl_node *cr8r_avl_reorder_relink(cr8r_avl_node *n){ - while(n == n->parent->right){ + while(n == n->parent->left){ n = n->parent; } - n->parent->left = n; + n->parent->right = n; return n->parent; }