430 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			ReStructuredText
		
	
	
	
	
	
			
		
		
	
	
			430 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			ReStructuredText
		
	
	
	
	
	
| =================================
 | |
| Red-black Trees (rbtree) in Linux
 | |
| =================================
 | |
| 
 | |
| 
 | |
| :Date: January 18, 2007
 | |
| :Author: Rob Landley <rob@landley.net>
 | |
| 
 | |
| What are red-black trees, and what are they for?
 | |
| ------------------------------------------------
 | |
| 
 | |
| Red-black trees are a type of self-balancing binary search tree, used for
 | |
| storing sortable key/value data pairs.  This differs from radix trees (which
 | |
| are used to efficiently store sparse arrays and thus use long integer indexes
 | |
| to insert/access/delete nodes) and hash tables (which are not kept sorted to
 | |
| be easily traversed in order, and must be tuned for a specific size and
 | |
| hash function where rbtrees scale gracefully storing arbitrary keys).
 | |
| 
 | |
| Red-black trees are similar to AVL trees, but provide faster real-time bounded
 | |
| worst case performance for insertion and deletion (at most two rotations and
 | |
| three rotations, respectively, to balance the tree), with slightly slower
 | |
| (but still O(log n)) lookup time.
 | |
| 
 | |
| To quote Linux Weekly News:
 | |
| 
 | |
|     There are a number of red-black trees in use in the kernel.
 | |
|     The deadline and CFQ I/O schedulers employ rbtrees to
 | |
|     track requests; the packet CD/DVD driver does the same.
 | |
|     The high-resolution timer code uses an rbtree to organize outstanding
 | |
|     timer requests.  The ext3 filesystem tracks directory entries in a
 | |
|     red-black tree.  Virtual memory areas (VMAs) are tracked with red-black
 | |
|     trees, as are epoll file descriptors, cryptographic keys, and network
 | |
|     packets in the "hierarchical token bucket" scheduler.
 | |
| 
 | |
| This document covers use of the Linux rbtree implementation.  For more
 | |
| information on the nature and implementation of red-black trees, see:
 | |
| 
 | |
|   Linux Weekly News article on red-black trees
 | |
|     https://lwn.net/Articles/184495/
 | |
| 
 | |
|   Wikipedia entry on red-black trees
 | |
|     https://en.wikipedia.org/wiki/Red-black_tree
 | |
| 
 | |
| Linux implementation of red-black trees
 | |
| ---------------------------------------
 | |
| 
 | |
| Linux's rbtree implementation lives in the file "lib/rbtree.c".  To use it,
 | |
| "#include <linux/rbtree.h>".
 | |
| 
 | |
| The Linux rbtree implementation is optimized for speed, and thus has one
 | |
| less layer of indirection (and better cache locality) than more traditional
 | |
| tree implementations.  Instead of using pointers to separate rb_node and data
 | |
| structures, each instance of struct rb_node is embedded in the data structure
 | |
| it organizes.  And instead of using a comparison callback function pointer,
 | |
| users are expected to write their own tree search and insert functions
 | |
| which call the provided rbtree functions.  Locking is also left up to the
 | |
| user of the rbtree code.
 | |
| 
 | |
| Creating a new rbtree
 | |
| ---------------------
 | |
| 
 | |
| Data nodes in an rbtree are structures containing a struct rb_node member::
 | |
| 
 | |
|   struct mytype {
 | |
|   	struct rb_node node;
 | |
|   	char *keystring;
 | |
|   };
 | |
| 
 | |
| When dealing with a pointer to the embedded struct rb_node, the containing data
 | |
| structure may be accessed with the standard container_of() macro.  In addition,
 | |
| individual members may be accessed directly via rb_entry(node, type, member).
 | |
| 
 | |
| At the root of each rbtree is an rb_root structure, which is initialized to be
 | |
| empty via::
 | |
| 
 | |
|   struct rb_root mytree = RB_ROOT;
 | |
| 
 | |
| Searching for a value in an rbtree
 | |
| ----------------------------------
 | |
| 
 | |
| Writing a search function for your tree is fairly straightforward: start at the
 | |
| root, compare each value, and follow the left or right branch as necessary.
 | |
| 
 | |
| Example::
 | |
| 
 | |
|   struct mytype *my_search(struct rb_root *root, char *string)
 | |
|   {
 | |
|   	struct rb_node *node = root->rb_node;
 | |
| 
 | |
|   	while (node) {
 | |
|   		struct mytype *data = container_of(node, struct mytype, node);
 | |
| 		int result;
 | |
| 
 | |
| 		result = strcmp(string, data->keystring);
 | |
| 
 | |
| 		if (result < 0)
 | |
|   			node = node->rb_left;
 | |
| 		else if (result > 0)
 | |
|   			node = node->rb_right;
 | |
| 		else
 | |
|   			return data;
 | |
| 	}
 | |
| 	return NULL;
 | |
|   }
 | |
| 
 | |
| Inserting data into an rbtree
 | |
| -----------------------------
 | |
| 
 | |
| Inserting data in the tree involves first searching for the place to insert the
 | |
| new node, then inserting the node and rebalancing ("recoloring") the tree.
 | |
| 
 | |
| The search for insertion differs from the previous search by finding the
 | |
| location of the pointer on which to graft the new node.  The new node also
 | |
| needs a link to its parent node for rebalancing purposes.
 | |
| 
 | |
| Example::
 | |
| 
 | |
|   int my_insert(struct rb_root *root, struct mytype *data)
 | |
|   {
 | |
|   	struct rb_node **new = &(root->rb_node), *parent = NULL;
 | |
| 
 | |
|   	/* Figure out where to put new node */
 | |
|   	while (*new) {
 | |
|   		struct mytype *this = container_of(*new, struct mytype, node);
 | |
|   		int result = strcmp(data->keystring, this->keystring);
 | |
| 
 | |
| 		parent = *new;
 | |
|   		if (result < 0)
 | |
|   			new = &((*new)->rb_left);
 | |
|   		else if (result > 0)
 | |
|   			new = &((*new)->rb_right);
 | |
|   		else
 | |
|   			return FALSE;
 | |
|   	}
 | |
| 
 | |
|   	/* Add new node and rebalance tree. */
 | |
|   	rb_link_node(&data->node, parent, new);
 | |
|   	rb_insert_color(&data->node, root);
 | |
| 
 | |
| 	return TRUE;
 | |
|   }
 | |
| 
 | |
| Removing or replacing existing data in an rbtree
 | |
| ------------------------------------------------
 | |
| 
 | |
| To remove an existing node from a tree, call::
 | |
| 
 | |
|   void rb_erase(struct rb_node *victim, struct rb_root *tree);
 | |
| 
 | |
| Example::
 | |
| 
 | |
|   struct mytype *data = my_search(&mytree, "walrus");
 | |
| 
 | |
|   if (data) {
 | |
|   	rb_erase(&data->node, &mytree);
 | |
|   	myfree(data);
 | |
|   }
 | |
| 
 | |
| To replace an existing node in a tree with a new one with the same key, call::
 | |
| 
 | |
|   void rb_replace_node(struct rb_node *old, struct rb_node *new,
 | |
|   			struct rb_root *tree);
 | |
| 
 | |
| Replacing a node this way does not re-sort the tree: If the new node doesn't
 | |
| have the same key as the old node, the rbtree will probably become corrupted.
 | |
| 
 | |
| Iterating through the elements stored in an rbtree (in sort order)
 | |
| ------------------------------------------------------------------
 | |
| 
 | |
| Four functions are provided for iterating through an rbtree's contents in
 | |
| sorted order.  These work on arbitrary trees, and should not need to be
 | |
| modified or wrapped (except for locking purposes)::
 | |
| 
 | |
|   struct rb_node *rb_first(struct rb_root *tree);
 | |
|   struct rb_node *rb_last(struct rb_root *tree);
 | |
|   struct rb_node *rb_next(struct rb_node *node);
 | |
|   struct rb_node *rb_prev(struct rb_node *node);
 | |
| 
 | |
| To start iterating, call rb_first() or rb_last() with a pointer to the root
 | |
| of the tree, which will return a pointer to the node structure contained in
 | |
| the first or last element in the tree.  To continue, fetch the next or previous
 | |
| node by calling rb_next() or rb_prev() on the current node.  This will return
 | |
| NULL when there are no more nodes left.
 | |
| 
 | |
| The iterator functions return a pointer to the embedded struct rb_node, from
 | |
| which the containing data structure may be accessed with the container_of()
 | |
| macro, and individual members may be accessed directly via
 | |
| rb_entry(node, type, member).
 | |
| 
 | |
| Example::
 | |
| 
 | |
|   struct rb_node *node;
 | |
|   for (node = rb_first(&mytree); node; node = rb_next(node))
 | |
| 	printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring);
 | |
| 
 | |
| Cached rbtrees
 | |
| --------------
 | |
| 
 | |
| Computing the leftmost (smallest) node is quite a common task for binary
 | |
| search trees, such as for traversals or users relying on a particular
 | |
| order for their own logic. To this end, users can use 'struct rb_root_cached'
 | |
| to optimize O(logN) rb_first() calls to a simple pointer fetch avoiding
 | |
| potentially expensive tree iterations. This is done at negligible runtime
 | |
| overhead for maintenance, albeit with a larger memory footprint.
 | |
| 
 | |
| Similar to the rb_root structure, cached rbtrees are initialized to be
 | |
| empty via::
 | |
| 
 | |
|   struct rb_root_cached mytree = RB_ROOT_CACHED;
 | |
| 
 | |
| Cached rbtree is simply a regular rb_root with an extra pointer to cache the
 | |
| leftmost node. This allows rb_root_cached to exist wherever rb_root does,
 | |
| which permits augmented trees to be supported as well, with only a few extra
 | |
| interfaces::
 | |
| 
 | |
|   struct rb_node *rb_first_cached(struct rb_root_cached *tree);
 | |
|   void rb_insert_color_cached(struct rb_node *, struct rb_root_cached *, bool);
 | |
|   void rb_erase_cached(struct rb_node *node, struct rb_root_cached *);
 | |
| 
 | |
| Both insert and erase calls have their respective counterparts for augmented
 | |
| trees::
 | |
| 
 | |
|   void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *,
 | |
| 				  bool, struct rb_augment_callbacks *);
 | |
|   void rb_erase_augmented_cached(struct rb_node *, struct rb_root_cached *,
 | |
| 				 struct rb_augment_callbacks *);
 | |
| 
 | |
| 
 | |
| Support for Augmented rbtrees
 | |
| -----------------------------
 | |
| 
 | |
| Augmented rbtree is an rbtree with "some" additional data stored in
 | |
| each node, where the additional data for node N must be a function of
 | |
| the contents of all nodes in the subtree rooted at N. This data can
 | |
| be used to augment some new functionality to rbtree. Augmented rbtree
 | |
| is an optional feature built on top of basic rbtree infrastructure.
 | |
| An rbtree user who wants this feature will have to call the augmentation
 | |
| functions with the user provided augmentation callback when inserting
 | |
| and erasing nodes.
 | |
| 
 | |
| C files implementing augmented rbtree manipulation must include
 | |
| <linux/rbtree_augmented.h> instead of <linux/rbtree.h>. Note that
 | |
| linux/rbtree_augmented.h exposes some rbtree implementation details
 | |
| you are not expected to rely on; please stick to the documented APIs
 | |
| there and do not include <linux/rbtree_augmented.h> from header files
 | |
| either so as to minimize chances of your users accidentally relying on
 | |
| such implementation details.
 | |
| 
 | |
| On insertion, the user must update the augmented information on the path
 | |
| leading to the inserted node, then call rb_link_node() as usual and
 | |
| rb_insert_augmented() instead of the usual rb_insert_color() call.
 | |
| If rb_insert_augmented() rebalances the rbtree, it will call back into
 | |
| a user provided function to update the augmented information on the
 | |
| affected subtrees.
 | |
| 
 | |
| When erasing a node, the user must call rb_erase_augmented() instead of
 | |
| rb_erase(). rb_erase_augmented() calls back into user provided functions
 | |
| to update the augmented information on affected subtrees.
 | |
| 
 | |
| In both cases, the callbacks are provided through struct rb_augment_callbacks.
 | |
| 3 callbacks must be defined:
 | |
| 
 | |
| - A propagation callback, which updates the augmented value for a given
 | |
|   node and its ancestors, up to a given stop point (or NULL to update
 | |
|   all the way to the root).
 | |
| 
 | |
| - A copy callback, which copies the augmented value for a given subtree
 | |
|   to a newly assigned subtree root.
 | |
| 
 | |
| - A tree rotation callback, which copies the augmented value for a given
 | |
|   subtree to a newly assigned subtree root AND recomputes the augmented
 | |
|   information for the former subtree root.
 | |
| 
 | |
| The compiled code for rb_erase_augmented() may inline the propagation and
 | |
| copy callbacks, which results in a large function, so each augmented rbtree
 | |
| user should have a single rb_erase_augmented() call site in order to limit
 | |
| compiled code size.
 | |
| 
 | |
| 
 | |
| Sample usage
 | |
| ^^^^^^^^^^^^
 | |
| 
 | |
| An interval tree is an example of an augmented rbtree. Reference -
 | |
| "Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein.
 | |
| More details about interval trees:
 | |
| 
 | |
| Classical rbtree has a single key and it cannot be directly used to store
 | |
| interval ranges like [lo:hi] and do a quick lookup for any overlap with a new
 | |
| lo:hi or to find whether there is an exact match for a new lo:hi.
 | |
| 
 | |
| However, rbtree can be augmented to store such interval ranges in a structured
 | |
| way making it possible to do efficient lookup and exact match.
 | |
| 
 | |
| This "extra information" stored in each node is the maximum hi
 | |
| (max_hi) value among all the nodes that are its descendants. This
 | |
| information can be maintained at each node just by looking at the node
 | |
| and its immediate children. And this will be used in O(log n) lookup
 | |
| for lowest match (lowest start address among all possible matches)
 | |
| with something like::
 | |
| 
 | |
|   struct interval_tree_node *
 | |
|   interval_tree_first_match(struct rb_root *root,
 | |
| 			    unsigned long start, unsigned long last)
 | |
|   {
 | |
| 	struct interval_tree_node *node;
 | |
| 
 | |
| 	if (!root->rb_node)
 | |
| 		return NULL;
 | |
| 	node = rb_entry(root->rb_node, struct interval_tree_node, rb);
 | |
| 
 | |
| 	while (true) {
 | |
| 		if (node->rb.rb_left) {
 | |
| 			struct interval_tree_node *left =
 | |
| 				rb_entry(node->rb.rb_left,
 | |
| 					 struct interval_tree_node, rb);
 | |
| 			if (left->__subtree_last >= start) {
 | |
| 				/*
 | |
| 				 * Some nodes in left subtree satisfy Cond2.
 | |
| 				 * Iterate to find the leftmost such node N.
 | |
| 				 * If it also satisfies Cond1, that's the match
 | |
| 				 * we are looking for. Otherwise, there is no
 | |
| 				 * matching interval as nodes to the right of N
 | |
| 				 * can't satisfy Cond1 either.
 | |
| 				 */
 | |
| 				node = left;
 | |
| 				continue;
 | |
| 			}
 | |
| 		}
 | |
| 		if (node->start <= last) {		/* Cond1 */
 | |
| 			if (node->last >= start)	/* Cond2 */
 | |
| 				return node;	/* node is leftmost match */
 | |
| 			if (node->rb.rb_right) {
 | |
| 				node = rb_entry(node->rb.rb_right,
 | |
| 					struct interval_tree_node, rb);
 | |
| 				if (node->__subtree_last >= start)
 | |
| 					continue;
 | |
| 			}
 | |
| 		}
 | |
| 		return NULL;	/* No match */
 | |
| 	}
 | |
|   }
 | |
| 
 | |
| Insertion/removal are defined using the following augmented callbacks::
 | |
| 
 | |
|   static inline unsigned long
 | |
|   compute_subtree_last(struct interval_tree_node *node)
 | |
|   {
 | |
| 	unsigned long max = node->last, subtree_last;
 | |
| 	if (node->rb.rb_left) {
 | |
| 		subtree_last = rb_entry(node->rb.rb_left,
 | |
| 			struct interval_tree_node, rb)->__subtree_last;
 | |
| 		if (max < subtree_last)
 | |
| 			max = subtree_last;
 | |
| 	}
 | |
| 	if (node->rb.rb_right) {
 | |
| 		subtree_last = rb_entry(node->rb.rb_right,
 | |
| 			struct interval_tree_node, rb)->__subtree_last;
 | |
| 		if (max < subtree_last)
 | |
| 			max = subtree_last;
 | |
| 	}
 | |
| 	return max;
 | |
|   }
 | |
| 
 | |
|   static void augment_propagate(struct rb_node *rb, struct rb_node *stop)
 | |
|   {
 | |
| 	while (rb != stop) {
 | |
| 		struct interval_tree_node *node =
 | |
| 			rb_entry(rb, struct interval_tree_node, rb);
 | |
| 		unsigned long subtree_last = compute_subtree_last(node);
 | |
| 		if (node->__subtree_last == subtree_last)
 | |
| 			break;
 | |
| 		node->__subtree_last = subtree_last;
 | |
| 		rb = rb_parent(&node->rb);
 | |
| 	}
 | |
|   }
 | |
| 
 | |
|   static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new)
 | |
|   {
 | |
| 	struct interval_tree_node *old =
 | |
| 		rb_entry(rb_old, struct interval_tree_node, rb);
 | |
| 	struct interval_tree_node *new =
 | |
| 		rb_entry(rb_new, struct interval_tree_node, rb);
 | |
| 
 | |
| 	new->__subtree_last = old->__subtree_last;
 | |
|   }
 | |
| 
 | |
|   static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
 | |
|   {
 | |
| 	struct interval_tree_node *old =
 | |
| 		rb_entry(rb_old, struct interval_tree_node, rb);
 | |
| 	struct interval_tree_node *new =
 | |
| 		rb_entry(rb_new, struct interval_tree_node, rb);
 | |
| 
 | |
| 	new->__subtree_last = old->__subtree_last;
 | |
| 	old->__subtree_last = compute_subtree_last(old);
 | |
|   }
 | |
| 
 | |
|   static const struct rb_augment_callbacks augment_callbacks = {
 | |
| 	augment_propagate, augment_copy, augment_rotate
 | |
|   };
 | |
| 
 | |
|   void interval_tree_insert(struct interval_tree_node *node,
 | |
| 			    struct rb_root *root)
 | |
|   {
 | |
| 	struct rb_node **link = &root->rb_node, *rb_parent = NULL;
 | |
| 	unsigned long start = node->start, last = node->last;
 | |
| 	struct interval_tree_node *parent;
 | |
| 
 | |
| 	while (*link) {
 | |
| 		rb_parent = *link;
 | |
| 		parent = rb_entry(rb_parent, struct interval_tree_node, rb);
 | |
| 		if (parent->__subtree_last < last)
 | |
| 			parent->__subtree_last = last;
 | |
| 		if (start < parent->start)
 | |
| 			link = &parent->rb.rb_left;
 | |
| 		else
 | |
| 			link = &parent->rb.rb_right;
 | |
| 	}
 | |
| 
 | |
| 	node->__subtree_last = last;
 | |
| 	rb_link_node(&node->rb, rb_parent, link);
 | |
| 	rb_insert_augmented(&node->rb, root, &augment_callbacks);
 | |
|   }
 | |
| 
 | |
|   void interval_tree_remove(struct interval_tree_node *node,
 | |
| 			    struct rb_root *root)
 | |
|   {
 | |
| 	rb_erase_augmented(&node->rb, root, &augment_callbacks);
 | |
|   }
 |