diff --git a/TODO b/TODO index 54216ac87..7883ae53f 100644 --- a/TODO +++ b/TODO @@ -20,6 +20,7 @@ * 000[12]-mm-* * x86-efi-Calling-__pa-with-an-ioremap-address-is-invalid.patch (also CC'd to stable) +* epoll-limit-paths.patch (in linux-next via -mm tree) **** Other stuff that should go upstream (in decreasing likelihood) ************************************ diff --git a/epoll-limit-paths.patch b/epoll-limit-paths.patch new file mode 100644 index 000000000..440db27b9 --- /dev/null +++ b/epoll-limit-paths.patch @@ -0,0 +1,465 @@ +From 6a4ca79652219cf22da800d990e5b46feaea1ad9 Mon Sep 17 00:00:00 2001 +From: Jason Baron +Date: Mon, 24 Oct 2011 14:59:02 +1100 +Subject: [PATCH] epoll: limit paths + +epoll: limit paths + +The current epoll code can be tickled to run basically indefinitely in +both the loop detection path check (on ep_insert()) and in the wakeup paths. +The programs that tickle this behavior set up deeply linked networks of +epoll file descriptors that cause the epoll algorithms to traverse them +indefinitely. A couple of these sample programs have been previously +posted in this thread: https://lkml.org/lkml/2011/2/25/297. + +To fix the loop detection path check algorithms, I simply keep track of +the epoll nodes that have already been visited. Thus, the loop detection +becomes proportional to the number of epoll file descriptors and links. +This dramatically decreases the run-time of the loop check algorithm. In +one diabolical case I tried it reduced the run-time from 15 minutes (all +in kernel time) to 0.3 seconds. + +Fixing the wakeup paths could be done at wakeup time in a similar manner +by keeping track of nodes that have already been visited, but the +problem is harder, since there can be multiple wakeups on different +CPUs. Thus, I've opted to limit the number of possible wakeup paths when +the paths are created. + +This is accomplished by noting that the end file descriptors that +are found during the loop detection pass (from the newly added link) are +actually the sources for wakeup events. I keep a list of these file +descriptors and limit the number and length of these paths that emanate +from these 'source file descriptors'. In the current implementation I +allow 1000 paths of length 1, 500 of length 2, 100 of length 3, 50 of +length 4 and 10 of length 5. Note that it is sufficient to check the +'source file descriptors' reachable from the newly added link, since no +other 'source file descriptors' will have newly added links. This allows +us to check only the wakeup paths that may have gotten too long, and not +re-check all possible wakeup paths on the system. + +In terms of the path limit selection, I think it's first worth noting that +the most common case for epoll is probably the model where you have 1 +epoll file descriptor that is monitoring n 'source file +descriptors'. In this case, each 'source file descriptor' has 1 path of +length 1. Thus, I believe that the limits I'm proposing are quite +reasonable and in fact may be too generous. Hence, I'm hoping that the +proposed limits will not cause any workloads that currently work to +fail. + +In terms of locking, I have extended the use of the 'epmutex' to all +epoll_ctl add and remove operations. Currently it's only used in a subset +of the add paths. I need to hold the epmutex so that we can correctly +traverse a coherent graph to check the number of paths.
I believe that +this additional locking is probably OK, since it's in the setup/teardown +paths, and doesn't affect the running paths, but it certainly is going to +add some extra overhead. Also worth noting is that the epmutex was +recently added to the epoll_ctl add operations in the initial path loop +detection code using the argument that it was not on a critical path. + +Another thing to note here is the length of epoll chains that is allowed. +Currently, eventpoll.c defines: + +/* Maximum number of nesting allowed inside epoll sets */ +#define EP_MAX_NESTS 4 + +This basically means that I am limited to a graph depth of 5 (EP_MAX_NESTS ++ 1). However, this limit is currently only enforced during the loop +check detection code, and only when the epoll file descriptors are added +in a certain order. Thus, this limit is currently easily bypassed. The +newly added check for wakeup paths strictly limits the wakeup paths to a +length of 5, regardless of the order in which ep's are linked together. +Thus, a side-effect of the new code is a more consistent enforcement of +the graph depth. + +Thus far, I've tested this using the sample programs previously +mentioned, which now either return quickly or return -EINVAL. I've also +tested using the piptest.c epoll tester, which showed no difference in +performance. I've also created a number of different epoll networks and +tested that they behave as expected. + +I believe this solves the original diabolical test cases, while still +preserving sane epoll nesting. + +Signed-off-by: Jason Baron +Cc: Nelson Elhage +Cc: Davide Libenzi +Signed-off-by: Andrew Morton +--- + fs/eventpoll.c | 226 ++++++++++++++++++++++++++++++++++++++++----- + include/linux/eventpoll.h | 1 + + include/linux/fs.h | 1 + + 3 files changed, 203 insertions(+), 25 deletions(-) + +diff --git a/fs/eventpoll.c b/fs/eventpoll.c +index 4a53743..414ac74 100644 +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -197,6 +197,12 @@ struct eventpoll { + + /* The user that created the eventpoll descriptor */ + struct user_struct *user; ++ ++ struct file *file; ++ ++ /* used to optimize loop detection check */ ++ int visited; ++ struct list_head visitedllink; + }; + + /* Wait structure used by the poll hooks */ +@@ -255,6 +261,12 @@ static struct kmem_cache *epi_cache __read_mostly; + /* Slab cache used to allocate "struct eppoll_entry" */ + static struct kmem_cache *pwq_cache __read_mostly; + ++/* Visited nodes during ep_loop_check(), so we can unset them when we finish */ ++LIST_HEAD(visited_list); ++ ++/* Files with newly added links, which need a limit on emanating paths */ ++LIST_HEAD(tfile_check_list); ++ + #ifdef CONFIG_SYSCTL + + #include + +@@ -276,6 +288,12 @@ ctl_table epoll_table[] = { + }; + #endif /* CONFIG_SYSCTL */ + ++static const struct file_operations eventpoll_fops; ++ ++static inline int is_file_epoll(struct file *f) ++{ ++ return f->f_op == &eventpoll_fops; ++} + + /* Setup the structure that is used as key for the RB tree */ + static inline void ep_set_ffd(struct epoll_filefd *ffd, +@@ -711,12 +729,6 @@ static const struct file_operations eventpoll_fops = { + .llseek = noop_llseek, + }; + +-/* Fast test to see if the file is an evenpoll file */ +-static inline int is_file_epoll(struct file *f) +-{ +- return f->f_op == &eventpoll_fops; +-} +- + /* + * This is called from eventpoll_release() to unlink files from the eventpoll + * interface.
We need to have this facility to cleanup correctly files that are +@@ -926,6 +938,96 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) + rb_insert_color(&epi->rbn, &ep->rbr); + } + ++ ++ ++#define PATH_ARR_SIZE 5 ++/* These are the number paths of length 1 to 5, that we are allowing to emanate ++ * from a single file of interest. For example, we allow 1000 paths of length ++ * 1, to emanate from each file of interest. This essentially represents the ++ * potential wakeup paths, which need to be limited in order to avoid massive ++ * uncontrolled wakeup storms. The common use case should be a single ep which ++ * is connected to n file sources. In this case each file source has 1 path ++ * of length 1. Thus, the numbers below should be more than sufficient. ++ */ ++int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; ++int path_count[PATH_ARR_SIZE]; ++ ++static int path_count_inc(int nests) ++{ ++ if (++path_count[nests] > path_limits[nests]) ++ return -1; ++ return 0; ++} ++ ++static void path_count_init(void) ++{ ++ int i; ++ ++ for (i = 0; i < PATH_ARR_SIZE; i++) ++ path_count[i] = 0; ++} ++ ++static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) ++{ ++ int error = 0; ++ struct file *file = priv; ++ struct file *child_file; ++ struct epitem *epi; ++ ++ list_for_each_entry(epi, &file->f_ep_links, fllink) { ++ child_file = epi->ep->file; ++ if (is_file_epoll(child_file)) { ++ if (list_empty(&child_file->f_ep_links)) { ++ if (path_count_inc(call_nests)) { ++ error = -1; ++ break; ++ } ++ } else { ++ error = ep_call_nested(&poll_loop_ncalls, ++ EP_MAX_NESTS, ++ reverse_path_check_proc, ++ child_file, child_file, ++ current); ++ } ++ if (error != 0) ++ break; ++ } else { ++ printk(KERN_ERR "reverse_path_check_proc: " ++ "file is not an ep!\n"); ++ } ++ } ++ return error; ++} ++ ++/** ++ * reverse_path_check - The tfile_check_list is list of file *, which have ++ * links that are proposed to be newly added. We need to ++ * make sure that those added links don't add too many ++ * paths such that we will spend all our time waking up ++ * eventpoll objects. ++ * ++ * Returns: Returns zero if the proposed links don't create too many paths, ++ * -1 otherwise. ++ */ ++static int reverse_path_check(void) ++{ ++ int length = 0; ++ int error = 0; ++ struct file *current_file; ++ ++ /* let's call this for all tfiles */ ++ list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { ++ length++; ++ path_count_init(); ++ error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, ++ reverse_path_check_proc, current_file, ++ current_file, current); ++ if (error) ++ break; ++ } ++ return error; ++} ++ + /* + * Must be called with "mtx" held. 
+ */ +@@ -987,6 +1089,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, + */ + ep_rbtree_insert(ep, epi); + ++ /* now check if we've created too many backpaths */ ++ error = -EINVAL; ++ if (reverse_path_check()) ++ goto error_remove_epi; ++ + /* We have to drop the new item inside our item list to keep track of it */ + spin_lock_irqsave(&ep->lock, flags); + +@@ -1011,6 +1118,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, + + return 0; + ++error_remove_epi: ++ spin_lock(&tfile->f_lock); ++ if (ep_is_linked(&epi->fllink)) ++ list_del_init(&epi->fllink); ++ spin_unlock(&tfile->f_lock); ++ ++ rb_erase(&epi->rbn, &ep->rbr); ++ + error_unregister: + ep_unregister_pollwait(ep, epi); + +@@ -1275,18 +1390,35 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) + int error = 0; + struct file *file = priv; + struct eventpoll *ep = file->private_data; ++ struct eventpoll *ep_tovisit; + struct rb_node *rbp; + struct epitem *epi; + + mutex_lock_nested(&ep->mtx, call_nests + 1); ++ ep->visited = 1; ++ list_add(&ep->visitedllink, &visited_list); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (unlikely(is_file_epoll(epi->ffd.file))) { ++ ep_tovisit = epi->ffd.file->private_data; ++ if (ep_tovisit->visited) ++ continue; + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, +- ep_loop_check_proc, epi->ffd.file, +- epi->ffd.file->private_data, current); ++ ep_loop_check_proc, epi->ffd.file, ++ ep_tovisit, current); + if (error != 0) + break; ++ } else { ++ /* if we've reached a file that is not associated with ++ * an ep, then then we need to check if the newly added ++ * links are going to add too many wakeup paths. We do ++ * this by adding it to the tfile_check_list, if it's ++ * not already there, and calling reverse_path_check() ++ * during ep_insert() ++ */ ++ if (list_empty(&epi->ffd.file->f_tfile_llink)) ++ list_add(&epi->ffd.file->f_tfile_llink, ++ &tfile_check_list); + } + } + mutex_unlock(&ep->mtx); +@@ -1307,8 +1439,30 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) + */ + static int ep_loop_check(struct eventpoll *ep, struct file *file) + { +- return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, ++ int ret; ++ struct eventpoll *ep_cur, *ep_next; ++ ++ ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, file, ep, current); ++ /* clear visited list */ ++ list_for_each_entry_safe(ep_cur, ep_next, &visited_list, visitedllink) { ++ ep_cur->visited = 0; ++ list_del(&ep_cur->visitedllink); ++ } ++ return ret; ++} ++ ++static void clear_tfile_check_list(void) ++{ ++ struct file *file; ++ ++ /* first clear the tfile_check_list */ ++ while (!list_empty(&tfile_check_list)) { ++ file = list_first_entry(&tfile_check_list, struct file, ++ f_tfile_llink); ++ list_del_init(&file->f_tfile_llink); ++ } ++ INIT_LIST_HEAD(&tfile_check_list); + } + + /* +@@ -1316,8 +1470,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file) + */ + SYSCALL_DEFINE1(epoll_create1, int, flags) + { +- int error; ++ int error, fd; + struct eventpoll *ep = NULL; ++ struct file *file; + + /* Check the EPOLL_* constant for consistency. */ + BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); +@@ -1334,11 +1489,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) + * Creates all the items needed to setup an eventpoll file. That is, + * a file structure and a free file descriptor. 
+ */ +- error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, ++ fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); ++ if (fd < 0) { ++ error = fd; ++ goto out_free_ep; ++ } ++ file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, + O_RDWR | (flags & O_CLOEXEC)); +- if (error < 0) +- ep_free(ep); +- ++ if (IS_ERR(file)) { ++ error = PTR_ERR(file); ++ goto out_free_fd; ++ } ++ fd_install(fd, file); ++ ep->file = file; ++ return fd; ++ ++out_free_fd: ++ put_unused_fd(fd); ++out_free_ep: ++ ep_free(ep); + return error; + } + +@@ -1404,21 +1573,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + /* + * When we insert an epoll file descriptor, inside another epoll file + * descriptor, there is the change of creating closed loops, which are +- * better be handled here, than in more critical paths. ++ * better be handled here, than in more critical paths. While we are ++ * checking for loops we also determine the list of files reachable ++ * and hang them on the tfile_check_list, so we can check that we ++ * haven't created too many possible wakeup paths. + * +- * We hold epmutex across the loop check and the insert in this case, in +- * order to prevent two separate inserts from racing and each doing the +- * insert "at the same time" such that ep_loop_check passes on both +- * before either one does the insert, thereby creating a cycle. ++ * We need to hold the epmutex across both ep_insert and ep_remove ++ * b/c we want to make sure we are looking at a coherent view of ++ * epoll network. + */ +- if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { ++ if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { + mutex_lock(&epmutex); + did_lock_epmutex = 1; +- error = -ELOOP; +- if (ep_loop_check(ep, tfile) != 0) +- goto error_tgt_fput; + } +- ++ if (op == EPOLL_CTL_ADD) { ++ if (is_file_epoll(tfile)) { ++ error = -ELOOP; ++ if (ep_loop_check(ep, tfile) != 0) ++ goto error_tgt_fput; ++ } else ++ list_add(&tfile->f_tfile_llink, &tfile_check_list); ++ } + + mutex_lock_nested(&ep->mtx, 0); + +@@ -1437,6 +1612,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + error = ep_insert(ep, &epds, tfile, fd); + } else + error = -EEXIST; ++ clear_tfile_check_list(); + break; + case EPOLL_CTL_DEL: + if (epi) +@@ -1455,7 +1631,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + mutex_unlock(&ep->mtx); + + error_tgt_fput: +- if (unlikely(did_lock_epmutex)) ++ if (did_lock_epmutex) + mutex_unlock(&epmutex); + + fput(tfile); +diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h +index f362733..657ab55 100644 +--- a/include/linux/eventpoll.h ++++ b/include/linux/eventpoll.h +@@ -61,6 +61,7 @@ struct file; + static inline void eventpoll_init_file(struct file *file) + { + INIT_LIST_HEAD(&file->f_ep_links); ++ INIT_LIST_HEAD(&file->f_tfile_llink); + } + + +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 277f497..93778e0 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -985,6 +985,7 @@ struct file { + #ifdef CONFIG_EPOLL + /* Used by fs/eventpoll.c to link all the hooks to this file */ + struct list_head f_ep_links; ++ struct list_head f_tfile_llink; + #endif /* #ifdef CONFIG_EPOLL */ + struct address_space *f_mapping; + #ifdef CONFIG_DEBUG_WRITECOUNT +-- +1.7.6.4 + diff --git a/kernel.spec b/kernel.spec index 96984b714..b9cbb019c 100644 --- a/kernel.spec +++ b/kernel.spec @@ -51,7 +51,7 @@ Summary: The Linux kernel # For non-released -rc kernels, this will be prepended with "0.", so # for example a 3 here will become 0.3 # 
-%global baserelease 2 +%global baserelease 3 %global fedora_build %{baserelease} # base_sublevel is the kernel version we're starting with and patching @@ -730,6 +730,7 @@ Patch12025: rcu-avoid-just-onlined-cpu-resched.patch Patch12026: block-stray-block-put-after-teardown.patch Patch12027: usb-add-quirk-for-logitech-webcams.patch Patch12029: crypto-register-cryptd-first.patch +Patch12030: epoll-limit-paths.patch Patch12303: dmar-disable-when-ricoh-multifunction.patch @@ -1360,6 +1361,7 @@ ApplyPatch add-appleir-usb-driver.patch ApplyPatch udlfb-bind-framebuffer-to-interface.patch ApplyPatch ums-realtek-driver-uses-stack-memory-for-DMA.patch ApplyPatch epoll-fix-spurious-lockdep-warnings.patch +ApplyPatch epoll-limit-paths.patch ApplyPatch rcu-avoid-just-onlined-cpu-resched.patch ApplyPatch block-stray-block-put-after-teardown.patch ApplyPatch usb-add-quirk-for-logitech-webcams.patch @@ -2104,6 +2106,9 @@ fi # ||----w | # || || %changelog +* Tue Oct 25 2011 Josh Boyer +- CVE-2011-1083: excessive in-kernel CPU consumption when creating large nested epoll structures (rhbz 748668) + * Mon Oct 24 2011 Josh Boyer - Backport 3 fixes from linux-next to fix dib0700 playback (rhbz 733827)
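For reference, a minimal user-space sketch of the kind of nested-epoll chain that the patch above is designed to reject. It is illustrative only and is not part of the patch or of kernel.spec: the chain depth of 10, the use of a pipe read end as the single 'source file descriptor', and the expectation that the failing epoll_ctl() reports ELOOP or EINVAL are assumptions drawn from the commit message, which states that such programs now either return quickly or return -EINVAL and that wakeup paths are strictly limited to a length of 5 (EP_MAX_NESTS + 1).

/*
 * Illustrative sketch (assumed example, not shipped with this patch):
 * build a chain of nested epoll file descriptors and report the depth at
 * which EPOLL_CTL_ADD starts to be rejected.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	struct epoll_event ev = { .events = EPOLLIN };
	int pipefd[2];
	int prev, cur, depth;

	if (pipe(pipefd) < 0)
		return 1;

	/* The pipe's read end acts as the single 'source file descriptor'. */
	prev = epoll_create1(0);
	if (prev < 0 || epoll_ctl(prev, EPOLL_CTL_ADD, pipefd[0], &ev) < 0)
		return 1;

	/* Each new epoll fd watches the previous one, lengthening the wakeup path. */
	for (depth = 2; depth <= 10; depth++) {
		cur = epoll_create1(0);
		if (cur < 0)
			return 1;
		if (epoll_ctl(cur, EPOLL_CTL_ADD, prev, &ev) < 0) {
			printf("EPOLL_CTL_ADD rejected at depth %d: %s\n",
			       depth, strerror(errno));
			return 0;
		}
		prev = cur;
	}
	printf("chain of depth 10 accepted\n");
	return 0;
}

On a kernel with the patch applied, the add is expected to start failing once the chain grows past the enforced depth, while a flat layout (one epoll descriptor watching many sources) stays well inside the 1000/500/100/50/10 per-source path limits described in the commit message.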