forked from rpms/kernel

Add epoll patch until it gets upstreamed
Add trial patch to fix the "scheduling while atomic" issues around pidmap_init
epoll can acquire multiple ep->mutex on multiple "struct eventpoll"s
at once in the case where one epoll fd is monitoring another epoll
fd. This is perfectly OK, since we're careful about the lock ordering,
but causes spurious lockdep warnings. Annotate the recursion using
mutex_lock_nested, and add a comment explaining the nesting rules for
good measure.

Reported-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Nelson Elhage <nelhage@nelhage.com>
---
 I've tested this on a synthetic epoll test case that just adds e1 to
 e2 and then does an epoll_wait(). I verified that it caused lockdep
 problems on 3.0 and that this patch fixed it, but I haven't done more
 extensive testing. Paul, are you able to test systemd against this?
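
 For reference, a minimal user-space sketch of the kind of test described
 above (one epoll fd added to another, then a wait on the outer one) might
 look like the following. This is illustrative only, not the exact test
 case used; the names e1 and e2 simply follow the description above.

#include <stdio.h>
#include <sys/epoll.h>

int main(void)
{
	int e1, e2;
	struct epoll_event ev = { .events = EPOLLIN };
	struct epoll_event out;

	/* Two epoll instances; adding e1 to e2 means a poll of e2 must
	 * also poll e1, nesting one ep->mtx inside the other. */
	e1 = epoll_create(1);
	e2 = epoll_create(1);
	if (e1 < 0 || e2 < 0) {
		perror("epoll_create");
		return 1;
	}

	if (epoll_ctl(e2, EPOLL_CTL_ADD, e1, &ev) < 0) {
		perror("epoll_ctl");
		return 1;
	}

	/* A short timed wait is enough to walk the nested eventpoll. */
	epoll_wait(e2, &out, 1, 100);
	return 0;
}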

 fs/eventpoll.c |   25 ++++++++++++++++++-------
 1 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index f9cfd16..0cb7bc6 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -76,6 +76,15 @@
  * Events that require holding "epmutex" are very rare, while for
  * normal operations the epoll private "ep->mtx" will guarantee
  * a better scalability.
+ * It is possible to acquire multiple "ep->mtx"es at once in the case
+ * when one epoll fd is added to another. In this case, we always
+ * acquire the locks in the order of nesting (i.e. after epoll_ctl(e1,
+ * EPOLL_CTL_ADD, e2), e1->mtx will always be acquired before
+ * e2->mtx). Since we disallow cycles of epoll file descriptors, this
+ * ensures that the mutexes are well-ordered. In order to communicate
+ * this nesting to lockdep, when walking a tree of epoll file
+ * descriptors, we use the current recursion depth as the lockdep
+ * subkey.
  */
 
 /* Epoll private bits inside the event mask */
@@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
  * @ep: Pointer to the epoll private data structure.
  * @sproc: Pointer to the scan callback.
  * @priv: Private opaque data passed to the @sproc callback.
+ * @depth: The current depth of recursive f_op->poll calls.
  *
  * Returns: The same integer error code returned by the @sproc callback.
  */
 static int ep_scan_ready_list(struct eventpoll *ep,
 			      int (*sproc)(struct eventpoll *,
 					   struct list_head *, void *),
-			      void *priv)
+			      void *priv,
+			      int depth)
 {
 	int error, pwake = 0;
 	unsigned long flags;
@@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	 * We need to lock this because we could be hit by
 	 * eventpoll_release_file() and epoll_ctl().
 	 */
-	mutex_lock(&ep->mtx);
+	mutex_lock_nested(&ep->mtx, depth);
 
 	/*
 	 * Steal the ready list, and re-init the original one to the
@@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 
 static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
 {
-	return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
+	return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
 }
 
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
@@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file)
 
 		ep = epi->ep;
 		list_del_init(&epi->fllink);
-		mutex_lock(&ep->mtx);
+		mutex_lock_nested(&ep->mtx, 0);
 		ep_remove(ep, epi);
 		mutex_unlock(&ep->mtx);
 	}
@@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep,
 	esed.maxevents = maxevents;
 	esed.events = events;
 
-	return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
+	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
 }
 
 static inline struct timespec ep_set_mstimeout(long ms)
@@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
 	struct rb_node *rbp;
 	struct epitem *epi;
 
-	mutex_lock(&ep->mtx);
+	mutex_lock_nested(&ep->mtx, call_nests + 1);
 	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
 		epi = rb_entry(rbp, struct epitem, rbn);
 		if (unlikely(is_file_epoll(epi->ffd.file))) {
@@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	}
 
 
-	mutex_lock(&ep->mtx);
+	mutex_lock_nested(&ep->mtx, 0);
 
 	/*
 	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
-- 
1.7.4.1