From 7845789edd0ec32e4756ef845de93c70d7641c2f Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Wed, 12 Feb 2014 10:37:45 -0500 Subject: [PATCH] Fix cgroup destroy oops (rhbz 1045755) --- cgroup-fixes.patch | 342 +++++++++++++++++++++++++++++++++++++++++++++ kernel.spec | 7 + 2 files changed, 349 insertions(+) create mode 100644 cgroup-fixes.patch diff --git a/cgroup-fixes.patch b/cgroup-fixes.patch new file mode 100644 index 000000000..b76be1d70 --- /dev/null +++ b/cgroup-fixes.patch @@ -0,0 +1,342 @@ +From ab3f5faa6255a0eb4f832675507d9e295ca7e9ba Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Thu, 06 Feb 2014 23:56:01 +0000 +Subject: cgroup: use an ordered workqueue for cgroup destruction + +Sometimes the cleanup after memcg hierarchy testing gets stuck in +mem_cgroup_reparent_charges(), unable to bring non-kmem usage down to 0. + +There may turn out to be several causes, but a major cause is this: the +workitem to offline parent can get run before workitem to offline child; +parent's mem_cgroup_reparent_charges() circles around waiting for the +child's pages to be reparented to its lrus, but it's holding cgroup_mutex +which prevents the child from reaching its mem_cgroup_reparent_charges(). + +Just use an ordered workqueue for cgroup_destroy_wq. + +tj: Committing as the temporary fix until the reverse dependency can + be removed from memcg. Comment updated accordingly. + +Fixes: e5fca243abae ("cgroup: use a dedicated workqueue for cgroup destruction") +Suggested-by: Filipe Brandenburger +Signed-off-by: Hugh Dickins +Cc: stable@vger.kernel.org # 3.10+ +Signed-off-by: Tejun Heo +--- +diff --git a/kernel/cgroup.c b/kernel/cgroup.c +index e2f46ba..aa95591 100644 +--- a/kernel/cgroup.c ++++ b/kernel/cgroup.c +@@ -4845,12 +4845,16 @@ static int __init cgroup_wq_init(void) + /* + * There isn't much point in executing destruction path in + * parallel. Good chunk is serialized with cgroup_mutex anyway. +- * Use 1 for @max_active. ++ * ++ * XXX: Must be ordered to make sure parent is offlined after ++ * children. The ordering requirement is for memcg where a ++ * parent's offline may wait for a child's leading to deadlock. In ++ * the long term, this should be fixed from memcg side. + * + * We would prefer to do this in cgroup_init() above, but that + * is called before init_workqueues(): so leave this until after. + */ +- cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); ++ cgroup_destroy_wq = alloc_ordered_workqueue("cgroup_destroy", 0); + BUG_ON(!cgroup_destroy_wq); + + /* +-- +cgit v0.9.2 +From eb46bf89696972b856a9adb6aebd5c7b65c266e4 Mon Sep 17 00:00:00 2001 +From: Tejun Heo +Date: Sat, 08 Feb 2014 15:26:33 +0000 +Subject: cgroup: fix error return value in cgroup_mount() + +When cgroup_mount() fails to allocate an id for the root, it didn't +set ret before jumping to unlock_drop ending up returning 0 after a +failure. Fix it. + +Signed-off-by: Tejun Heo +Acked-by: Li Zefan +Cc: stable@vger.kernel.org +--- +diff --git a/kernel/cgroup.c b/kernel/cgroup.c +index aa95591..793f371 100644 +--- a/kernel/cgroup.c ++++ b/kernel/cgroup.c +@@ -1566,10 +1566,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, + mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); + +- root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, +- 0, 1, GFP_KERNEL); +- if (root_cgrp->id < 0) ++ ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); ++ if (ret < 0) + goto unlock_drop; ++ root_cgrp->id = ret; + + /* Check for name clashes with existing mounts */ + ret = -EBUSY; +-- +cgit v0.9.2 +From b58c89986a77a23658682a100eb15d8edb571ebb Mon Sep 17 00:00:00 2001 +From: Tejun Heo +Date: Sat, 08 Feb 2014 15:26:33 +0000 +Subject: cgroup: fix error return from cgroup_create() + +cgroup_create() was returning 0 after allocation failures. Fix it. + +Signed-off-by: Tejun Heo +Acked-by: Li Zefan +Cc: stable@vger.kernel.org +--- +diff --git a/kernel/cgroup.c b/kernel/cgroup.c +index 793f371..0eb7b86 100644 +--- a/kernel/cgroup.c ++++ b/kernel/cgroup.c +@@ -4158,7 +4158,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, + struct cgroup *cgrp; + struct cgroup_name *name; + struct cgroupfs_root *root = parent->root; +- int ssid, err = 0; ++ int ssid, err; + struct cgroup_subsys *ss; + struct super_block *sb = root->sb; + +@@ -4168,8 +4168,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, + return -ENOMEM; + + name = cgroup_alloc_name(dentry); +- if (!name) ++ if (!name) { ++ err = -ENOMEM; + goto err_free_cgrp; ++ } + rcu_assign_pointer(cgrp->name, name); + + /* +@@ -4177,8 +4179,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, + * a half-baked cgroup. + */ + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); +- if (cgrp->id < 0) ++ if (cgrp->id < 0) { ++ err = -ENOMEM; + goto err_free_name; ++ } + + /* + * Only live parents can have children. Note that the liveliness +-- +cgit v0.9.2 +From 48573a893303986e3b0b2974d6fb11f3d1bb7064 Mon Sep 17 00:00:00 2001 +From: Tejun Heo +Date: Sat, 08 Feb 2014 15:26:34 +0000 +Subject: cgroup: fix locking in cgroup_cfts_commit() + +cgroup_cfts_commit() walks the cgroup hierarchy that the target +subsystem is attached to and tries to apply the file changes. Due to +the convolution with inode locking, it can't keep cgroup_mutex locked +while iterating. It currently holds only RCU read lock around the +actual iteration and then pins the found cgroup using dget(). + +Unfortunately, this is incorrect. Although the iteration does check +cgroup_is_dead() before invoking dget(), there's nothing which +prevents the dentry from going away inbetween. Note that this is +different from the usual css iterations where css_tryget() is used to +pin the css - css_tryget() tests whether the css can be pinned and +fails if not. + +The problem can be solved by simply holding cgroup_mutex instead of +RCU read lock around the iteration, which actually reduces LOC. + +Signed-off-by: Tejun Heo +Acked-by: Li Zefan +Cc: stable@vger.kernel.org +--- +diff --git a/kernel/cgroup.c b/kernel/cgroup.c +index 0eb7b86..3edf716 100644 +--- a/kernel/cgroup.c ++++ b/kernel/cgroup.c +@@ -2763,10 +2763,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) + */ + update_before = cgroup_serial_nr_next; + +- mutex_unlock(&cgroup_mutex); +- + /* add/rm files for all cgroups created before */ +- rcu_read_lock(); + css_for_each_descendant_pre(css, cgroup_css(root, ss)) { + struct cgroup *cgrp = css->cgroup; + +@@ -2775,23 +2772,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) + + inode = cgrp->dentry->d_inode; + dget(cgrp->dentry); +- rcu_read_unlock(); +- + dput(prev); + prev = cgrp->dentry; + ++ mutex_unlock(&cgroup_mutex); + mutex_lock(&inode->i_mutex); + mutex_lock(&cgroup_mutex); + if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) + ret = cgroup_addrm_files(cgrp, cfts, is_add); +- mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); +- +- rcu_read_lock(); + if (ret) + break; + } +- rcu_read_unlock(); ++ mutex_unlock(&cgroup_mutex); + dput(prev); + deactivate_super(sb); + return ret; +-- +cgit v0.9.2 +From 0ab02ca8f887908152d1a96db5130fc661d36a1e Mon Sep 17 00:00:00 2001 +From: Li Zefan +Date: Tue, 11 Feb 2014 08:05:46 +0000 +Subject: cgroup: protect modifications to cgroup_idr with cgroup_mutex + +Setup cgroupfs like this: + # mount -t cgroup -o cpuacct xxx /cgroup + # mkdir /cgroup/sub1 + # mkdir /cgroup/sub2 + +Then run these two commands: + # for ((; ;)) { mkdir /cgroup/sub1/tmp && rmdir /mnt/sub1/tmp; } & + # for ((; ;)) { mkdir /cgroup/sub2/tmp && rmdir /mnt/sub2/tmp; } & + +After seconds you may see this warning: + +------------[ cut here ]------------ +WARNING: CPU: 1 PID: 25243 at lib/idr.c:527 sub_remove+0x87/0x1b0() +idr_remove called for id=6 which is not allocated. +... +Call Trace: + [] dump_stack+0x7a/0x96 + [] warn_slowpath_common+0x8c/0xc0 + [] warn_slowpath_fmt+0x46/0x50 + [] sub_remove+0x87/0x1b0 + [] ? css_killed_work_fn+0x32/0x1b0 + [] idr_remove+0x25/0xd0 + [] cgroup_destroy_css_killed+0x5b/0xc0 + [] css_killed_work_fn+0x130/0x1b0 + [] process_one_work+0x26c/0x550 + [] worker_thread+0x12e/0x3b0 + [] kthread+0xe6/0xf0 + [] ret_from_fork+0x7c/0xb0 +---[ end trace 2d1577ec10cf80d0 ]--- + +It's because allocating/removing cgroup ID is not properly synchronized. + +The bug was introduced when we converted cgroup_ida to cgroup_idr. +While synchronization is already done inside ida_simple_{get,remove}(), +users are responsible for concurrent calls to idr_{alloc,remove}(). + +tj: Refreshed on top of b58c89986a77 ("cgroup: fix error return from +cgroup_create()"). + +Fixes: 4e96ee8e981b ("cgroup: convert cgroup_ida to cgroup_idr") +Cc: #3.12+ +Reported-by: Michal Hocko +Signed-off-by: Li Zefan +Signed-off-by: Tejun Heo +--- +diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h +index 5c09759..9450f02 100644 +--- a/include/linux/cgroup.h ++++ b/include/linux/cgroup.h +@@ -166,6 +166,8 @@ struct cgroup { + * + * The ID of the root cgroup is always 0, and a new cgroup + * will be assigned with a smallest available ID. ++ * ++ * Allocating/Removing ID must be protected by cgroup_mutex. + */ + int id; + +diff --git a/kernel/cgroup.c b/kernel/cgroup.c +index 3edf716..52719ce 100644 +--- a/kernel/cgroup.c ++++ b/kernel/cgroup.c +@@ -886,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) + * per-subsystem and moved to css->id so that lookups are + * successful until the target css is released. + */ ++ mutex_lock(&cgroup_mutex); + idr_remove(&cgrp->root->cgroup_idr, cgrp->id); ++ mutex_unlock(&cgroup_mutex); + cgrp->id = -1; + + call_rcu(&cgrp->rcu_head, cgroup_free_rcu); +@@ -4168,16 +4170,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, + rcu_assign_pointer(cgrp->name, name); + + /* +- * Temporarily set the pointer to NULL, so idr_find() won't return +- * a half-baked cgroup. +- */ +- cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); +- if (cgrp->id < 0) { +- err = -ENOMEM; +- goto err_free_name; +- } +- +- /* + * Only live parents can have children. Note that the liveliness + * check isn't strictly necessary because cgroup_mkdir() and + * cgroup_rmdir() are fully synchronized by i_mutex; however, do it +@@ -4186,7 +4178,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, + */ + if (!cgroup_lock_live_group(parent)) { + err = -ENODEV; +- goto err_free_id; ++ goto err_free_name; ++ } ++ ++ /* ++ * Temporarily set the pointer to NULL, so idr_find() won't return ++ * a half-baked cgroup. ++ */ ++ cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); ++ if (cgrp->id < 0) { ++ err = -ENOMEM; ++ goto err_unlock; + } + + /* Grab a reference on the superblock so the hierarchy doesn't +@@ -4218,7 +4220,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, + */ + err = cgroup_create_file(dentry, S_IFDIR | mode, sb); + if (err < 0) +- goto err_unlock; ++ goto err_free_id; + lockdep_assert_held(&dentry->d_inode->i_mutex); + + cgrp->serial_nr = cgroup_serial_nr_next++; +@@ -4254,12 +4256,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, + + return 0; + +-err_unlock: +- mutex_unlock(&cgroup_mutex); +- /* Release the reference count that we took on the superblock */ +- deactivate_super(sb); + err_free_id: + idr_remove(&root->cgroup_idr, cgrp->id); ++ /* Release the reference count that we took on the superblock */ ++ deactivate_super(sb); ++err_unlock: ++ mutex_unlock(&cgroup_mutex); + err_free_name: + kfree(rcu_dereference_raw(cgrp->name)); + err_free_cgrp: +-- +cgit v0.9.2 diff --git a/kernel.spec b/kernel.spec index 4b98a9e8c..00f24448d 100644 --- a/kernel.spec +++ b/kernel.spec @@ -629,6 +629,9 @@ Patch25193: fix-exynos-hdmi-build.patch #rhbz 1031296 Patch25194: tick-Clear-broadcast-pending-bit-when-switching-to-oneshot.patch +#rhbz 1045755 +Patch25195: cgroup-fixes.patch + # END OF PATCH DEFINITIONS %endif @@ -1280,6 +1283,9 @@ ApplyPatch fix-exynos-hdmi-build.patch #rhbz 1031296 ApplyPatch tick-Clear-broadcast-pending-bit-when-switching-to-oneshot.patch +#rhbz 1045755 +ApplyPatch cgroup-fixes.patch + # END OF PATCH APPLICATIONS %endif @@ -2060,6 +2066,7 @@ fi # || || %changelog * Wed Feb 12 2014 Josh Boyer +- Fix cgroup destroy oops (rhbz 1045755) - Fix backtrace in amd_e400_idle (rhbz 1031296) * Tue Feb 11 2014 Josh Boyer - 3.14.0-0.rc2.git1.1