Fix cgroup destroy oops (rhbz 1045755)

This commit is contained in:
Josh Boyer 2014-02-12 10:37:45 -05:00
parent 5fe48c913d
commit 7845789edd
2 changed files with 349 additions and 0 deletions

342
cgroup-fixes.patch Normal file
View File

@ -0,0 +1,342 @@
From ab3f5faa6255a0eb4f832675507d9e295ca7e9ba Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 06 Feb 2014 23:56:01 +0000
Subject: cgroup: use an ordered workqueue for cgroup destruction
Sometimes the cleanup after memcg hierarchy testing gets stuck in
mem_cgroup_reparent_charges(), unable to bring non-kmem usage down to 0.
There may turn out to be several causes, but a major cause is this: the
workitem to offline parent can get run before workitem to offline child;
parent's mem_cgroup_reparent_charges() circles around waiting for the
child's pages to be reparented to its lrus, but it's holding cgroup_mutex
which prevents the child from reaching its mem_cgroup_reparent_charges().
Just use an ordered workqueue for cgroup_destroy_wq.
tj: Committing as the temporary fix until the reverse dependency can
be removed from memcg. Comment updated accordingly.
Fixes: e5fca243abae ("cgroup: use a dedicated workqueue for cgroup destruction")
Suggested-by: Filipe Brandenburger <filbranden@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: stable@vger.kernel.org # 3.10+
Signed-off-by: Tejun Heo <tj@kernel.org>
---
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e2f46ba..aa95591 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4845,12 +4845,16 @@ static int __init cgroup_wq_init(void)
/*
* There isn't much point in executing destruction path in
* parallel. Good chunk is serialized with cgroup_mutex anyway.
- * Use 1 for @max_active.
+ *
+ * XXX: Must be ordered to make sure parent is offlined after
+ * children. The ordering requirement is for memcg where a
+ * parent's offline may wait for a child's leading to deadlock. In
+ * the long term, this should be fixed from memcg side.
*
* We would prefer to do this in cgroup_init() above, but that
* is called before init_workqueues(): so leave this until after.
*/
- cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+ cgroup_destroy_wq = alloc_ordered_workqueue("cgroup_destroy", 0);
BUG_ON(!cgroup_destroy_wq);
/*
--
cgit v0.9.2
From eb46bf89696972b856a9adb6aebd5c7b65c266e4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 08 Feb 2014 15:26:33 +0000
Subject: cgroup: fix error return value in cgroup_mount()
When cgroup_mount() fails to allocate an id for the root, it didn't
set ret before jumping to unlock_drop ending up returning 0 after a
failure. Fix it.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: stable@vger.kernel.org
---
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index aa95591..793f371 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1566,10 +1566,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
mutex_lock(&cgroup_mutex);
mutex_lock(&cgroup_root_mutex);
- root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
- 0, 1, GFP_KERNEL);
- if (root_cgrp->id < 0)
+ ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
+ if (ret < 0)
goto unlock_drop;
+ root_cgrp->id = ret;
/* Check for name clashes with existing mounts */
ret = -EBUSY;
--
cgit v0.9.2
From b58c89986a77a23658682a100eb15d8edb571ebb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 08 Feb 2014 15:26:33 +0000
Subject: cgroup: fix error return from cgroup_create()
cgroup_create() was returning 0 after allocation failures. Fix it.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: stable@vger.kernel.org
---
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 793f371..0eb7b86 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4158,7 +4158,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
struct cgroup *cgrp;
struct cgroup_name *name;
struct cgroupfs_root *root = parent->root;
- int ssid, err = 0;
+ int ssid, err;
struct cgroup_subsys *ss;
struct super_block *sb = root->sb;
@@ -4168,8 +4168,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
return -ENOMEM;
name = cgroup_alloc_name(dentry);
- if (!name)
+ if (!name) {
+ err = -ENOMEM;
goto err_free_cgrp;
+ }
rcu_assign_pointer(cgrp->name, name);
/*
@@ -4177,8 +4179,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
* a half-baked cgroup.
*/
cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
- if (cgrp->id < 0)
+ if (cgrp->id < 0) {
+ err = -ENOMEM;
goto err_free_name;
+ }
/*
* Only live parents can have children. Note that the liveliness
--
cgit v0.9.2
From 48573a893303986e3b0b2974d6fb11f3d1bb7064 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 08 Feb 2014 15:26:34 +0000
Subject: cgroup: fix locking in cgroup_cfts_commit()
cgroup_cfts_commit() walks the cgroup hierarchy that the target
subsystem is attached to and tries to apply the file changes. Due to
the convolution with inode locking, it can't keep cgroup_mutex locked
while iterating. It currently holds only RCU read lock around the
actual iteration and then pins the found cgroup using dget().
Unfortunately, this is incorrect. Although the iteration does check
cgroup_is_dead() before invoking dget(), there's nothing which
prevents the dentry from going away inbetween. Note that this is
different from the usual css iterations where css_tryget() is used to
pin the css - css_tryget() tests whether the css can be pinned and
fails if not.
The problem can be solved by simply holding cgroup_mutex instead of
RCU read lock around the iteration, which actually reduces LOC.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: stable@vger.kernel.org
---
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0eb7b86..3edf716 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2763,10 +2763,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
*/
update_before = cgroup_serial_nr_next;
- mutex_unlock(&cgroup_mutex);
-
/* add/rm files for all cgroups created before */
- rcu_read_lock();
css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
struct cgroup *cgrp = css->cgroup;
@@ -2775,23 +2772,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
inode = cgrp->dentry->d_inode;
dget(cgrp->dentry);
- rcu_read_unlock();
-
dput(prev);
prev = cgrp->dentry;
+ mutex_unlock(&cgroup_mutex);
mutex_lock(&inode->i_mutex);
mutex_lock(&cgroup_mutex);
if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
ret = cgroup_addrm_files(cgrp, cfts, is_add);
- mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
-
- rcu_read_lock();
if (ret)
break;
}
- rcu_read_unlock();
+ mutex_unlock(&cgroup_mutex);
dput(prev);
deactivate_super(sb);
return ret;
--
cgit v0.9.2
From 0ab02ca8f887908152d1a96db5130fc661d36a1e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 11 Feb 2014 08:05:46 +0000
Subject: cgroup: protect modifications to cgroup_idr with cgroup_mutex
Setup cgroupfs like this:
# mount -t cgroup -o cpuacct xxx /cgroup
# mkdir /cgroup/sub1
# mkdir /cgroup/sub2
Then run these two commands:
# for ((; ;)) { mkdir /cgroup/sub1/tmp && rmdir /mnt/sub1/tmp; } &
# for ((; ;)) { mkdir /cgroup/sub2/tmp && rmdir /mnt/sub2/tmp; } &
After seconds you may see this warning:
------------[ cut here ]------------
WARNING: CPU: 1 PID: 25243 at lib/idr.c:527 sub_remove+0x87/0x1b0()
idr_remove called for id=6 which is not allocated.
...
Call Trace:
[<ffffffff8156063c>] dump_stack+0x7a/0x96
[<ffffffff810591ac>] warn_slowpath_common+0x8c/0xc0
[<ffffffff81059296>] warn_slowpath_fmt+0x46/0x50
[<ffffffff81300aa7>] sub_remove+0x87/0x1b0
[<ffffffff810f3f02>] ? css_killed_work_fn+0x32/0x1b0
[<ffffffff81300bf5>] idr_remove+0x25/0xd0
[<ffffffff810f2bab>] cgroup_destroy_css_killed+0x5b/0xc0
[<ffffffff810f4000>] css_killed_work_fn+0x130/0x1b0
[<ffffffff8107cdbc>] process_one_work+0x26c/0x550
[<ffffffff8107eefe>] worker_thread+0x12e/0x3b0
[<ffffffff81085f96>] kthread+0xe6/0xf0
[<ffffffff81570bac>] ret_from_fork+0x7c/0xb0
---[ end trace 2d1577ec10cf80d0 ]---
It's because allocating/removing cgroup ID is not properly synchronized.
The bug was introduced when we converted cgroup_ida to cgroup_idr.
While synchronization is already done inside ida_simple_{get,remove}(),
users are responsible for concurrent calls to idr_{alloc,remove}().
tj: Refreshed on top of b58c89986a77 ("cgroup: fix error return from
cgroup_create()").
Fixes: 4e96ee8e981b ("cgroup: convert cgroup_ida to cgroup_idr")
Cc: <stable@vger.kernel.org> #3.12+
Reported-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5c09759..9450f02 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -166,6 +166,8 @@ struct cgroup {
*
* The ID of the root cgroup is always 0, and a new cgroup
* will be assigned with a smallest available ID.
+ *
+ * Allocating/Removing ID must be protected by cgroup_mutex.
*/
int id;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3edf716..52719ce 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -886,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
* per-subsystem and moved to css->id so that lookups are
* successful until the target css is released.
*/
+ mutex_lock(&cgroup_mutex);
idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+ mutex_unlock(&cgroup_mutex);
cgrp->id = -1;
call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
@@ -4168,16 +4170,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
rcu_assign_pointer(cgrp->name, name);
/*
- * Temporarily set the pointer to NULL, so idr_find() won't return
- * a half-baked cgroup.
- */
- cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
- if (cgrp->id < 0) {
- err = -ENOMEM;
- goto err_free_name;
- }
-
- /*
* Only live parents can have children. Note that the liveliness
* check isn't strictly necessary because cgroup_mkdir() and
* cgroup_rmdir() are fully synchronized by i_mutex; however, do it
@@ -4186,7 +4178,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
*/
if (!cgroup_lock_live_group(parent)) {
err = -ENODEV;
- goto err_free_id;
+ goto err_free_name;
+ }
+
+ /*
+ * Temporarily set the pointer to NULL, so idr_find() won't return
+ * a half-baked cgroup.
+ */
+ cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
+ if (cgrp->id < 0) {
+ err = -ENOMEM;
+ goto err_unlock;
}
/* Grab a reference on the superblock so the hierarchy doesn't
@@ -4218,7 +4220,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
*/
err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
if (err < 0)
- goto err_unlock;
+ goto err_free_id;
lockdep_assert_held(&dentry->d_inode->i_mutex);
cgrp->serial_nr = cgroup_serial_nr_next++;
@@ -4254,12 +4256,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
return 0;
-err_unlock:
- mutex_unlock(&cgroup_mutex);
- /* Release the reference count that we took on the superblock */
- deactivate_super(sb);
err_free_id:
idr_remove(&root->cgroup_idr, cgrp->id);
+ /* Release the reference count that we took on the superblock */
+ deactivate_super(sb);
+err_unlock:
+ mutex_unlock(&cgroup_mutex);
err_free_name:
kfree(rcu_dereference_raw(cgrp->name));
err_free_cgrp:
--
cgit v0.9.2

View File

@ -629,6 +629,9 @@ Patch25193: fix-exynos-hdmi-build.patch
#rhbz 1031296
Patch25194: tick-Clear-broadcast-pending-bit-when-switching-to-oneshot.patch
#rhbz 1045755
Patch25195: cgroup-fixes.patch
# END OF PATCH DEFINITIONS
%endif
@ -1280,6 +1283,9 @@ ApplyPatch fix-exynos-hdmi-build.patch
#rhbz 1031296
ApplyPatch tick-Clear-broadcast-pending-bit-when-switching-to-oneshot.patch
#rhbz 1045755
ApplyPatch cgroup-fixes.patch
# END OF PATCH APPLICATIONS
%endif
@ -2060,6 +2066,7 @@ fi
# || ||
%changelog
* Wed Feb 12 2014 Josh Boyer <jwboyer@fedoraproject.org>
- Fix cgroup destroy oops (rhbz 1045755)
- Fix backtrace in amd_e400_idle (rhbz 1031296)
* Tue Feb 11 2014 Josh Boyer <jwboyer@fedoraproject.org> - 3.14.0-0.rc2.git1.1