Fix cgroup destroy oops (rhbz 1045755)
This commit is contained in:
parent
5fe48c913d
commit
7845789edd
342
cgroup-fixes.patch
Normal file
342
cgroup-fixes.patch
Normal file
@ -0,0 +1,342 @@
|
||||
From ab3f5faa6255a0eb4f832675507d9e295ca7e9ba Mon Sep 17 00:00:00 2001
|
||||
From: Hugh Dickins <hughd@google.com>
|
||||
Date: Thu, 06 Feb 2014 23:56:01 +0000
|
||||
Subject: cgroup: use an ordered workqueue for cgroup destruction
|
||||
|
||||
Sometimes the cleanup after memcg hierarchy testing gets stuck in
|
||||
mem_cgroup_reparent_charges(), unable to bring non-kmem usage down to 0.
|
||||
|
||||
There may turn out to be several causes, but a major cause is this: the
|
||||
workitem to offline parent can get run before workitem to offline child;
|
||||
parent's mem_cgroup_reparent_charges() circles around waiting for the
|
||||
child's pages to be reparented to its lrus, but it's holding cgroup_mutex
|
||||
which prevents the child from reaching its mem_cgroup_reparent_charges().
|
||||
|
||||
Just use an ordered workqueue for cgroup_destroy_wq.
|
||||
|
||||
tj: Committing as the temporary fix until the reverse dependency can
|
||||
be removed from memcg. Comment updated accordingly.
|
||||
|
||||
Fixes: e5fca243abae ("cgroup: use a dedicated workqueue for cgroup destruction")
|
||||
Suggested-by: Filipe Brandenburger <filbranden@google.com>
|
||||
Signed-off-by: Hugh Dickins <hughd@google.com>
|
||||
Cc: stable@vger.kernel.org # 3.10+
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
---
|
||||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
|
||||
index e2f46ba..aa95591 100644
|
||||
--- a/kernel/cgroup.c
|
||||
+++ b/kernel/cgroup.c
|
||||
@@ -4845,12 +4845,16 @@ static int __init cgroup_wq_init(void)
|
||||
/*
|
||||
* There isn't much point in executing destruction path in
|
||||
* parallel. Good chunk is serialized with cgroup_mutex anyway.
|
||||
- * Use 1 for @max_active.
|
||||
+ *
|
||||
+ * XXX: Must be ordered to make sure parent is offlined after
|
||||
+ * children. The ordering requirement is for memcg where a
|
||||
+ * parent's offline may wait for a child's leading to deadlock. In
|
||||
+ * the long term, this should be fixed from memcg side.
|
||||
*
|
||||
* We would prefer to do this in cgroup_init() above, but that
|
||||
* is called before init_workqueues(): so leave this until after.
|
||||
*/
|
||||
- cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
|
||||
+ cgroup_destroy_wq = alloc_ordered_workqueue("cgroup_destroy", 0);
|
||||
BUG_ON(!cgroup_destroy_wq);
|
||||
|
||||
/*
|
||||
--
|
||||
cgit v0.9.2
|
||||
From eb46bf89696972b856a9adb6aebd5c7b65c266e4 Mon Sep 17 00:00:00 2001
|
||||
From: Tejun Heo <tj@kernel.org>
|
||||
Date: Sat, 08 Feb 2014 15:26:33 +0000
|
||||
Subject: cgroup: fix error return value in cgroup_mount()
|
||||
|
||||
When cgroup_mount() fails to allocate an id for the root, it didn't
|
||||
set ret before jumping to unlock_drop ending up returning 0 after a
|
||||
failure. Fix it.
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
Acked-by: Li Zefan <lizefan@huawei.com>
|
||||
Cc: stable@vger.kernel.org
|
||||
---
|
||||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
|
||||
index aa95591..793f371 100644
|
||||
--- a/kernel/cgroup.c
|
||||
+++ b/kernel/cgroup.c
|
||||
@@ -1566,10 +1566,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
||||
mutex_lock(&cgroup_mutex);
|
||||
mutex_lock(&cgroup_root_mutex);
|
||||
|
||||
- root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
|
||||
- 0, 1, GFP_KERNEL);
|
||||
- if (root_cgrp->id < 0)
|
||||
+ ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
|
||||
+ if (ret < 0)
|
||||
goto unlock_drop;
|
||||
+ root_cgrp->id = ret;
|
||||
|
||||
/* Check for name clashes with existing mounts */
|
||||
ret = -EBUSY;
|
||||
--
|
||||
cgit v0.9.2
|
||||
From b58c89986a77a23658682a100eb15d8edb571ebb Mon Sep 17 00:00:00 2001
|
||||
From: Tejun Heo <tj@kernel.org>
|
||||
Date: Sat, 08 Feb 2014 15:26:33 +0000
|
||||
Subject: cgroup: fix error return from cgroup_create()
|
||||
|
||||
cgroup_create() was returning 0 after allocation failures. Fix it.
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
Acked-by: Li Zefan <lizefan@huawei.com>
|
||||
Cc: stable@vger.kernel.org
|
||||
---
|
||||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
|
||||
index 793f371..0eb7b86 100644
|
||||
--- a/kernel/cgroup.c
|
||||
+++ b/kernel/cgroup.c
|
||||
@@ -4158,7 +4158,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
|
||||
struct cgroup *cgrp;
|
||||
struct cgroup_name *name;
|
||||
struct cgroupfs_root *root = parent->root;
|
||||
- int ssid, err = 0;
|
||||
+ int ssid, err;
|
||||
struct cgroup_subsys *ss;
|
||||
struct super_block *sb = root->sb;
|
||||
|
||||
@@ -4168,8 +4168,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
|
||||
return -ENOMEM;
|
||||
|
||||
name = cgroup_alloc_name(dentry);
|
||||
- if (!name)
|
||||
+ if (!name) {
|
||||
+ err = -ENOMEM;
|
||||
goto err_free_cgrp;
|
||||
+ }
|
||||
rcu_assign_pointer(cgrp->name, name);
|
||||
|
||||
/*
|
||||
@@ -4177,8 +4179,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
|
||||
* a half-baked cgroup.
|
||||
*/
|
||||
cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
|
||||
- if (cgrp->id < 0)
|
||||
+ if (cgrp->id < 0) {
|
||||
+ err = -ENOMEM;
|
||||
goto err_free_name;
|
||||
+ }
|
||||
|
||||
/*
|
||||
* Only live parents can have children. Note that the liveliness
|
||||
--
|
||||
cgit v0.9.2
|
||||
From 48573a893303986e3b0b2974d6fb11f3d1bb7064 Mon Sep 17 00:00:00 2001
|
||||
From: Tejun Heo <tj@kernel.org>
|
||||
Date: Sat, 08 Feb 2014 15:26:34 +0000
|
||||
Subject: cgroup: fix locking in cgroup_cfts_commit()
|
||||
|
||||
cgroup_cfts_commit() walks the cgroup hierarchy that the target
|
||||
subsystem is attached to and tries to apply the file changes. Due to
|
||||
the convolution with inode locking, it can't keep cgroup_mutex locked
|
||||
while iterating. It currently holds only RCU read lock around the
|
||||
actual iteration and then pins the found cgroup using dget().
|
||||
|
||||
Unfortunately, this is incorrect. Although the iteration does check
|
||||
cgroup_is_dead() before invoking dget(), there's nothing which
|
||||
prevents the dentry from going away inbetween. Note that this is
|
||||
different from the usual css iterations where css_tryget() is used to
|
||||
pin the css - css_tryget() tests whether the css can be pinned and
|
||||
fails if not.
|
||||
|
||||
The problem can be solved by simply holding cgroup_mutex instead of
|
||||
RCU read lock around the iteration, which actually reduces LOC.
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
Acked-by: Li Zefan <lizefan@huawei.com>
|
||||
Cc: stable@vger.kernel.org
|
||||
---
|
||||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
|
||||
index 0eb7b86..3edf716 100644
|
||||
--- a/kernel/cgroup.c
|
||||
+++ b/kernel/cgroup.c
|
||||
@@ -2763,10 +2763,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
|
||||
*/
|
||||
update_before = cgroup_serial_nr_next;
|
||||
|
||||
- mutex_unlock(&cgroup_mutex);
|
||||
-
|
||||
/* add/rm files for all cgroups created before */
|
||||
- rcu_read_lock();
|
||||
css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
|
||||
struct cgroup *cgrp = css->cgroup;
|
||||
|
||||
@@ -2775,23 +2772,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
|
||||
|
||||
inode = cgrp->dentry->d_inode;
|
||||
dget(cgrp->dentry);
|
||||
- rcu_read_unlock();
|
||||
-
|
||||
dput(prev);
|
||||
prev = cgrp->dentry;
|
||||
|
||||
+ mutex_unlock(&cgroup_mutex);
|
||||
mutex_lock(&inode->i_mutex);
|
||||
mutex_lock(&cgroup_mutex);
|
||||
if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
|
||||
ret = cgroup_addrm_files(cgrp, cfts, is_add);
|
||||
- mutex_unlock(&cgroup_mutex);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
-
|
||||
- rcu_read_lock();
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
- rcu_read_unlock();
|
||||
+ mutex_unlock(&cgroup_mutex);
|
||||
dput(prev);
|
||||
deactivate_super(sb);
|
||||
return ret;
|
||||
--
|
||||
cgit v0.9.2
|
||||
From 0ab02ca8f887908152d1a96db5130fc661d36a1e Mon Sep 17 00:00:00 2001
|
||||
From: Li Zefan <lizefan@huawei.com>
|
||||
Date: Tue, 11 Feb 2014 08:05:46 +0000
|
||||
Subject: cgroup: protect modifications to cgroup_idr with cgroup_mutex
|
||||
|
||||
Setup cgroupfs like this:
|
||||
# mount -t cgroup -o cpuacct xxx /cgroup
|
||||
# mkdir /cgroup/sub1
|
||||
# mkdir /cgroup/sub2
|
||||
|
||||
Then run these two commands:
|
||||
# for ((; ;)) { mkdir /cgroup/sub1/tmp && rmdir /mnt/sub1/tmp; } &
|
||||
# for ((; ;)) { mkdir /cgroup/sub2/tmp && rmdir /mnt/sub2/tmp; } &
|
||||
|
||||
After seconds you may see this warning:
|
||||
|
||||
------------[ cut here ]------------
|
||||
WARNING: CPU: 1 PID: 25243 at lib/idr.c:527 sub_remove+0x87/0x1b0()
|
||||
idr_remove called for id=6 which is not allocated.
|
||||
...
|
||||
Call Trace:
|
||||
[<ffffffff8156063c>] dump_stack+0x7a/0x96
|
||||
[<ffffffff810591ac>] warn_slowpath_common+0x8c/0xc0
|
||||
[<ffffffff81059296>] warn_slowpath_fmt+0x46/0x50
|
||||
[<ffffffff81300aa7>] sub_remove+0x87/0x1b0
|
||||
[<ffffffff810f3f02>] ? css_killed_work_fn+0x32/0x1b0
|
||||
[<ffffffff81300bf5>] idr_remove+0x25/0xd0
|
||||
[<ffffffff810f2bab>] cgroup_destroy_css_killed+0x5b/0xc0
|
||||
[<ffffffff810f4000>] css_killed_work_fn+0x130/0x1b0
|
||||
[<ffffffff8107cdbc>] process_one_work+0x26c/0x550
|
||||
[<ffffffff8107eefe>] worker_thread+0x12e/0x3b0
|
||||
[<ffffffff81085f96>] kthread+0xe6/0xf0
|
||||
[<ffffffff81570bac>] ret_from_fork+0x7c/0xb0
|
||||
---[ end trace 2d1577ec10cf80d0 ]---
|
||||
|
||||
It's because allocating/removing cgroup ID is not properly synchronized.
|
||||
|
||||
The bug was introduced when we converted cgroup_ida to cgroup_idr.
|
||||
While synchronization is already done inside ida_simple_{get,remove}(),
|
||||
users are responsible for concurrent calls to idr_{alloc,remove}().
|
||||
|
||||
tj: Refreshed on top of b58c89986a77 ("cgroup: fix error return from
|
||||
cgroup_create()").
|
||||
|
||||
Fixes: 4e96ee8e981b ("cgroup: convert cgroup_ida to cgroup_idr")
|
||||
Cc: <stable@vger.kernel.org> #3.12+
|
||||
Reported-by: Michal Hocko <mhocko@suse.cz>
|
||||
Signed-off-by: Li Zefan <lizefan@huawei.com>
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
---
|
||||
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
|
||||
index 5c09759..9450f02 100644
|
||||
--- a/include/linux/cgroup.h
|
||||
+++ b/include/linux/cgroup.h
|
||||
@@ -166,6 +166,8 @@ struct cgroup {
|
||||
*
|
||||
* The ID of the root cgroup is always 0, and a new cgroup
|
||||
* will be assigned with a smallest available ID.
|
||||
+ *
|
||||
+ * Allocating/Removing ID must be protected by cgroup_mutex.
|
||||
*/
|
||||
int id;
|
||||
|
||||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
|
||||
index 3edf716..52719ce 100644
|
||||
--- a/kernel/cgroup.c
|
||||
+++ b/kernel/cgroup.c
|
||||
@@ -886,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
|
||||
* per-subsystem and moved to css->id so that lookups are
|
||||
* successful until the target css is released.
|
||||
*/
|
||||
+ mutex_lock(&cgroup_mutex);
|
||||
idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
|
||||
+ mutex_unlock(&cgroup_mutex);
|
||||
cgrp->id = -1;
|
||||
|
||||
call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
|
||||
@@ -4168,16 +4170,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
|
||||
rcu_assign_pointer(cgrp->name, name);
|
||||
|
||||
/*
|
||||
- * Temporarily set the pointer to NULL, so idr_find() won't return
|
||||
- * a half-baked cgroup.
|
||||
- */
|
||||
- cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
|
||||
- if (cgrp->id < 0) {
|
||||
- err = -ENOMEM;
|
||||
- goto err_free_name;
|
||||
- }
|
||||
-
|
||||
- /*
|
||||
* Only live parents can have children. Note that the liveliness
|
||||
* check isn't strictly necessary because cgroup_mkdir() and
|
||||
* cgroup_rmdir() are fully synchronized by i_mutex; however, do it
|
||||
@@ -4186,7 +4178,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
|
||||
*/
|
||||
if (!cgroup_lock_live_group(parent)) {
|
||||
err = -ENODEV;
|
||||
- goto err_free_id;
|
||||
+ goto err_free_name;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Temporarily set the pointer to NULL, so idr_find() won't return
|
||||
+ * a half-baked cgroup.
|
||||
+ */
|
||||
+ cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
|
||||
+ if (cgrp->id < 0) {
|
||||
+ err = -ENOMEM;
|
||||
+ goto err_unlock;
|
||||
}
|
||||
|
||||
/* Grab a reference on the superblock so the hierarchy doesn't
|
||||
@@ -4218,7 +4220,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
|
||||
*/
|
||||
err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
|
||||
if (err < 0)
|
||||
- goto err_unlock;
|
||||
+ goto err_free_id;
|
||||
lockdep_assert_held(&dentry->d_inode->i_mutex);
|
||||
|
||||
cgrp->serial_nr = cgroup_serial_nr_next++;
|
||||
@@ -4254,12 +4256,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
|
||||
|
||||
return 0;
|
||||
|
||||
-err_unlock:
|
||||
- mutex_unlock(&cgroup_mutex);
|
||||
- /* Release the reference count that we took on the superblock */
|
||||
- deactivate_super(sb);
|
||||
err_free_id:
|
||||
idr_remove(&root->cgroup_idr, cgrp->id);
|
||||
+ /* Release the reference count that we took on the superblock */
|
||||
+ deactivate_super(sb);
|
||||
+err_unlock:
|
||||
+ mutex_unlock(&cgroup_mutex);
|
||||
err_free_name:
|
||||
kfree(rcu_dereference_raw(cgrp->name));
|
||||
err_free_cgrp:
|
||||
--
|
||||
cgit v0.9.2
|
@ -629,6 +629,9 @@ Patch25193: fix-exynos-hdmi-build.patch
|
||||
#rhbz 1031296
|
||||
Patch25194: tick-Clear-broadcast-pending-bit-when-switching-to-oneshot.patch
|
||||
|
||||
#rhbz 1045755
|
||||
Patch25195: cgroup-fixes.patch
|
||||
|
||||
# END OF PATCH DEFINITIONS
|
||||
|
||||
%endif
|
||||
@ -1280,6 +1283,9 @@ ApplyPatch fix-exynos-hdmi-build.patch
|
||||
#rhbz 1031296
|
||||
ApplyPatch tick-Clear-broadcast-pending-bit-when-switching-to-oneshot.patch
|
||||
|
||||
#rhbz 1045755
|
||||
ApplyPatch cgroup-fixes.patch
|
||||
|
||||
# END OF PATCH APPLICATIONS
|
||||
|
||||
%endif
|
||||
@ -2060,6 +2066,7 @@ fi
|
||||
# || ||
|
||||
%changelog
|
||||
* Wed Feb 12 2014 Josh Boyer <jwboyer@fedoraproject.org>
|
||||
- Fix cgroup destroy oops (rhbz 1045755)
|
||||
- Fix backtrace in amd_e400_idle (rhbz 1031296)
|
||||
|
||||
* Tue Feb 11 2014 Josh Boyer <jwboyer@fedoraproject.org> - 3.14.0-0.rc2.git1.1
|
||||
|
Loading…
Reference in New Issue
Block a user