b7dd6f45c1
Resolves: bz#1479446 bz#1520882 bz#1579758 bz#1598407 bz#1599808
Resolves: bz#1603118 bz#1619357 bz#1622001 bz#1622308 bz#1631166
Resolves: bz#1631418 bz#1632563 bz#1634649 bz#1635071 bz#1635100
Resolves: bz#1635136 bz#1636291 bz#1638069 bz#1640347 bz#1642854
Resolves: bz#1643035 bz#1644120 bz#1644279 bz#1645916 bz#1647675
Signed-off-by: Milind Changire <mchangir@redhat.com>
From 72ce80749fca03ab97a63af79d4e6bc76a49ab64 Mon Sep 17 00:00:00 2001
From: Krutika Dhananjay <kdhananj@redhat.com>
Date: Fri, 5 Oct 2018 11:32:21 +0530
Subject: [PATCH 443/444] features/shard: Hold a ref on base inode when adding
 a shard to lru list

> Upstream: https://review.gluster.org/21454
> BUG: 1605056
> Change-Id: Ic15ca41444dd04684a9458bd4a526b1d3e160499

In __shard_update_shards_inode_list(), the shard translator previously
did not hold a ref on the base inode when a shard was added to the lru
list. If the base inode was then forgotten and destroyed, either by
fuse under memory pressure or because a different client deleted the
file while this client still held stale shards in its lru list, the
client would crash when locking lru_base_inode->lock owing to illegal
memory access.
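
A condensed sketch of the hazard (names as in shard.c; the exact
pre-patch lines appear as "-" lines in the hunks below):

    /* When a shard was added to the lru list, the base inode pointer
     * was stored without taking a ref: */
    ctx->base_inode = base_inode;

    /* Later, when evicting that shard from the lru list: */
    LOCK (&lru_base_inode->lock);   /* illegal memory access if the
                                     * base inode was destroyed */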

So now a ref is held on the base inode and stored in the inode ctx of
every shard that is added to the lru list, and it is released only
when that shard gets lru'd out.
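
The resulting ref discipline, condensed from the shard.c hunks below:

    /* when adding a shard to the lru list: pin its base inode */
    if (base_inode)
            ctx->base_inode = inode_ref (base_inode);

    /* when evicting that shard from the lru list: drop the ref,
     * guarding against the base inode no longer being in memory */
    if (lru_base_inode)
            inode_unref (lru_base_inode);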

The patch also handles the case where none of the shards associated
with a file that is about to be deleted is present in the LRU list,
and where the unlink at the beginning of the operation destroys the
base inode (because there are no refkeepers), so that all of the
shards about to be deleted are resolved without a base shard existing
in memory. If not handled properly, this too could lead to a crash.
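
That case is covered by tolerating a NULL base inode during eviction,
e.g. (condensed from __shard_update_shards_inode_list() below):

    if (!lru_base_inode)
            goto after_fsync_check;
    LOCK (&lru_base_inode->lock);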

Change-Id: Ic15ca41444dd04684a9458bd4a526b1d3e160499
BUG: 1603118
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/155318
Reviewed-by: Xavi Hernandez <xhernandez@redhat.com>
Tested-by: RHGS Build Bot <nigelb@redhat.com>
---
 tests/bugs/shard/bug-1605056-2.t             | 34 +++++++++++++++
 tests/bugs/shard/bug-1605056.t               | 63 ++++++++++++++++++++++++++++
 tests/bugs/shard/shard-inode-refcount-test.t |  2 +-
 tests/volume.rc                              | 12 ++++--
 xlators/features/shard/src/shard.c           | 48 +++++++++++++++------
 5 files changed, 141 insertions(+), 18 deletions(-)
 create mode 100644 tests/bugs/shard/bug-1605056-2.t
 create mode 100644 tests/bugs/shard/bug-1605056.t

diff --git a/tests/bugs/shard/bug-1605056-2.t b/tests/bugs/shard/bug-1605056-2.t
new file mode 100644
index 0000000..a9c10fe
--- /dev/null
+++ b/tests/bugs/shard/bug-1605056-2.t
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 features.shard-lru-limit 25
+TEST $CLI volume set $V0 performance.write-behind off
+
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+
+# Perform a write that would cause 25 shards to be created under .shard
+TEST dd if=/dev/zero of=$M0/foo bs=1M count=104
+
+# Write into another file bar to ensure all of foo's shards are evicted from lru list of $M0
+TEST dd if=/dev/zero of=$M0/bar bs=1M count=104
+
+# Delete foo from $M0. If there's a bug, the mount will crash.
+TEST unlink $M0/foo
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+
+cleanup
diff --git a/tests/bugs/shard/bug-1605056.t b/tests/bugs/shard/bug-1605056.t
new file mode 100644
index 0000000..c2329ea
--- /dev/null
+++ b/tests/bugs/shard/bug-1605056.t
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+SHARD_COUNT_TIME=5
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 features.shard-lru-limit 25
+TEST $CLI volume set $V0 performance.write-behind off
+
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M1
+
+# Perform a write that would cause 25 shards to be created under .shard
+TEST dd if=/dev/zero of=$M0/foo bs=1M count=104
+
+# Read the file from $M1, indirectly filling up the lru list.
+TEST `cat $M1/foo > /dev/null`
+statedump=$(generate_mount_statedump $V0 $M1)
+sleep 1
+EXPECT "25" echo $(grep "inode-count" $statedump | cut -f2 -d'=' | tail -1)
+rm -f $statedump
+
+# Delete foo from $M0.
+TEST unlink $M0/foo
+
+# Send stat on foo from $M1 to force $M1 to "forget" inode associated with foo.
+# Now the ghost shards associated with "foo" are still in lru list of $M1.
+TEST ! stat $M1/foo
+
+# Let's force the ghost shards of "foo" out of lru list by looking up more shards
+# through I/O on a file named "bar" from $M1. This should crash if the base inode
+# had been destroyed by now.
+
+TEST dd if=/dev/zero of=$M1/bar bs=1M count=104
+
+###############################################
+#### Now for some inode ref-leak tests ... ####
+###############################################
+
+# Expect there to be 29 active inodes - 26 belonging to "bar", 1 for .shard,
+# 1 for .shard/remove_me and 1 for '/'
+EXPECT_WITHIN $SHARD_COUNT_TIME `expr 26 + 3` get_mount_active_size_value $V0 $M1
+
+TEST rm -f $M1/bar
+EXPECT_WITHIN $SHARD_COUNT_TIME 3 get_mount_active_size_value $V0 $M1
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M1
+
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+
+cleanup
diff --git a/tests/bugs/shard/shard-inode-refcount-test.t b/tests/bugs/shard/shard-inode-refcount-test.t
index 087c8ba..3fd181b 100644
--- a/tests/bugs/shard/shard-inode-refcount-test.t
+++ b/tests/bugs/shard/shard-inode-refcount-test.t
@@ -21,7 +21,7 @@ TEST dd if=/dev/zero conv=fsync of=$M0/one-plus-five-shards bs=1M count=23
 ACTIVE_INODES_BEFORE=$(get_mount_active_size_value $V0)
 TEST rm -f $M0/one-plus-five-shards
 # Expect 5 inodes less. But one inode more than before because .remove_me would be created.
-EXPECT_WITHIN $SHARD_COUNT_TIME `expr $ACTIVE_INODES_BEFORE - 5 + 1` get_mount_active_size_value $V0
+EXPECT_WITHIN $SHARD_COUNT_TIME `expr $ACTIVE_INODES_BEFORE - 5 + 1` get_mount_active_size_value $V0 $M0
 
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
 TEST $CLI volume stop $V0
diff --git a/tests/volume.rc b/tests/volume.rc
index bba7e4e..6a983fd 100644
--- a/tests/volume.rc
+++ b/tests/volume.rc
@@ -93,7 +93,8 @@ function remove_brick_status_completed_field {
 
 function get_mount_process_pid {
         local vol=$1
-        ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol " | awk '{print $2}' | head -1
+        local mnt=$2
+        ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol .*$mnt" | awk '{print $2}' | head -1
 }
 
 function get_nfs_pid ()
@@ -126,7 +127,8 @@ function generate_statedump {
 
 function generate_mount_statedump {
         local vol=$1
-        generate_statedump $(get_mount_process_pid $vol)
+        local mnt=$2
+        generate_statedump $(get_mount_process_pid $vol $mnt)
 }
 
 function cleanup_mount_statedump {
@@ -850,7 +852,8 @@ function get_active_fd_count {
 
 function get_mount_active_size_value {
         local vol=$1
-        local statedump=$(generate_mount_statedump $vol)
+        local mount=$2
+        local statedump=$(generate_mount_statedump $vol $mount)
         sleep 1
         local val=$(grep "active_size" $statedump | cut -f2 -d'=' | tail -1)
         rm -f $statedump
@@ -859,7 +862,8 @@ function get_mount_active_size_value {
 
 function get_mount_lru_size_value {
         local vol=$1
-        local statedump=$(generate_mount_statedump $vol)
+        local mount=$2
+        local statedump=$(generate_mount_statedump $vol $mount)
         sleep 1
         local val=$(grep "lru_size" $statedump | cut -f2 -d'=' | tail -1)
         rm -f $statedump
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
index eb32168..fb88315 100644
--- a/xlators/features/shard/src/shard.c
+++ b/xlators/features/shard/src/shard.c
@@ -651,7 +651,8 @@ out:
 
 inode_t *
 __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
-                                  inode_t *base_inode, int block_num)
+                                  inode_t *base_inode, int block_num,
+                                  uuid_t gfid)
 {
         char block_bname[256] = {0,};
         inode_t *lru_inode = NULL;
@@ -679,10 +680,13 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
                         inode_ref (linked_inode);
                         if (base_inode)
                                 gf_uuid_copy (ctx->base_gfid, base_inode->gfid);
+                        else
+                                gf_uuid_copy(ctx->base_gfid, gfid);
                         ctx->block_num = block_num;
                         list_add_tail (&ctx->ilist, &priv->ilist_head);
                         priv->inode_count++;
-                        ctx->base_inode = base_inode;
+                        if (base_inode)
+                                ctx->base_inode = inode_ref (base_inode);
                 } else {
                 /*If on the other hand there is no available slot for this inode
                  * in the list, delete the lru inode from the head of the list,
@@ -701,6 +705,8 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
                          * deleted from fsync list and fsync'd in a new frame,
                          * and then unlinked in memory and forgotten.
                          */
+                        if (!lru_base_inode)
+                                goto after_fsync_check;
                         LOCK (&lru_base_inode->lock);
                         LOCK (&lru_inode->lock);
                         {
@@ -715,6 +721,7 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
                         UNLOCK (&lru_inode->lock);
                         UNLOCK (&lru_base_inode->lock);
 
+after_fsync_check:
                         if (!do_fsync) {
                                 shard_make_block_bname (lru_inode_ctx->block_num,
                                                         lru_inode_ctx->base_gfid,
@@ -729,20 +736,31 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
                                 inode_forget (lru_inode, 0);
                         } else {
                                 fsync_inode = lru_inode;
-                                inode_unref (lru_base_inode);
+                                if (lru_base_inode)
+                                        inode_unref (lru_base_inode);
                         }
                         /* The following unref corresponds to the ref
                          * held by inode_find() above.
                          */
                         inode_unref (lru_inode);
+
+                        /* The following unref corresponds to the ref held on
+                         * the base shard at the time of adding shard inode to
+                         * lru list
+                         */
+                        if (lru_base_inode)
+                                inode_unref (lru_base_inode);
                         /* For as long as an inode is in lru list, we try to
                          * keep it alive by holding a ref on it.
                          */
                         inode_ref (linked_inode);
                         if (base_inode)
                                 gf_uuid_copy (ctx->base_gfid, base_inode->gfid);
+                        else
+                                gf_uuid_copy (ctx->base_gfid, gfid);
                         ctx->block_num = block_num;
-                        ctx->base_inode = base_inode;
+                        if (base_inode)
+                                ctx->base_inode = inode_ref (base_inode);
                         list_add_tail (&ctx->ilist, &priv->ilist_head);
                 }
         } else {
@@ -1027,7 +1045,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this,
                         fsync_inode = __shard_update_shards_inode_list (inode,
                                                                         this,
                                                                         res_inode,
-                                                                        shard_idx_iter);
+                                                                        shard_idx_iter, gfid);
                 }
                 UNLOCK(&priv->lock);
                 shard_idx_iter++;
@@ -2173,7 +2191,8 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode,
                 fsync_inode = __shard_update_shards_inode_list (linked_inode,
                                                                 this,
                                                                 local->loc.inode,
-                                                                block_num);
+                                                                block_num,
+                                                                gfid);
         }
         UNLOCK(&priv->lock);
         if (fsync_inode)
@@ -2881,6 +2900,7 @@ void
 shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
 {
         char block_bname[256] = {0,};
+        int unref_base_inode = 0;
         uuid_t gfid = {0,};
         inode_t *inode = NULL;
         inode_t *base_inode = NULL;
@@ -2894,11 +2914,12 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
         priv = this->private;
 
         inode = local->inode_list[shard_block_num - local->first_block];
-        base_inode = local->resolver_base_inode;
+        shard_inode_ctx_get (inode, this, &ctx);
+        base_inode = ctx->base_inode;
         if (base_inode)
                 gf_uuid_copy (gfid, base_inode->gfid);
         else
-                gf_uuid_copy (gfid, local->base_gfid);
+                gf_uuid_copy (gfid, ctx->base_gfid);
 
         shard_make_block_bname (shard_block_num, gfid,
                                 block_bname, sizeof (block_bname));
@@ -2912,17 +2933,16 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
                 if (!list_empty (&ctx->ilist)) {
                         list_del_init (&ctx->ilist);
                         priv->inode_count--;
+                        unref_base_inode++;
                         GF_ASSERT (priv->inode_count >= 0);
                         unlink_unref_forget = _gf_true;
                 }
                 if (ctx->fsync_needed) {
-                        if (base_inode)
-                                inode_unref (base_inode);
+                        unref_base_inode++;
                         list_del_init (&ctx->to_fsync_list);
-                        if (base_inode) {
+                        if (base_inode)
                                 __shard_inode_ctx_get (base_inode, this, &base_ictx);
-                                base_ictx->fsync_count--;
-                        }
+                        base_ictx->fsync_count--;
                 }
         }
         UNLOCK(&inode->lock);
@@ -2933,6 +2953,8 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
                 inode_unref (inode);
                 inode_forget (inode, 0);
         }
+        if (base_inode && unref_base_inode)
+                inode_ref_reduce_by_n (base_inode, unref_base_inode);
         UNLOCK(&priv->lock);
 }
 
--
1.8.3.1