From 8a3c0fb64c8798ecf5a3635fe0922e3cfd476817 Mon Sep 17 00:00:00 2001
From: Pranith Kumar K <pkarampu@redhat.com>
Date: Mon, 27 Aug 2018 12:40:16 +0530
Subject: [PATCH 425/444] cluster/afr: Delegate name-heal when possible

Problem:
When name-self-heal is triggered on the mount, it blocks the
lookup until the name-self-heal completes. This can lead to
hangs when a lot of clients access a directory that needs name
heal: all of them trigger heals and then wait on each other to
complete the heal.

Fix:
When a name-heal is needed, but a quorum number of bricks have
the name and pending xattrs exist on the parent, it is better to
delegate the heal to SHD, which will complete it as part of the
entry-heal of the parent directory. We could do the same when a
quorum number of names is not present, but we have no known
use-case where that is a frequent occurrence, so that part is
left unchanged for now. When there is a gfid mismatch or a
missing gfid, it is important to complete the heal immediately,
so that the next rename doesn't assume everything is fine and
proceed with the rename.

BUG: 1619357
Upstream Patch: https://review.gluster.org/c/glusterfs/+/21087
Change-Id: I8b002c85dffc6eb6f2833e742684a233daefeb2c
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/155029
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 tests/afr.rc                                 |   8 ++
 tests/basic/afr/name-self-heal.t             | 112 +++++++++++++++++++++++++++
 xlators/cluster/afr/src/afr-common.c         | 100 ++++++++++++++++++------
 xlators/cluster/afr/src/afr-self-heal-name.c |  12 ++-
 4 files changed, 205 insertions(+), 27 deletions(-)
 create mode 100644 tests/basic/afr/name-self-heal.t

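Reviewer note (not part of the upstream change): the afr-common.c hunks below
rework afr_lookup_entry_heal() so that lookup only heals a name inline when
delegating to the self-heal daemon (SHD) is not safe. A minimal standalone
sketch of that decision in plain C follows; the helper name and the
pre-computed boolean inputs are hypothetical simplifications of what the real
function derives from the per-brick lookup replies and the parent's readable
subvolumes.

#include <stdbool.h>

/* Hypothetical, simplified model of the choice made in
 * afr_lookup_entry_heal() after this patch: return true when the name must
 * be healed inline (blocking the lookup), false when it is safe to leave the
 * repair to SHD's entry-heal of the parent directory. */
static bool
afr_name_heal_inline_needed (bool gfid_missing_or_mismatched,
                             bool quorum_enforced,
                             bool quorum_of_bricks_have_name,
                             bool saw_unexpected_errno,
                             bool readable_parent_missing_name)
{
        /* A missing or mismatching gfid must be fixed now, otherwise a
         * following rename may assume the entry is healthy. */
        if (gfid_missing_or_mismatched)
                return true;

        /* Without client quorum enforcement, keep the old behaviour. */
        if (!quorum_enforced)
                return true;

        /* Delegation is only taken when a quorum of bricks hold the name. */
        if (!quorum_of_bricks_have_name)
                return true;

        /* Errors other than ENOENT/ESTALE/ENOTCONN leave the name state
         * unclear, so do not delegate. */
        if (saw_unexpected_errno)
                return true;

        /* A brick that is readable for the parent but misses the entry has
         * no pending xattrs pointing at it, so SHD would not repair it. */
        if (readable_parent_missing_name)
                return true;

        /* Otherwise the parent's pending xattrs let SHD complete the name
         * as part of entry self-heal; do not block the lookup. */
        return false;
}

The second test case in tests/basic/afr/name-self-heal.t exercises the
delegation branch: with client quorum in effect, "stat $M0/b" must not
recreate b on brick0 ("Name heal shouldn't be triggered"), and only after
cluster.quorum-type is set to none is the name healed inline.
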
diff --git a/tests/afr.rc b/tests/afr.rc
index 1fd0310..a1e8a44 100644
--- a/tests/afr.rc
+++ b/tests/afr.rc
@@ -89,3 +89,11 @@ function count_index_entries()
 {
         ls $1/.glusterfs/indices/xattrop | wc -l
 }
+
+function get_quorum_type()
+{
+        local m="$1"
+        local v="$2"
+        local repl_id="$3"
+        cat $m/.meta/graphs/active/$v-replicate-$repl_id/private|grep quorum-type|awk '{print $3}'
+}
diff --git a/tests/basic/afr/name-self-heal.t b/tests/basic/afr/name-self-heal.t
new file mode 100644
index 0000000..50fc2ec
--- /dev/null
+++ b/tests/basic/afr/name-self-heal.t
@@ -0,0 +1,112 @@
+#!/bin/bash
+#Self-heal tests
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+cleanup;
+
+#Check that when quorum is not enabled name-heal happens correctly
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1}
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume start $V0
+TEST $CLI volume heal $V0 disable
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+
+TEST touch $M0/a
+TEST touch $M0/c
+TEST kill_brick $V0 $H0 $B0/brick0
+TEST touch $M0/b
+TEST rm -f $M0/a
+TEST rm -f $M0/c
+TEST touch $M0/c #gfid mismatch case
+c_gfid=$(gf_get_gfid_xattr $B0/brick1/c)
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST ! stat $M0/a
+TEST ! stat $B0/brick0/a
+TEST ! stat $B0/brick1/a
+
+TEST stat $M0/b
+TEST stat $B0/brick0/b
+TEST stat $B0/brick1/b
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/b)" == "$(gf_get_gfid_xattr $B0/brick1/b)" ]]
+
+TEST stat $M0/c
+TEST stat $B0/brick0/c
+TEST stat $B0/brick1/c
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
+
+cleanup;
+
+#Check that when quorum is enabled name-heal happens as expected
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/brick{0,1,2}
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume start $V0
+TEST $CLI volume heal $V0 disable
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+
+TEST touch $M0/a
+TEST touch $M0/c
+TEST kill_brick $V0 $H0 $B0/brick0
+TEST touch $M0/b
+TEST rm -f $M0/a
+TEST rm -f $M0/c
+TEST touch $M0/c #gfid mismatch case
+c_gfid=$(gf_get_gfid_xattr $B0/brick1/c)
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST ! stat $M0/a
+TEST ! stat $B0/brick0/a
+TEST ! stat $B0/brick1/a
+TEST ! stat $B0/brick2/a
+
+TEST stat $M0/b
+TEST ! stat $B0/brick0/b #Name heal shouldn't be triggered
+TEST stat $B0/brick1/b
+TEST stat $B0/brick2/b
+
+TEST stat $M0/c
+TEST stat $B0/brick0/c
+TEST stat $B0/brick1/c
+TEST stat $B0/brick2/c
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
+
+TEST $CLI volume set $V0 cluster.quorum-type none
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "none" get_quorum_type $M0 $V0 0
+TEST stat $M0/b
+TEST stat $B0/brick0/b #Name heal should be triggered
+TEST stat $B0/brick1/b
+TEST stat $B0/brick2/b
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/b)" == "$(gf_get_gfid_xattr $B0/brick1/b)" ]]
+TEST $CLI volume set $V0 cluster.quorum-type auto
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "auto" get_quorum_type $M0 $V0 0
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+#Missing parent xattrs cases
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+TEST $CLI volume heal $V0 disable
+#In cases where a good parent doesn't have pending xattrs and a file,
+#name-heal will be triggered
+TEST gf_rm_file_and_gfid_link $B0/brick1 c
+TEST stat $M0/c
+TEST stat $B0/brick0/c
+TEST stat $B0/brick1/c
+TEST stat $B0/brick2/c
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
+cleanup
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index e74fdec..ce2b17a 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -2302,8 +2302,6 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
          */
         for (i = 0; i < priv->child_count; i++) {
                 if (!replies[i].valid || replies[i].op_ret == -1) {
-                        if (priv->child_up[i])
-                                can_interpret = _gf_false;
                         continue;
                 }
 
@@ -2742,21 +2740,52 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
         afr_private_t *priv = NULL;
         call_frame_t *heal = NULL;
         int i = 0, first = -1;
-        gf_boolean_t need_heal = _gf_false;
+        gf_boolean_t name_state_mismatch = _gf_false;
         struct afr_reply *replies = NULL;
         int ret = 0;
+        unsigned char *par_readables = NULL;
+        unsigned char *success = NULL;
+        int32_t op_errno = 0;
+        uuid_t gfid = {0};
 
         local = frame->local;
         replies = local->replies;
         priv = this->private;
+        par_readables = alloca0(priv->child_count);
+        success = alloca0(priv->child_count);
+
+        ret = afr_inode_read_subvol_get (local->loc.parent, this, par_readables,
+                                         NULL, NULL);
+        if (ret < 0 || AFR_COUNT (par_readables, priv->child_count) == 0) {
+                /* In this case set par_readables to all 1 so that name_heal
+                 * need checks at the end of this function will flag missing
+                 * entry when name state mismatches*/
+                memset (par_readables, 1, priv->child_count);
+        }
 
         for (i = 0; i < priv->child_count; i++) {
                 if (!replies[i].valid)
                         continue;
 
+                if (replies[i].op_ret == 0) {
+                        if (uuid_is_null (gfid)) {
+                                gf_uuid_copy (gfid,
+                                              replies[i].poststat.ia_gfid);
+                        }
+                        success[i] = 1;
+                } else {
+                        if ((replies[i].op_errno != ENOTCONN) &&
+                            (replies[i].op_errno != ENOENT) &&
+                            (replies[i].op_errno != ESTALE)) {
+                                op_errno = replies[i].op_errno;
+                        }
+                }
+
+                /*gfid is missing, needs heal*/
                 if ((replies[i].op_ret == -1) &&
-                    (replies[i].op_errno == ENODATA))
-                        need_heal = _gf_true;
+                    (replies[i].op_errno == ENODATA)) {
+                        goto name_heal;
+                }
 
                 if (first == -1) {
                         first = i;
@@ -2764,30 +2793,53 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
                 }
 
                 if (replies[i].op_ret != replies[first].op_ret) {
-                        need_heal = _gf_true;
-                        break;
+                        name_state_mismatch = _gf_true;
                 }
 
-                if (gf_uuid_compare (replies[i].poststat.ia_gfid,
-                                     replies[first].poststat.ia_gfid)) {
-                        need_heal = _gf_true;
-                        break;
-                }
+                if (replies[i].op_ret == 0) {
+                        /* Rename after this lookup may succeed if we don't do
+                         * a name-heal and the destination may not have pending xattrs
+                         * to indicate which name is good and which is bad so always do
+                         * this heal*/
+                        if (gf_uuid_compare (replies[i].poststat.ia_gfid,
+                                             gfid)) {
+                                goto name_heal;
+                        }
+                }
         }
 
-        if (need_heal) {
-                heal = afr_frame_create (this, NULL);
-                if (!heal)
-                        goto metadata_heal;
-
-                ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap,
-                                    afr_refresh_selfheal_done, heal, frame);
-                if (ret) {
-                        AFR_STACK_DESTROY (heal);
-                        goto metadata_heal;
+        if (name_state_mismatch) {
+                if (!priv->quorum_count)
+                        goto name_heal;
+                if (!afr_has_quorum (success, this))
+                        goto name_heal;
+                if (op_errno)
+                        goto name_heal;
+                for (i = 0; i < priv->child_count; i++) {
+                        if (!replies[i].valid)
+                                continue;
+                        if (par_readables[i] && replies[i].op_ret < 0 &&
+                            replies[i].op_errno != ENOTCONN) {
+                                goto name_heal;
+                        }
                 }
-                return ret;
-        }
+        }
+
+        goto metadata_heal;
+
+name_heal:
+        heal = afr_frame_create (this, NULL);
+        if (!heal)
+                goto metadata_heal;
+
+        ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap,
+                            afr_refresh_selfheal_done, heal, frame);
+        if (ret) {
+                AFR_STACK_DESTROY (heal);
+                goto metadata_heal;
+        }
+        return ret;
+
 metadata_heal:
         ret = afr_lookup_metadata_heal_check (frame, this);
 
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index bcd0e60..0a5be29 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -634,20 +634,26 @@ afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this,
                         continue;
 
                 if ((replies[i].op_ret == -1) &&
-                    (replies[i].op_errno == ENODATA))
+                    (replies[i].op_errno == ENODATA)) {
                         *need_heal = _gf_true;
+                        break;
+                }
 
                 if (first_idx == -1) {
                         first_idx = i;
                         continue;
                 }
 
-                if (replies[i].op_ret != replies[first_idx].op_ret)
+                if (replies[i].op_ret != replies[first_idx].op_ret) {
                         *need_heal = _gf_true;
+                        break;
+                }
 
                 if (gf_uuid_compare (replies[i].poststat.ia_gfid,
-                                     replies[first_idx].poststat.ia_gfid))
+                                     replies[first_idx].poststat.ia_gfid)) {
                         *need_heal = _gf_true;
+                        break;
+                }
         }
 
         if (inode)
-- 
1.8.3.1