glusterfs/0488-cluster-afr-Allow-lookup-on-root-if-it-is-from-ADD_R.patch
Milind Changire 1f2f23ddef autobuild v3.12.2-33
Resolves: bz#1350745 bz#1362129 bz#1541568 bz#1597252 bz#1599220
Resolves: bz#1633177 bz#1637564 bz#1639476 bz#1639568 bz#1643370
Resolves: bz#1645480 bz#1648296 bz#1648893 bz#1651040 bz#1651460
Resolves: bz#1652466 bz#1652537 bz#1653224 bz#1653613 bz#1654103
Resolves: bz#1654161 bz#1655385 bz#1655578 bz#1656357 bz#1659439
Signed-off-by: Milind Changire <mchangir@redhat.com>
2018-12-18 11:23:13 -05:00

460 lines
19 KiB
Diff

From f398b6b9705f1b75d17d965ffcd72157d5be3daf Mon Sep 17 00:00:00 2001
From: karthik-us <ksubrahm@redhat.com>
Date: Tue, 18 Dec 2018 16:04:42 +0530
Subject: [PATCH 488/493] cluster/afr: Allow lookup on root if it is from
ADD_REPLICA_MOUNT
Problem: When trying to convert a plain distribute volume to replica-3
or arbiter type it is failing with ENOTCONN error as the lookup on
the root will fail as there is no quorum.
Fix: Allow lookup on root if it is coming from the ADD_REPLICA_MOUNT
which is used while adding bricks to a volume. It will try to set the
pending xattrs for the newly added bricks to allow the heal to happen
in the right direction and avoid data loss scenarios.
Note: This fix will solve the problem of type conversion only in the
case where the volume was mounted at least once. The conversion of
non mounted volumes will still fail since the dht selfheal tries to
set the directory layout will fail as they do that with the PID
GF_CLIENT_PID_NO_ROOT_SQUASH set in the frame->root.
Backport of: https://review.gluster.org/#/c/glusterfs/+/21791/
Change-Id: Ie31d429dfebbfb0f60610c9c5739595c54b19c46
BUG: 1645480
Signed-off-by: karthik-us <ksubrahm@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/158932
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
libglusterfs/src/common-utils.h | 3 +-
...g-1655854-support-dist-to-rep3-arb-conversion.t | 95 ++++++++++++++++++++++
xlators/cluster/afr/src/afr-common.c | 80 +++++++++++++-----
xlators/cluster/afr/src/afr-inode-write.c | 2 +-
xlators/cluster/afr/src/afr-read-txn.c | 3 +-
xlators/cluster/afr/src/afr-transaction.c | 11 ++-
xlators/cluster/afr/src/afr-transaction.h | 3 +-
xlators/cluster/afr/src/afr.c | 2 +-
xlators/cluster/afr/src/afr.h | 4 +
xlators/mgmt/glusterd/src/glusterd-utils.c | 2 +-
10 files changed, 175 insertions(+), 30 deletions(-)
create mode 100644 tests/bugs/replicate/bug-1655854-support-dist-to-rep3-arb-conversion.t
diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h
index c804ed5..50c1f9a 100644
--- a/libglusterfs/src/common-utils.h
+++ b/libglusterfs/src/common-utils.h
@@ -162,7 +162,8 @@ enum _gf_special_pid
GF_CLIENT_PID_BITD = -8,
GF_CLIENT_PID_SCRUB = -9,
GF_CLIENT_PID_TIER_DEFRAG = -10,
- GF_SERVER_PID_TRASH = -11
+ GF_SERVER_PID_TRASH = -11,
+ GF_CLIENT_PID_ADD_REPLICA_MOUNT = -12
};
enum _gf_xlator_ipc_targets {
diff --git a/tests/bugs/replicate/bug-1655854-support-dist-to-rep3-arb-conversion.t b/tests/bugs/replicate/bug-1655854-support-dist-to-rep3-arb-conversion.t
new file mode 100644
index 0000000..783016d
--- /dev/null
+++ b/tests/bugs/replicate/bug-1655854-support-dist-to-rep3-arb-conversion.t
@@ -0,0 +1,95 @@
+#!/bin/bash
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+
+# Conversion from 2x1 to 2x3
+
+TEST $CLI volume create $V0 $H0:$B0/${V0}{0,1}
+EXPECT 'Created' volinfo_field $V0 'Status';
+TEST $CLI volume start $V0
+EXPECT 'Started' volinfo_field $V0 'Status';
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+TEST mkdir $M0/dir
+TEST dd if=/dev/urandom of=$M0/dir/file bs=100K count=5
+file_md5sum=$(md5sum $M0/dir/file | awk '{print $1}')
+
+TEST $CLI volume add-brick $V0 replica 3 $H0:$B0/${V0}{2..5}
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}3
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}4
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}5
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 3
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 4
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 5
+
+# Trigger heal and wait for for it to complete
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+# Check whether the directory & file are healed to the newly added bricks
+TEST ls $B0/${V0}2/dir
+TEST ls $B0/${V0}3/dir
+TEST ls $B0/${V0}4/dir
+TEST ls $B0/${V0}5/dir
+
+TEST [ $file_md5sum == $(md5sum $B0/${V0}4/dir/file | awk '{print $1}') ]
+TEST [ $file_md5sum == $(md5sum $B0/${V0}5/dir/file | awk '{print $1}') ]
+
+
+# Conversion from 2x1 to 2x(2+1)
+
+TEST $CLI volume create $V1 $H0:$B0/${V1}{0,1}
+EXPECT 'Created' volinfo_field $V1 'Status';
+TEST $CLI volume start $V1
+EXPECT 'Started' volinfo_field $V1 'Status';
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V1 $H0 $B0/${V1}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V1 $H0 $B0/${V1}1
+
+TEST $GFS --volfile-id=$V1 --volfile-server=$H0 $M1;
+TEST mkdir $M1/dir
+TEST dd if=/dev/urandom of=$M1/dir/file bs=100K count=5
+file_md5sum=$(md5sum $M1/dir/file | awk '{print $1}')
+
+TEST $CLI volume add-brick $V1 replica 3 arbiter 1 $H0:$B0/${V1}{2..5}
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V1 $H0 $B0/${V1}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V1 $H0 $B0/${V1}3
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V1 $H0 $B0/${V1}4
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V1 $H0 $B0/${V1}5
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V1 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V1 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V1 2
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V1 3
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V1 4
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V1 5
+
+# Trigger heal and wait for for it to complete
+TEST $CLI volume heal $V1
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V1
+
+# Check whether the directory & file are healed to the newly added bricks
+TEST ls $B0/${V1}2/dir
+TEST ls $B0/${V1}3/dir
+TEST ls $B0/${V1}4/dir
+TEST ls $B0/${V1}5/dir
+
+EXPECT "0" stat -c %s $B0/${V1}5/dir/file
+TEST [ $file_md5sum == $(md5sum $B0/${V1}4/dir/file | awk '{print $1}') ]
+
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 231de9d..322dfbe 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -2265,7 +2265,7 @@ afr_attempt_readsubvol_set (call_frame_t *frame, xlator_t *this,
} else if (!priv->quorum_count) {
*read_subvol = afr_first_up_child (frame, this);
} else if (priv->quorum_count &&
- afr_has_quorum (data_readable, this)) {
+ afr_has_quorum (data_readable, this, NULL)) {
/* read_subvol is guaranteed to be valid if we hit this path. */
*read_subvol = afr_first_up_child (frame, this);
} else {
@@ -2405,7 +2405,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
read_subvol = -1;
memset (readable, 0, sizeof (*readable) * priv->child_count);
if (can_interpret) {
- if (!afr_has_quorum (success_replies, this))
+ if (!afr_has_quorum (success_replies, this, NULL))
goto cant_interpret;
/* It is safe to call afr_replies_interpret() because we have
a response from all the UP subvolumes and all of them resolved
@@ -2887,7 +2887,7 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
if (name_state_mismatch) {
if (!priv->quorum_count)
goto name_heal;
- if (!afr_has_quorum (success, this))
+ if (!afr_has_quorum (success, this, NULL))
goto name_heal;
if (op_errno)
goto name_heal;
@@ -2979,7 +2979,6 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
int i = -1;
- int op_errno = 0;
int read_subvol = -1;
unsigned char *data_readable = NULL;
unsigned char *success_replies = NULL;
@@ -2998,15 +2997,13 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
}
}
- op_errno = afr_final_errno (frame->local, this->private);
-
if (local->op_ret < 0) {
- AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL,
- NULL, NULL);
- return;
- }
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno (frame->local, this->private);
+ goto error;
+ }
- if (!afr_has_quorum (success_replies, this))
+ if (!afr_has_quorum (success_replies, this, frame))
goto unwind;
afr_replies_interpret (frame, this, local->inode, NULL);
@@ -3017,11 +3014,8 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
unwind:
afr_attempt_readsubvol_set (frame, this, success_replies, data_readable,
&read_subvol);
- if (read_subvol == -1) {
- AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- NULL, NULL, NULL, NULL);
- return;
- }
+ if (read_subvol == -1)
+ goto error;
if (AFR_IS_ARBITER_BRICK (priv, read_subvol) && local->op_ret == 0) {
local->op_ret = -1;
@@ -3034,6 +3028,11 @@ unwind:
local->inode, &local->replies[read_subvol].poststat,
local->replies[read_subvol].xdata,
&local->replies[read_subvol].postparent);
+ return;
+
+error:
+ AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, NULL,
+ NULL, NULL, NULL);
}
@@ -3977,7 +3976,8 @@ afr_fop_lock_done (call_frame_t *frame, xlator_t *this)
if (afr_is_conflicting_lock_present (local->op_ret, local->op_errno)) {
afr_unlock_locks_and_proceed (frame, this, lock_count);
- } else if (priv->quorum_count && !afr_has_quorum (success, this)) {
+ } else if (priv->quorum_count &&
+ !afr_has_quorum (success, this, NULL)) {
local->fop_lock_state = AFR_FOP_LOCK_QUORUM_FAILED;
local->op_ret = -1;
local->op_errno = afr_final_errno (local, priv);
@@ -4485,7 +4485,7 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
&local->cont.lk.user_flock,
local->xdata_req);
} else if (priv->quorum_count &&
- !afr_has_quorum (local->cont.lk.locked_nodes, this)) {
+ !afr_has_quorum (local->cont.lk.locked_nodes, this, NULL)) {
local->op_ret = -1;
local->op_errno = afr_final_errno (local, priv);
@@ -5199,7 +5199,7 @@ afr_notify (xlator_t *this, int32_t event,
}
had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up,
- this);
+ this, NULL);
if (priv->halo_enabled) {
halo_max_latency_msec = afr_get_halo_latency (this);
@@ -5328,7 +5328,7 @@ afr_notify (xlator_t *this, int32_t event,
UNLOCK (&priv->lock);
if (priv->quorum_count) {
- has_quorum = afr_has_quorum (priv->child_up, this);
+ has_quorum = afr_has_quorum (priv->child_up, this, NULL);
if (!had_quorum && has_quorum) {
gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_QUORUM_MET,
"Client-quorum is met");
@@ -6543,3 +6543,43 @@ afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode)
}
return ret;
}
+
+gf_boolean_t
+afr_is_add_replica_mount_lookup_on_root (call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT)
+ return _gf_false;
+
+ if (local->op != GF_FOP_LOOKUP)
+ /* TODO:If the replica count is being increased on a plain
+ * distribute volume that was never mounted, we need to allow
+ * setxattr on '/' with GF_CLIENT_PID_NO_ROOT_SQUASH to
+ * accomodate for DHT layout setting */
+ return _gf_false;
+
+ if (local->inode == NULL)
+ return _gf_false;
+
+ if (!__is_root_gfid (local->inode->gfid))
+ return _gf_false;
+
+ return _gf_true;
+}
+
+gf_boolean_t
+afr_lookup_has_quorum (call_frame_t *frame, xlator_t *this,
+ unsigned char *subvols)
+{
+ afr_private_t *priv = this->private;
+
+ if (frame && afr_is_add_replica_mount_lookup_on_root (frame)) {
+ if (AFR_COUNT (subvols, priv->child_count) > 0)
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 8b1dcfd..ea4755a 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -1539,7 +1539,7 @@ afr_handle_empty_brick (xlator_t *this, call_frame_t *frame, loc_t *loc,
if (ret && ab_ret)
goto out;
- if (frame->root->pid != GF_CLIENT_PID_SELF_HEALD) {
+ if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) {
gf_msg (this->name, GF_LOG_ERROR, EPERM,
afr_get_msg_id (op_type),
"'%s' is an internal extended attribute.",
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index f6c491b..ec322ae 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -193,7 +193,8 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
local->inode = inode_ref (inode);
local->is_read_txn = _gf_true;
- if (priv->quorum_count && !afr_has_quorum (local->child_up, this)) {
+ if (priv->quorum_count &&
+ !afr_has_quorum (local->child_up, this, NULL)) {
local->op_ret = -1;
local->op_errno = ENOTCONN;
read_subvol = -1;
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 0a67a83..0e58e02 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -160,7 +160,7 @@ afr_changelog_has_quorum (afr_local_t *local, xlator_t *this)
}
}
- if (afr_has_quorum (success_children, this)) {
+ if (afr_has_quorum (success_children, this, NULL)) {
return _gf_true;
}
@@ -690,7 +690,7 @@ afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this)
}
gf_boolean_t
-afr_has_quorum (unsigned char *subvols, xlator_t *this)
+afr_has_quorum (unsigned char *subvols, xlator_t *this, call_frame_t *frame)
{
unsigned int quorum_count = 0;
afr_private_t *priv = NULL;
@@ -699,6 +699,9 @@ afr_has_quorum (unsigned char *subvols, xlator_t *this)
priv = this->private;
up_children_count = AFR_COUNT (subvols, priv->child_count);
+ if (afr_lookup_has_quorum (frame, this, subvols))
+ return _gf_true;
+
if (priv->quorum_count == AFR_QUORUM_AUTO) {
/*
* Special case for auto-quorum with an even number of nodes.
@@ -753,7 +756,7 @@ afr_has_fop_quorum (call_frame_t *frame)
locked_nodes = afr_locked_nodes_get (local->transaction.type,
&local->internal_lock);
- return afr_has_quorum (locked_nodes, this);
+ return afr_has_quorum (locked_nodes, this, NULL);
}
static gf_boolean_t
@@ -771,7 +774,7 @@ afr_has_fop_cbk_quorum (call_frame_t *frame)
success[i] = 1;
}
- return afr_has_quorum (success, this);
+ return afr_has_quorum (success, this, NULL);
}
gf_boolean_t
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index a27e9a3..1c42c66 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -38,7 +38,8 @@ int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol);
call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame);
-gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this);
+gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this,
+ call_frame_t *frame);
gf_boolean_t afr_needs_changelog_update (afr_local_t *local);
void afr_zero_fill_stat (afr_local_t *local);
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 1b738c0..d5347e9 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -244,7 +244,7 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options,
uint32, out);
fix_quorum_options (this, priv, qtype, options);
- if (priv->quorum_count && !afr_has_quorum (priv->child_up, this))
+ if (priv->quorum_count && !afr_has_quorum (priv->child_up, this, NULL))
gf_msg (this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
"Client-quorum is not met");
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 7010e9b..febc509 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1227,4 +1227,8 @@ afr_is_symmetric_error (call_frame_t *frame, xlator_t *this);
gf_boolean_t
afr_is_pending_set (xlator_t *this, dict_t *xdata, int type);
+
+gf_boolean_t
+afr_lookup_has_quorum (call_frame_t *frame, xlator_t *this,
+ unsigned char *subvols);
#endif /* __AFR_H__ */
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 2290343..6468ecb 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -13788,7 +13788,7 @@ glusterd_handle_replicate_brick_ops (glusterd_volinfo_t *volinfo,
goto out;
}
- ret = gf_asprintf (&pid, "%d", GF_CLIENT_PID_SELF_HEALD);
+ ret = gf_asprintf (&pid, "%d", GF_CLIENT_PID_ADD_REPLICA_MOUNT);
if (ret < 0)
goto out;
--
1.8.3.1