205 lines
8.3 KiB
Diff
205 lines
8.3 KiB
Diff
|
From cca418b78ec976aa69eacd56b0e6127ea7e3dd26 Mon Sep 17 00:00:00 2001
|
||
|
From: Pranith Kumar K <pkarampu@redhat.com>
|
||
|
Date: Thu, 4 Apr 2019 15:31:56 +0530
|
||
|
Subject: [PATCH 095/124] cluster/afr: Remove local from owners_list on failure
|
||
|
of lock-acquisition
|
||
|
|
||
|
Backport of https://review.gluster.org/c/glusterfs/+/22515
|
||
|
|
||
|
When eager-lock lock acquisition fails because of say network failures, the
|
||
|
local is not being removed from owners_list, this leads to accumulation of
|
||
|
waiting frames and the application will hang because the waiting frames are
|
||
|
under the assumption that another transaction is in the process of acquiring
|
||
|
lock because owner-list is not empty. Handled this case as well in this patch.
|
||
|
Added asserts to make it easier to find these problems in future.
|
||
|
|
||
|
Change-Id: I3101393265e9827755725b1f2d94a93d8709e923
|
||
|
fixes: bz#1688395
|
||
|
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
|
||
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/167859
|
||
|
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
||
|
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
||
|
---
|
||
|
tests/bugs/replicate/bug-1696599-io-hang.t | 47 ++++++++++++++++++++++++++++++
|
||
|
xlators/cluster/afr/src/afr-common.c | 8 ++---
|
||
|
xlators/cluster/afr/src/afr-lk-common.c | 1 -
|
||
|
xlators/cluster/afr/src/afr-transaction.c | 19 +++++-------
|
||
|
xlators/cluster/afr/src/afr.h | 4 +--
|
||
|
5 files changed, 61 insertions(+), 18 deletions(-)
|
||
|
create mode 100755 tests/bugs/replicate/bug-1696599-io-hang.t
|
||
|
|
||
|
diff --git a/tests/bugs/replicate/bug-1696599-io-hang.t b/tests/bugs/replicate/bug-1696599-io-hang.t
|
||
|
new file mode 100755
|
||
|
index 0000000..869cdb9
|
||
|
--- /dev/null
|
||
|
+++ b/tests/bugs/replicate/bug-1696599-io-hang.t
|
||
|
@@ -0,0 +1,47 @@
|
||
|
+#!/bin/bash
|
||
|
+
|
||
|
+. $(dirname $0)/../../include.rc
|
||
|
+. $(dirname $0)/../../volume.rc
|
||
|
+. $(dirname $0)/../../fileio.rc
|
||
|
+
|
||
|
+#Tests that local structures in afr are removed from granted/blocked list of
|
||
|
+#locks when inodelk fails on all bricks
|
||
|
+
|
||
|
+cleanup;
|
||
|
+
|
||
|
+TEST glusterd
|
||
|
+TEST pidof glusterd
|
||
|
+
|
||
|
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3}
|
||
|
+TEST $CLI volume set $V0 performance.quick-read off
|
||
|
+TEST $CLI volume set $V0 performance.write-behind off
|
||
|
+TEST $CLI volume set $V0 performance.io-cache off
|
||
|
+TEST $CLI volume set $V0 performance.stat-prefetch off
|
||
|
+TEST $CLI volume set $V0 performance.client-io-threads off
|
||
|
+TEST $CLI volume set $V0 delay-gen locks
|
||
|
+TEST $CLI volume set $V0 delay-gen.delay-duration 5000000
|
||
|
+TEST $CLI volume set $V0 delay-gen.delay-percentage 100
|
||
|
+TEST $CLI volume set $V0 delay-gen.enable finodelk
|
||
|
+
|
||
|
+TEST $CLI volume start $V0
|
||
|
+EXPECT 'Started' volinfo_field $V0 'Status'
|
||
|
+
|
||
|
+TEST $GFS -s $H0 --volfile-id $V0 $M0
|
||
|
+TEST touch $M0/file
|
||
|
+#Trigger write and stop bricks so inodelks fail on all bricks leading to
|
||
|
+#lock failure condition
|
||
|
+echo abc >> $M0/file &
|
||
|
+
|
||
|
+TEST $CLI volume stop $V0
|
||
|
+TEST $CLI volume reset $V0 delay-gen
|
||
|
+wait
|
||
|
+TEST $CLI volume start $V0
|
||
|
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0
|
||
|
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1
|
||
|
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 2
|
||
|
+#Test that only one write succeeded, this tests that delay-gen worked as
|
||
|
+#expected
|
||
|
+echo abc >> $M0/file
|
||
|
+EXPECT "abc" cat $M0/file
|
||
|
+
|
||
|
+cleanup;
|
||
|
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
|
||
|
index 45b96e3..47a5d3a 100644
|
||
|
--- a/xlators/cluster/afr/src/afr-common.c
|
||
|
+++ b/xlators/cluster/afr/src/afr-common.c
|
||
|
@@ -5763,6 +5763,10 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this)
|
||
|
afr_private_t *priv = NULL;
|
||
|
|
||
|
priv = this->private;
|
||
|
+ INIT_LIST_HEAD(&local->transaction.wait_list);
|
||
|
+ INIT_LIST_HEAD(&local->transaction.owner_list);
|
||
|
+ INIT_LIST_HEAD(&local->ta_waitq);
|
||
|
+ INIT_LIST_HEAD(&local->ta_onwireq);
|
||
|
ret = afr_internal_lock_init(&local->internal_lock, priv->child_count);
|
||
|
if (ret < 0)
|
||
|
goto out;
|
||
|
@@ -5800,10 +5804,6 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this)
|
||
|
goto out;
|
||
|
|
||
|
ret = 0;
|
||
|
- INIT_LIST_HEAD(&local->transaction.wait_list);
|
||
|
- INIT_LIST_HEAD(&local->transaction.owner_list);
|
||
|
- INIT_LIST_HEAD(&local->ta_waitq);
|
||
|
- INIT_LIST_HEAD(&local->ta_onwireq);
|
||
|
out:
|
||
|
return ret;
|
||
|
}
|
||
|
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
|
||
|
index 4091671..bc8eabe 100644
|
||
|
--- a/xlators/cluster/afr/src/afr-lk-common.c
|
||
|
+++ b/xlators/cluster/afr/src/afr-lk-common.c
|
||
|
@@ -397,7 +397,6 @@ afr_unlock_now(call_frame_t *frame, xlator_t *this)
|
||
|
int_lock->lk_call_count = call_count;
|
||
|
|
||
|
if (!call_count) {
|
||
|
- GF_ASSERT(!local->transaction.do_eager_unlock);
|
||
|
gf_msg_trace(this->name, 0, "No internal locks unlocked");
|
||
|
int_lock->lock_cbk(frame, this);
|
||
|
goto out;
|
||
|
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
|
||
|
index 229820b..15f3a7e 100644
|
||
|
--- a/xlators/cluster/afr/src/afr-transaction.c
|
||
|
+++ b/xlators/cluster/afr/src/afr-transaction.c
|
||
|
@@ -372,6 +372,8 @@ afr_transaction_done(call_frame_t *frame, xlator_t *this)
|
||
|
}
|
||
|
local->transaction.unwind(frame, this);
|
||
|
|
||
|
+ GF_ASSERT(list_empty(&local->transaction.owner_list));
|
||
|
+ GF_ASSERT(list_empty(&local->transaction.wait_list));
|
||
|
AFR_STACK_DESTROY(frame);
|
||
|
|
||
|
return 0;
|
||
|
@@ -393,7 +395,7 @@ afr_lock_fail_shared(afr_local_t *local, struct list_head *list)
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
-afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked)
|
||
|
+afr_handle_lock_acquire_failure(afr_local_t *local)
|
||
|
{
|
||
|
struct list_head shared;
|
||
|
afr_lock_t *lock = NULL;
|
||
|
@@ -414,13 +416,8 @@ afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked)
|
||
|
afr_lock_fail_shared(local, &shared);
|
||
|
local->transaction.do_eager_unlock = _gf_true;
|
||
|
out:
|
||
|
- if (locked) {
|
||
|
- local->internal_lock.lock_cbk = afr_transaction_done;
|
||
|
- afr_unlock(local->transaction.frame, local->transaction.frame->this);
|
||
|
- } else {
|
||
|
- afr_transaction_done(local->transaction.frame,
|
||
|
- local->transaction.frame->this);
|
||
|
- }
|
||
|
+ local->internal_lock.lock_cbk = afr_transaction_done;
|
||
|
+ afr_unlock(local->transaction.frame, local->transaction.frame->this);
|
||
|
}
|
||
|
|
||
|
call_frame_t *
|
||
|
@@ -619,7 +616,7 @@ afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this)
|
||
|
failure_count = AFR_COUNT(local->transaction.failed_subvols,
|
||
|
priv->child_count);
|
||
|
if (failure_count == priv->child_count) {
|
||
|
- afr_handle_lock_acquire_failure(local, _gf_true);
|
||
|
+ afr_handle_lock_acquire_failure(local);
|
||
|
return 0;
|
||
|
} else {
|
||
|
lock = &local->inode_ctx->lock[local->transaction.type];
|
||
|
@@ -2092,7 +2089,7 @@ err:
|
||
|
local->op_ret = -1;
|
||
|
local->op_errno = op_errno;
|
||
|
|
||
|
- afr_handle_lock_acquire_failure(local, _gf_true);
|
||
|
+ afr_handle_lock_acquire_failure(local);
|
||
|
|
||
|
if (xdata_req)
|
||
|
dict_unref(xdata_req);
|
||
|
@@ -2361,7 +2358,7 @@ afr_internal_lock_finish(call_frame_t *frame, xlator_t *this)
|
||
|
} else {
|
||
|
lock = &local->inode_ctx->lock[local->transaction.type];
|
||
|
if (local->internal_lock.lock_op_ret < 0) {
|
||
|
- afr_handle_lock_acquire_failure(local, _gf_false);
|
||
|
+ afr_handle_lock_acquire_failure(local);
|
||
|
} else {
|
||
|
lock->event_generation = local->event_generation;
|
||
|
afr_changelog_pre_op(frame, this);
|
||
|
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
|
||
|
index 2cc3797..e731cfa 100644
|
||
|
--- a/xlators/cluster/afr/src/afr.h
|
||
|
+++ b/xlators/cluster/afr/src/afr.h
|
||
|
@@ -1091,8 +1091,8 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd);
|
||
|
#define AFR_FRAME_INIT(frame, op_errno) \
|
||
|
({ \
|
||
|
frame->local = mem_get0(THIS->local_pool); \
|
||
|
- if (afr_local_init(frame->local, THIS->private, &op_errno)) { \
|
||
|
- afr_local_cleanup(frame->local, THIS); \
|
||
|
+ if (afr_local_init(frame->local, frame->this->private, &op_errno)) { \
|
||
|
+ afr_local_cleanup(frame->local, frame->this); \
|
||
|
mem_put(frame->local); \
|
||
|
frame->local = NULL; \
|
||
|
}; \
|
||
|
--
|
||
|
1.8.3.1
|
||
|
|