40397910b3
Resolves: bz#1631372 bz#1636902 bz#1637084 Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
264 lines
11 KiB
Diff
264 lines
11 KiB
Diff
From fad234b5a62df48b7abc726549f2abb6b0af7c04 Mon Sep 17 00:00:00 2001
|
|
From: Mohit Agrawal <moagrawa@redhat.com>
|
|
Date: Tue, 16 Oct 2018 07:50:47 +0530
|
|
Subject: [PATCH 402/404] core: glusterfsd keeping fd open in index xlator
|
|
|
|
Problem: At the time of processing GF_EVENT_PARENT_DOWN
|
|
at brick xlator, it forwards the event to next xlator
|
|
only while xlator ensures no stub is in progress.
|
|
At io-thread xlator it decreases stub_cnt before the process
|
|
a stub and notify EVENT to next xlator
|
|
|
|
Solution: Introduce a new counter to save stub_cnt and decrease
|
|
the counter after process the stub completely at io-thread
|
|
xlator.
|
|
To avoid brick crash at the time of call xlator_mem_cleanup
|
|
move only brick xlator if detach brick name has found in
|
|
the graph
|
|
|
|
Note: Thanks to pranith for sharing a simple reproducer to
|
|
reproduce the same
|
|
|
|
> fixes bz#1637934
|
|
> Change-Id: I1a694a001f7a5417e8771e3adf92c518969b6baa
|
|
> (Cherry pick from commit 7bf95631b52bd05b06122180f8bd4aa62c70b1a9)
|
|
> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/21379/)
|
|
|
|
Change-Id: I54b8ebb19819f9bbcbdd1448474ab084c0fd2eb6
|
|
BUG: 1631372
|
|
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
|
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/152908
|
|
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
|
---
|
|
glusterfsd/src/glusterfsd-mgmt.c | 15 +----
|
|
tests/bugs/glusterd/brick-mux-fd-cleanup.t | 78 +++++++++++++++++++++++++
|
|
xlators/performance/io-threads/src/io-threads.c | 23 ++++----
|
|
xlators/performance/io-threads/src/io-threads.h | 3 +-
|
|
4 files changed, 94 insertions(+), 25 deletions(-)
|
|
create mode 100644 tests/bugs/glusterd/brick-mux-fd-cleanup.t
|
|
|
|
diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c
|
|
index cbd436a..e3fceeb 100644
|
|
--- a/glusterfsd/src/glusterfsd-mgmt.c
|
|
+++ b/glusterfsd/src/glusterfsd-mgmt.c
|
|
@@ -270,6 +270,7 @@ xlator_mem_cleanup (xlator_t *this) {
|
|
top = glusterfsd_ctx->active->first;
|
|
LOCK (&ctx->volfile_lock);
|
|
/* TODO here we have leak for xlator node in a graph */
|
|
+ /* Need to move only top xlator from a graph */
|
|
for (trav_p = &top->children; *trav_p; trav_p = &(*trav_p)->next) {
|
|
victim = (*trav_p)->xlator;
|
|
if (victim->call_cleanup && !strcmp (victim->name, this->name)) {
|
|
@@ -277,20 +278,6 @@ xlator_mem_cleanup (xlator_t *this) {
|
|
break;
|
|
}
|
|
}
|
|
- /* TODO Sometime brick xlator is not moved from graph so followed below
|
|
- approach to move brick xlator from a graph, will move specific brick
|
|
- xlator from graph only while inode table and mem_acct are cleaned up
|
|
- */
|
|
- trav_p = &top->children;
|
|
- while (*trav_p) {
|
|
- victim = (*trav_p)->xlator;
|
|
- if (victim->call_cleanup && !victim->itable && !victim->mem_acct) {
|
|
- (*trav_p) = (*trav_p)->next;
|
|
- } else {
|
|
- trav_p = &(*trav_p)->next;
|
|
- }
|
|
- }
|
|
- UNLOCK (&ctx->volfile_lock);
|
|
}
|
|
}
|
|
|
|
diff --git a/tests/bugs/glusterd/brick-mux-fd-cleanup.t b/tests/bugs/glusterd/brick-mux-fd-cleanup.t
|
|
new file mode 100644
|
|
index 0000000..de11c17
|
|
--- /dev/null
|
|
+++ b/tests/bugs/glusterd/brick-mux-fd-cleanup.t
|
|
@@ -0,0 +1,78 @@
|
|
+#!/bin/bash
|
|
+
|
|
+. $(dirname $0)/../../include.rc
|
|
+. $(dirname $0)/../../volume.rc
|
|
+
|
|
+#This .t tests that the fds from client are closed on brick when gluster volume
|
|
+#stop is executed in brick-mux setup.
|
|
+
|
|
+cleanup;
|
|
+TEST glusterd
|
|
+TEST pidof glusterd
|
|
+
|
|
+function keep_fd_open {
|
|
+#This function has to be run as background job because opening the fd in
|
|
+#foreground and running commands is leading to flush calls on these fds
|
|
+#which is making it very difficult to create the race where fds will be left
|
|
+#open even after the brick dies.
|
|
+ exec 5>$M1/a
|
|
+ exec 6>$M1/b
|
|
+ while [ -f $M0/a ]; do sleep 1; done
|
|
+}
|
|
+
|
|
+function count_open_files {
|
|
+ local brick_pid="$1"
|
|
+ local pattern="$2"
|
|
+ ls -l /proc/$brick_pid/fd | grep -i "$pattern" | wc -l
|
|
+}
|
|
+
|
|
+TEST $CLI volume set all cluster.brick-multiplex on
|
|
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
|
|
+TEST $CLI volume create $V1 replica 2 $H0:$B0/${V1}{2,3}
|
|
+#Have same configuration on both bricks so that they are multiplexed
|
|
+#Delay flush fop for a second
|
|
+TEST $CLI volume heal $V0 disable
|
|
+TEST $CLI volume heal $V1 disable
|
|
+TEST $CLI volume set $V0 delay-gen posix
|
|
+TEST $CLI volume set $V0 delay-gen.enable flush
|
|
+TEST $CLI volume set $V0 delay-gen.delay-percentage 100
|
|
+TEST $CLI volume set $V0 delay-gen.delay-duration 1000000
|
|
+TEST $CLI volume set $V1 delay-gen posix
|
|
+TEST $CLI volume set $V1 delay-gen.enable flush
|
|
+TEST $CLI volume set $V1 delay-gen.delay-percentage 100
|
|
+TEST $CLI volume set $V1 delay-gen.delay-duration 1000000
|
|
+
|
|
+TEST $CLI volume start $V0
|
|
+TEST $CLI volume start $V1
|
|
+
|
|
+TEST $GFS -s $H0 --volfile-id=$V0 --direct-io-mode=enable $M0
|
|
+TEST $GFS -s $H0 --volfile-id=$V1 --direct-io-mode=enable $M1
|
|
+
|
|
+TEST touch $M0/a
|
|
+keep_fd_open &
|
|
+TEST $CLI volume profile $V1 start
|
|
+brick_pid=$(get_brick_pid $V1 $H0 $B0/${V1}2)
|
|
+TEST count_open_files $brick_pid "$B0/${V1}2/a"
|
|
+TEST count_open_files $brick_pid "$B0/${V1}2/b"
|
|
+TEST count_open_files $brick_pid "$B0/${V1}3/a"
|
|
+TEST count_open_files $brick_pid "$B0/${V1}3/b"
|
|
+
|
|
+#If any other flush fops are introduced into the system other than the one at
|
|
+#cleanup it interferes with the race, so test for it
|
|
+EXPECT "^0$" echo "$($CLI volume profile $V1 info incremental | grep -i flush | wc -l)"
|
|
+#Stop the volume
|
|
+TEST $CLI volume stop $V1
|
|
+
|
|
+#Wait for cleanup resources or volume V1
|
|
+EXPECT_WITHIN $GRAPH_SWITCH_TIMEOUT "^0$" count_open_files $brick_pid "$B0/${V1}2/a"
|
|
+EXPECT_WITHIN $GRAPH_SWITCH_TIMEOUT "^0$" count_open_files $brick_pid "$B0/${V1}2/b"
|
|
+EXPECT_WITHIN $GRAPH_SWITCH_TIMEOUT "^0$" count_open_files $brick_pid "$B0/${V1}3/a"
|
|
+EXPECT_WITHIN $GRAPH_SWITCH_TIMEOUT "^0$" count_open_files $brick_pid "$B0/${V1}3/b"
|
|
+
|
|
+TEST rm -f $M0/a #Exit keep_fd_open()
|
|
+wait
|
|
+
|
|
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
|
|
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M1
|
|
+
|
|
+cleanup
|
|
diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c
|
|
index 41d48ab..2944c7d 100644
|
|
--- a/xlators/performance/io-threads/src/io-threads.c
|
|
+++ b/xlators/performance/io-threads/src/io-threads.c
|
|
@@ -120,7 +120,7 @@ __iot_dequeue (iot_conf_t *conf, int *pri)
|
|
if (!stub)
|
|
return NULL;
|
|
|
|
- GF_ATOMIC_DEC(conf->queue_size);
|
|
+ conf->queue_size--;
|
|
conf->queue_sizes[*pri]--;
|
|
|
|
return stub;
|
|
@@ -153,7 +153,8 @@ __iot_enqueue (iot_conf_t *conf, call_stub_t *stub, int pri)
|
|
}
|
|
list_add_tail (&stub->list, &ctx->reqs);
|
|
|
|
- GF_ATOMIC_INC(conf->queue_size);
|
|
+ conf->queue_size++;
|
|
+ GF_ATOMIC_INC(conf->stub_cnt);
|
|
conf->queue_sizes[pri]++;
|
|
}
|
|
|
|
@@ -182,7 +183,7 @@ iot_worker (void *data)
|
|
conf->ac_iot_count[pri]--;
|
|
pri = -1;
|
|
}
|
|
- while (GF_ATOMIC_GET(conf->queue_size) == 0) {
|
|
+ while (conf->queue_size == 0) {
|
|
if (conf->down) {
|
|
bye = _gf_true;/*Avoid sleep*/
|
|
break;
|
|
@@ -220,8 +221,10 @@ iot_worker (void *data)
|
|
}
|
|
pthread_mutex_unlock (&conf->mutex);
|
|
|
|
- if (stub) /* guard against spurious wakeups */
|
|
+ if (stub) { /* guard against spurious wakeups */
|
|
call_resume (stub);
|
|
+ GF_ATOMIC_DEC(conf->stub_cnt);
|
|
+ }
|
|
stub = NULL;
|
|
|
|
if (bye)
|
|
@@ -816,7 +819,7 @@ __iot_workers_scale (iot_conf_t *conf)
|
|
gf_msg_debug (conf->this->name, 0,
|
|
"scaled threads to %d (queue_size=%d/%d)",
|
|
conf->curr_count,
|
|
- GF_ATOMIC_GET(conf->queue_size), scale);
|
|
+ conf->queue_size, scale);
|
|
} else {
|
|
break;
|
|
}
|
|
@@ -1030,7 +1033,7 @@ init (xlator_t *this)
|
|
bool, out);
|
|
|
|
conf->this = this;
|
|
- GF_ATOMIC_INIT(conf->queue_size, 0);
|
|
+ GF_ATOMIC_INIT(conf->stub_cnt, 0);
|
|
|
|
for (i = 0; i < IOT_PRI_MAX; i++) {
|
|
INIT_LIST_HEAD (&conf->clients[i]);
|
|
@@ -1075,7 +1078,7 @@ notify (xlator_t *this, int32_t event, void *data, ...)
|
|
{
|
|
iot_conf_t *conf = this->private;
|
|
xlator_t *victim = data;
|
|
- uint64_t queue_size = 0;
|
|
+ uint64_t stub_cnt = 0;
|
|
struct timespec sleep_till = {0, };
|
|
|
|
if (GF_EVENT_PARENT_DOWN == event) {
|
|
@@ -1083,14 +1086,14 @@ notify (xlator_t *this, int32_t event, void *data, ...)
|
|
clock_gettime(CLOCK_REALTIME, &sleep_till);
|
|
sleep_till.tv_sec += 1;
|
|
/* Wait for draining stub from queue before notify PARENT_DOWN */
|
|
- queue_size = GF_ATOMIC_GET(conf->queue_size);
|
|
+ stub_cnt = GF_ATOMIC_GET(conf->stub_cnt);
|
|
|
|
pthread_mutex_lock(&conf->mutex);
|
|
{
|
|
- while (queue_size) {
|
|
+ while (stub_cnt) {
|
|
(void)pthread_cond_timedwait(&conf->cond, &conf->mutex,
|
|
&sleep_till);
|
|
- queue_size = GF_ATOMIC_GET(conf->queue_size);
|
|
+ stub_cnt = GF_ATOMIC_GET(conf->stub_cnt);
|
|
}
|
|
}
|
|
pthread_mutex_unlock(&conf->mutex);
|
|
diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h
|
|
index 7a6973c..57a136e 100644
|
|
--- a/xlators/performance/io-threads/src/io-threads.h
|
|
+++ b/xlators/performance/io-threads/src/io-threads.h
|
|
@@ -75,7 +75,8 @@ struct iot_conf {
|
|
int32_t ac_iot_limit[IOT_PRI_MAX];
|
|
int32_t ac_iot_count[IOT_PRI_MAX];
|
|
int queue_sizes[IOT_PRI_MAX];
|
|
- gf_atomic_t queue_size;
|
|
+ int32_t queue_size;
|
|
+ gf_atomic_t stub_cnt;
|
|
pthread_attr_t w_attr;
|
|
gf_boolean_t least_priority; /*Enable/Disable least-priority */
|
|
|
|
--
|
|
1.8.3.1
|
|
|