772c9f37aa
Resolves: bz#1491785 bz#1518710 bz#1523599 bz#1528733 bz#1550474 Resolves: bz#1550982 bz#1551186 bz#1552360 bz#1552414 bz#1552425 Resolves: bz#1554255 bz#1554905 bz#1555261 bz#1556895 bz#1557297 Resolves: bz#1559084 bz#1559788 Signed-off-by: Milind Changire <mchangir@redhat.com>
384 lines
15 KiB
Diff
384 lines
15 KiB
Diff
From 09698d53b91786c990a0f7bc067e5c13551b0b12 Mon Sep 17 00:00:00 2001
|
|
From: Xavi Hernandez <jahernan@redhat.com>
|
|
Date: Wed, 21 Feb 2018 17:47:37 +0100
|
|
Subject: [PATCH 199/201] cluster/ec: avoid delays in self-heal
|
|
|
|
Self-heal creates a thread per brick to sweep the index looking for
|
|
files that need to be healed. These threads are started before the
|
|
volume comes online, so nothing is done but waiting for the next
|
|
sweep. This happens once per minute.
|
|
|
|
When a replace brick command is executed, the new graph is loaded and
|
|
all index sweeper threads started. When all bricks have reported, a
|
|
getxattr request is sent to the root directory of the volume. This
|
|
causes a heal on it (because the new brick doesn't have good data),
|
|
and marks its contents as pending to be healed. This is done by the
|
|
index sweeper thread on the next round, one minute later.
|
|
|
|
This patch solves this problem by waking all index sweeper threads
|
|
after a successful check on the root directory.
|
|
|
|
Additionally, the index sweep thread scans the index directory
|
|
sequentially, but it might happen that after healing a directory entry
|
|
more index entries are created but skipped by the current directory
|
|
scan. This causes the remaining entries to be processed on the next
|
|
round, one minute later. The same can happen in the next round, so
|
|
the heal is running in bursts and taking a lot to finish, specially
|
|
on volumes with many directory levels.
|
|
|
|
This patch solves this problem by immediately restarting the index
|
|
sweep if a directory has been healed.
|
|
|
|
> Upstream patch: https://review.gluster.org/19718
|
|
> master patch: https://review.gluster.org/#/c/19609/
|
|
|
|
Change-Id: I58d9ab6ef17b30f704dc322e1d3d53b904e5f30e
|
|
BUG: 1555261
|
|
Signed-off-by: Xavi Hernandez <jahernan@redhat.com>
|
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/133570
|
|
Reviewed-by: Ashish Pandey <aspandey@redhat.com>
|
|
Tested-by: Ashish Pandey <aspandey@redhat.com>
|
|
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
---
|
|
tests/bugs/ec/bug-1547662.t | 41 ++++++++++++++++
|
|
xlators/cluster/ec/src/ec-heal.c | 9 ++++
|
|
xlators/cluster/ec/src/ec-heald.c | 27 +++++++---
|
|
xlators/cluster/ec/src/ec-heald.h | 4 +-
|
|
xlators/cluster/ec/src/ec.c | 101 ++++++++++++++++++++++----------------
|
|
5 files changed, 134 insertions(+), 48 deletions(-)
|
|
create mode 100644 tests/bugs/ec/bug-1547662.t
|
|
|
|
diff --git a/tests/bugs/ec/bug-1547662.t b/tests/bugs/ec/bug-1547662.t
|
|
new file mode 100644
|
|
index 0000000..5748218
|
|
--- /dev/null
|
|
+++ b/tests/bugs/ec/bug-1547662.t
|
|
@@ -0,0 +1,41 @@
|
|
+#!/bin/bash
|
|
+
|
|
+. $(dirname $0)/../../include.rc
|
|
+. $(dirname $0)/../../volume.rc
|
|
+
|
|
+# Immediately after replace-brick, trusted.ec.version will be absent, so if it
|
|
+# is present we can assume that heal was started on root
|
|
+function root_heal_attempted {
|
|
+ if [ -z $(get_hex_xattr trusted.ec.version $1) ]; then
|
|
+ echo "N"
|
|
+ else
|
|
+ echo "Y"
|
|
+ fi
|
|
+}
|
|
+
|
|
+cleanup
|
|
+
|
|
+TEST glusterd
|
|
+TEST pidof glusterd
|
|
+TEST ${CLI} volume create ${V0} disperse 6 redundancy 2 ${H0}:${B0}/${V0}{0..5}
|
|
+TEST ${CLI} volume start ${V0}
|
|
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
|
|
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count ${V0} 0
|
|
+
|
|
+TEST mkdir ${M0}/base
|
|
+TEST mkdir ${M0}/base/dir.{1,2}
|
|
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}
|
|
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}
|
|
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
|
|
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
|
|
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
|
|
+
|
|
+TEST ${CLI} volume replace-brick ${V0} ${H0}:${B0}/${V0}5 ${H0}:${B0}/${V0}6 commit force
|
|
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count ${V0} 0
|
|
+EXPECT_WITHIN ${PROCESS_UP_TIMEOUT} "Y" glustershd_up_status
|
|
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count_shd ${V0} 0
|
|
+EXPECT_WITHIN ${HEAL_TIMEOUT} "Y" root_heal_attempted ${B0}/${V0}6
|
|
+EXPECT_WITHIN ${HEAL_TIMEOUT} "^0$" get_pending_heal_count ${V0}
|
|
+EXPECT "^127$" echo $(find ${B0}/${V0}6/base -type d | wc -l)
|
|
+
|
|
+cleanup;
|
|
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
|
|
index b8518d6..8e02986 100644
|
|
--- a/xlators/cluster/ec/src/ec-heal.c
|
|
+++ b/xlators/cluster/ec/src/ec-heal.c
|
|
@@ -25,6 +25,7 @@
|
|
#include "ec-combine.h"
|
|
#include "ec-method.h"
|
|
#include "ec-fops.h"
|
|
+#include "ec-heald.h"
|
|
|
|
#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr; })
|
|
#define EC_COUNT(array, max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res; })
|
|
@@ -2752,6 +2753,14 @@ ec_replace_heal (ec_t *ec, inode_t *inode)
|
|
gf_msg_debug (ec->xl->name, 0,
|
|
"Heal failed for replace brick ret = %d", ret);
|
|
|
|
+ /* Once the root inode has been checked, it might have triggered a
|
|
+ * self-heal on it after a replace brick command or for some other
|
|
+ * reason. It can also happen that the volume already had damaged
|
|
+ * files in the index, even if the heal on the root directory failed.
|
|
+ * In both cases we need to wake all index healers to continue
|
|
+ * healing remaining entries that are marked as dirty. */
|
|
+ ec_shd_index_healer_wake(ec);
|
|
+
|
|
loc_wipe (&loc);
|
|
return ret;
|
|
}
|
|
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
|
|
index b4fa6f8..a703379 100644
|
|
--- a/xlators/cluster/ec/src/ec-heald.c
|
|
+++ b/xlators/cluster/ec/src/ec-heald.c
|
|
@@ -184,8 +184,19 @@ ec_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name)
|
|
int
|
|
ec_shd_selfheal (struct subvol_healer *healer, int child, loc_t *loc)
|
|
{
|
|
- return syncop_getxattr (healer->this, loc, NULL, EC_XATTR_HEAL, NULL,
|
|
- NULL);
|
|
+ int32_t ret;
|
|
+
|
|
+ ret = syncop_getxattr (healer->this, loc, NULL, EC_XATTR_HEAL, NULL,
|
|
+ NULL);
|
|
+ if ((ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
|
|
+ /* If we have just healed a directory, it's possible that
|
|
+ * other index entries have appeared to be healed. We put a
|
|
+ * mark so that we can check it later and restart a scan
|
|
+ * without delay. */
|
|
+ healer->rerun = _gf_true;
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
}
|
|
|
|
|
|
@@ -472,11 +483,15 @@ ec_shd_index_healer_spawn (xlator_t *this, int subvol)
|
|
}
|
|
|
|
void
|
|
-ec_selfheal_childup (ec_t *ec, int child)
|
|
+ec_shd_index_healer_wake(ec_t *ec)
|
|
{
|
|
- if (!ec->shd.iamshd)
|
|
- return;
|
|
- ec_shd_index_healer_spawn (ec->xl, child);
|
|
+ int32_t i;
|
|
+
|
|
+ for (i = 0; i < ec->nodes; i++) {
|
|
+ if (((ec->xl_up >> i) & 1) != 0) {
|
|
+ ec_shd_index_healer_spawn(ec->xl, i);
|
|
+ }
|
|
+ }
|
|
}
|
|
|
|
int
|
|
diff --git a/xlators/cluster/ec/src/ec-heald.h b/xlators/cluster/ec/src/ec-heald.h
|
|
index 4ae02e2..2a84881 100644
|
|
--- a/xlators/cluster/ec/src/ec-heald.h
|
|
+++ b/xlators/cluster/ec/src/ec-heald.h
|
|
@@ -20,6 +20,8 @@ ec_xl_op (xlator_t *this, dict_t *input, dict_t *output);
|
|
|
|
int
|
|
ec_selfheal_daemon_init (xlator_t *this);
|
|
-void ec_selfheal_childup (ec_t *ec, int child);
|
|
+
|
|
+void
|
|
+ec_shd_index_healer_wake(ec_t *ec);
|
|
|
|
#endif /* __EC_HEALD_H__ */
|
|
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
|
|
index bfdca64..956b45b 100644
|
|
--- a/xlators/cluster/ec/src/ec.c
|
|
+++ b/xlators/cluster/ec/src/ec.c
|
|
@@ -322,7 +322,7 @@ ec_get_event_from_state (ec_t *ec)
|
|
/* If ec is up but some subvolumes are yet to notify, give
|
|
* grace time for other subvols to notify to prevent start of
|
|
* I/O which may result in self-heals */
|
|
- if (ec->timer && ec->xl_notify_count < ec->nodes)
|
|
+ if (ec->xl_notify_count < ec->nodes)
|
|
return GF_EVENT_MAXVAL;
|
|
|
|
return GF_EVENT_CHILD_UP;
|
|
@@ -344,8 +344,8 @@ ec_up (xlator_t *this, ec_t *ec)
|
|
}
|
|
|
|
ec->up = 1;
|
|
- gf_msg (this->name, GF_LOG_INFO, 0,
|
|
- EC_MSG_EC_UP, "Going UP");
|
|
+ gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, "Going UP");
|
|
+
|
|
gf_event (EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name);
|
|
}
|
|
|
|
@@ -358,8 +358,8 @@ ec_down (xlator_t *this, ec_t *ec)
|
|
}
|
|
|
|
ec->up = 0;
|
|
- gf_msg (this->name, GF_LOG_INFO, 0,
|
|
- EC_MSG_EC_DOWN, "Going DOWN");
|
|
+ gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, "Going DOWN");
|
|
+
|
|
gf_event (EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name);
|
|
}
|
|
|
|
@@ -383,31 +383,38 @@ ec_notify_cbk (void *data)
|
|
gf_timer_call_cancel (ec->xl->ctx, ec->timer);
|
|
ec->timer = NULL;
|
|
|
|
+ /* The timeout has expired, so any subvolume that has not
|
|
+ * already reported its state, will be considered to be down.
|
|
+ * We mark as if all bricks had reported. */
|
|
+ ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
|
|
+ ec->xl_notify_count = ec->nodes;
|
|
+
|
|
+ /* Since we have marked all subvolumes as notified, it's
|
|
+ * guaranteed that ec_get_event_from_state() will return
|
|
+ * CHILD_UP or CHILD_DOWN, but not MAXVAL. */
|
|
event = ec_get_event_from_state (ec);
|
|
- /* If event is still MAXVAL then enough subvolumes didn't
|
|
- * notify, treat it as CHILD_DOWN. */
|
|
- if (event == GF_EVENT_MAXVAL) {
|
|
- event = GF_EVENT_CHILD_DOWN;
|
|
- ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
|
|
- ec->xl_notify_count = ec->nodes;
|
|
- } else if (event == GF_EVENT_CHILD_UP) {
|
|
- /* Rest of the bricks are still not coming up,
|
|
- * notify that ec is up. Files/directories will be
|
|
- * healed as in when they come up. */
|
|
+ if (event == GF_EVENT_CHILD_UP) {
|
|
+ /* We are ready to bring the volume up. If there are
|
|
+ * still bricks DOWN, they will be healed when they
|
|
+ * come up. */
|
|
ec_up (ec->xl, ec);
|
|
}
|
|
|
|
- /* CHILD_DOWN should not come here as no grace period is given
|
|
- * for notifying CHILD_DOWN. */
|
|
-
|
|
propagate = _gf_true;
|
|
}
|
|
unlock:
|
|
UNLOCK(&ec->lock);
|
|
|
|
if (propagate) {
|
|
+ if ((event == GF_EVENT_CHILD_UP) && ec->shd.iamshd) {
|
|
+ /* We have just brought the volume UP, so we trigger
|
|
+ * a self-heal check on the root directory. */
|
|
+ ec_launch_replace_heal (ec);
|
|
+ }
|
|
+
|
|
default_notify (ec->xl, event, NULL);
|
|
}
|
|
+
|
|
}
|
|
|
|
void
|
|
@@ -442,7 +449,7 @@ ec_pending_fops_completed(ec_t *ec)
|
|
}
|
|
}
|
|
|
|
-static void
|
|
+static gf_boolean_t
|
|
ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
|
|
{
|
|
uintptr_t current_state = 0;
|
|
@@ -455,23 +462,28 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
|
|
if (current_state != new_state) {
|
|
ec->xl_up ^= index_mask;
|
|
ec->xl_up_count += (current_state ? -1 : 1);
|
|
+
|
|
+ return _gf_true;
|
|
}
|
|
+
|
|
+ return _gf_false;
|
|
}
|
|
|
|
int32_t
|
|
ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
|
|
{
|
|
- ec_t *ec = this->private;
|
|
- int32_t idx = 0;
|
|
- int32_t error = 0;
|
|
- glusterfs_event_t old_event = GF_EVENT_MAXVAL;
|
|
- dict_t *input = NULL;
|
|
- dict_t *output = NULL;
|
|
- gf_boolean_t propagate = _gf_true;
|
|
- int32_t orig_event = event;
|
|
+ ec_t *ec = this->private;
|
|
+ int32_t idx = 0;
|
|
+ int32_t error = 0;
|
|
+ glusterfs_event_t old_event = GF_EVENT_MAXVAL;
|
|
+ dict_t *input = NULL;
|
|
+ dict_t *output = NULL;
|
|
+ gf_boolean_t propagate = _gf_true;
|
|
+ gf_boolean_t needs_shd_check = _gf_false;
|
|
+ int32_t orig_event = event;
|
|
struct gf_upcall *up_data = NULL;
|
|
struct gf_upcall_cache_invalidation *up_ci = NULL;
|
|
- uintptr_t mask = 0;
|
|
+ uintptr_t mask = 0;
|
|
|
|
gf_msg_trace (this->name, 0, "NOTIFY(%d): %p, %p",
|
|
event, data, data2);
|
|
@@ -498,8 +510,6 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
|
|
|
|
for (idx = 0; idx < ec->nodes; idx++) {
|
|
if (ec->xl_list[idx] == data) {
|
|
- if (event == GF_EVENT_CHILD_UP)
|
|
- ec_selfheal_childup (ec, idx);
|
|
break;
|
|
}
|
|
}
|
|
@@ -525,17 +535,27 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
|
|
|
|
mask = 1ULL << idx;
|
|
if (event == GF_EVENT_CHILD_UP) {
|
|
- ec_set_up_state(ec, mask, mask);
|
|
+ /* We need to trigger a selfheal if a brick changes
|
|
+ * to UP state. */
|
|
+ needs_shd_check = ec_set_up_state(ec, mask, mask);
|
|
} else if (event == GF_EVENT_CHILD_DOWN) {
|
|
- ec_set_up_state(ec, mask, 0);
|
|
+ ec_set_up_state(ec, mask, 0);
|
|
}
|
|
|
|
event = ec_get_event_from_state (ec);
|
|
|
|
- if (event == GF_EVENT_CHILD_UP && !ec->up) {
|
|
- ec_up (this, ec);
|
|
- } else if (event == GF_EVENT_CHILD_DOWN && ec->up) {
|
|
- ec_down (this, ec);
|
|
+ if (event == GF_EVENT_CHILD_UP) {
|
|
+ if (!ec->up) {
|
|
+ ec_up (this, ec);
|
|
+ }
|
|
+ } else {
|
|
+ /* If the volume is not UP, it's irrelevant if one
|
|
+ * brick has come up. We cannot heal anything. */
|
|
+ needs_shd_check = _gf_false;
|
|
+
|
|
+ if ((event == GF_EVENT_CHILD_DOWN) && ec->up) {
|
|
+ ec_down (this, ec);
|
|
+ }
|
|
}
|
|
|
|
if (event != GF_EVENT_MAXVAL) {
|
|
@@ -554,14 +574,13 @@ unlock:
|
|
|
|
done:
|
|
if (propagate) {
|
|
+ if (needs_shd_check && ec->shd.iamshd) {
|
|
+ ec_launch_replace_heal (ec);
|
|
+ }
|
|
+
|
|
error = default_notify (this, event, data);
|
|
}
|
|
|
|
- if (ec->shd.iamshd &&
|
|
- ec->xl_notify_count == ec->nodes &&
|
|
- event == GF_EVENT_CHILD_UP) {
|
|
- ec_launch_replace_heal (ec);
|
|
- }
|
|
out:
|
|
return error;
|
|
}
|
|
--
|
|
1.8.3.1
|
|
|